1//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// RTL NextGen for CUDA machine
10//
11//===----------------------------------------------------------------------===//
12
13#include <cassert>
14#include <cstddef>
15#include <cuda.h>
16#include <string>
17#include <unordered_map>
18
19#include "Shared/APITypes.h"
20#include "Shared/Debug.h"
21#include "Shared/Environment.h"
22
23#include "GlobalHandler.h"
24#include "OpenMP/OMPT/Callback.h"
25#include "PluginInterface.h"
26#include "Utils/ELF.h"
27
28#include "llvm/BinaryFormat/ELF.h"
29#include "llvm/Frontend/OpenMP/OMPConstants.h"
30#include "llvm/Frontend/OpenMP/OMPGridValues.h"
31#include "llvm/Support/Error.h"
32#include "llvm/Support/FileOutputBuffer.h"
33#include "llvm/Support/FileSystem.h"
34#include "llvm/Support/Program.h"
35
36using namespace error;
37
38namespace llvm {
39namespace omp {
40namespace target {
41namespace plugin {
42
43/// Forward declarations for all specialized data structures.
44struct CUDAKernelTy;
45struct CUDADeviceTy;
46struct CUDAPluginTy;
47
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11000))
/// Forward declarations for all Virtual Memory Management
/// related data structures and functions. This is necessary
/// for older cuda versions. These stubs must never be reached at runtime:
/// supportVAManagement() reports 'false' below CUDA 11, so callers are
/// expected to avoid this path. They now return CUDA_ERROR_NOT_SUPPORTED
/// instead of falling off the end of a non-void function, which was
/// undefined behavior.
typedef void *CUmemGenericAllocationHandle;
typedef void *CUmemAllocationProp;
typedef void *CUmemAccessDesc;
typedef void *CUmemAllocationGranularity_flags;
CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
                             CUdeviceptr addr, unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
                  CUmemGenericAllocationHandle handle,
                  unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
                     const CUmemAllocationProp *prop,
                     unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
                        const CUmemAccessDesc *desc, size_t count) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult
cuMemGetAllocationGranularity(size_t *granularity,
                              const CUmemAllocationProp *prop,
                              CUmemAllocationGranularity_flags option) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif
71
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
// Forward declarations of asynchronous memory management functions. This is
// necessary for older versions of CUDA. The stubs report
// CUDA_ERROR_NOT_SUPPORTED so that callers (e.g. the
// TARGET_ALLOC_DEVICE_NON_BLOCKING paths) fail cleanly; previously these
// non-void functions fell off the end, which is undefined behavior.
CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) {
  *ptr = 0;
  return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif
79
80/// Class implementing the CUDA device images properties.
81struct CUDADeviceImageTy : public DeviceImageTy {
82 /// Create the CUDA image with the id and the target image pointer.
83 CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
84 const __tgt_device_image *TgtImage)
85 : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {}
86
87 /// Load the image as a CUDA module.
88 Error loadModule() {
89 assert(!Module && "Module already loaded");
90
91 CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
92 if (auto Err = Plugin::check(Res, "error in cuModuleLoadDataEx: %s"))
93 return Err;
94
95 return Plugin::success();
96 }
97
98 /// Unload the CUDA module corresponding to the image.
99 Error unloadModule() {
100 assert(Module && "Module not loaded");
101
102 CUresult Res = cuModuleUnload(Module);
103 if (auto Err = Plugin::check(Res, "error in cuModuleUnload: %s"))
104 return Err;
105
106 Module = nullptr;
107
108 return Plugin::success();
109 }
110
111 /// Getter of the CUDA module.
112 CUmodule getModule() const { return Module; }
113
114private:
115 /// The CUDA module that loaded the image.
116 CUmodule Module;
117};
118
119/// Class implementing the CUDA kernel functionalities which derives from the
120/// generic kernel class.
121struct CUDAKernelTy : public GenericKernelTy {
122 /// Create a CUDA kernel with a name and an execution mode.
123 CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}
124
125 /// Initialize the CUDA kernel.
126 Error initImpl(GenericDeviceTy &GenericDevice,
127 DeviceImageTy &Image) override {
128 CUresult Res;
129 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
130
131 // Retrieve the function pointer of the kernel.
132 Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
133 if (auto Err = Plugin::check(Res, "error in cuModuleGetFunction('%s'): %s",
134 getName()))
135 return Err;
136
137 // Check that the function pointer is valid.
138 if (!Func)
139 return Plugin::error(ErrorCode::INVALID_BINARY,
140 "invalid function for kernel %s", getName());
141
142 int MaxThreads;
143 Res = cuFuncGetAttribute(&MaxThreads,
144 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
145 if (auto Err = Plugin::check(Res, "error in cuFuncGetAttribute: %s"))
146 return Err;
147
148 // The maximum number of threads cannot exceed the maximum of the kernel.
149 MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
150
151 return Plugin::success();
152 }
153
154 /// Launch the CUDA kernel function.
155 Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
156 uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
157 KernelLaunchParamsTy LaunchParams,
158 AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
159
160private:
161 /// The CUDA kernel function to execute.
162 CUfunction Func;
163 /// The maximum amount of dynamic shared memory per thread group. By default,
164 /// this is set to 48 KB.
165 mutable uint32_t MaxDynCGroupMemLimit = 49152;
166};
167
168/// Class wrapping a CUDA stream reference. These are the objects handled by the
169/// Stream Manager for the CUDA plugin.
170struct CUDAStreamRef final : public GenericDeviceResourceRef {
171 /// The underlying handle type for streams.
172 using HandleTy = CUstream;
173
174 /// Create an empty reference to an invalid stream.
175 CUDAStreamRef() : Stream(nullptr) {}
176
177 /// Create a reference to an existing stream.
178 CUDAStreamRef(HandleTy Stream) : Stream(Stream) {}
179
180 /// Create a new stream and save the reference. The reference must be empty
181 /// before calling to this function.
182 Error create(GenericDeviceTy &Device) override {
183 if (Stream)
184 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
185 "creating an existing stream");
186
187 CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
188 if (auto Err = Plugin::check(Res, "error in cuStreamCreate: %s"))
189 return Err;
190
191 return Plugin::success();
192 }
193
194 /// Destroy the referenced stream and invalidate the reference. The reference
195 /// must be to a valid stream before calling to this function.
196 Error destroy(GenericDeviceTy &Device) override {
197 if (!Stream)
198 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
199 "destroying an invalid stream");
200
201 CUresult Res = cuStreamDestroy(Stream);
202 if (auto Err = Plugin::check(Res, "error in cuStreamDestroy: %s"))
203 return Err;
204
205 Stream = nullptr;
206 return Plugin::success();
207 }
208
209 /// Get the underlying CUDA stream.
210 operator HandleTy() const { return Stream; }
211
212private:
213 /// The reference to the CUDA stream.
214 HandleTy Stream;
215};
216
217/// Class wrapping a CUDA event reference. These are the objects handled by the
218/// Event Manager for the CUDA plugin.
219struct CUDAEventRef final : public GenericDeviceResourceRef {
220 /// The underlying handle type for events.
221 using HandleTy = CUevent;
222
223 /// Create an empty reference to an invalid event.
224 CUDAEventRef() : Event(nullptr) {}
225
226 /// Create a reference to an existing event.
227 CUDAEventRef(HandleTy Event) : Event(Event) {}
228
229 /// Create a new event and save the reference. The reference must be empty
230 /// before calling to this function.
231 Error create(GenericDeviceTy &Device) override {
232 if (Event)
233 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
234 "creating an existing event");
235
236 CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT);
237 if (auto Err = Plugin::check(Res, "error in cuEventCreate: %s"))
238 return Err;
239
240 return Plugin::success();
241 }
242
243 /// Destroy the referenced event and invalidate the reference. The reference
244 /// must be to a valid event before calling to this function.
245 Error destroy(GenericDeviceTy &Device) override {
246 if (!Event)
247 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
248 "destroying an invalid event");
249
250 CUresult Res = cuEventDestroy(Event);
251 if (auto Err = Plugin::check(Res, "error in cuEventDestroy: %s"))
252 return Err;
253
254 Event = nullptr;
255 return Plugin::success();
256 }
257
258 /// Get the underlying CUevent.
259 operator HandleTy() const { return Event; }
260
261private:
262 /// The reference to the CUDA event.
263 HandleTy Event;
264};
265
/// Class implementing the CUDA device functionalities which derives from the
/// generic device class.
struct CUDADeviceTy : public GenericDeviceTy {
  // Create a CUDA device with a device id and the default CUDA grid values.
  // The stream and event managers are bound to this device here but are only
  // populated later in initImpl().
  CUDADeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices)
      : GenericDeviceTy(Plugin, DeviceId, NumDevices, NVPTXGridValues),
        CUDAStreamManager(*this), CUDAEventManager(*this) {}

  // Driver resources are released explicitly in deinitImpl(); nothing to do
  // in the destructor.
  ~CUDADeviceTy() {}
  /// Initialize the device, its resources and get its properties.
  ///
  /// Acquires the device handle, configures and retains the primary context,
  /// initializes the stream/event pools, and queries grid limits, compute
  /// capability and hardware parallelism.
  Error initImpl(GenericPluginTy &Plugin) override {
    // Resolve the CUDA driver handle for this device number.
    CUresult Res = cuDeviceGet(&Device, DeviceId);
    if (auto Err = Plugin::check(Res, "error in cuDeviceGet: %s"))
      return Err;

    // Query the current flags of the primary context and set its flags if
    // it is inactive.
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (auto Err =
            Plugin::check(Res, "error in cuDevicePrimaryCtxGetState: %s"))
      return Err;

    if (FormerPrimaryCtxIsActive) {
      // Another component already activated the primary context; do not alter
      // its scheduling flags, only warn when they differ from the preferred
      // blocking-sync mode.
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
             "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is inactive, set its flags to "
           "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (auto Err =
              Plugin::check(Res, "error in cuDevicePrimaryCtxSetFlags: %s"))
        return Err;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected. The matching release happens in deinitImpl().
    Res = cuDevicePrimaryCtxRetain(&Context, Device);
    if (auto Err = Plugin::check(Res, "error in cuDevicePrimaryCtxRetain: %s"))
      return Err;

    // Make the retained context current before creating streams/events in it.
    if (auto Err = setContext())
      return Err;

    // Initialize stream pool.
    if (auto Err = CUDAStreamManager.init(OMPX_InitialNumStreams))
      return Err;

    // Initialize event pool.
    if (auto Err = CUDAEventManager.init(OMPX_InitialNumEvents))
      return Err;

    // Query attributes to determine number of threads/block and blocks/grid.
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                 GridValues.GV_Max_Teams))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                 GridValues.GV_Max_WG_Size))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                 GridValues.GV_Warp_Size))
      return Err;

    // Cache the device compute capability (major/minor).
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                 ComputeCapability.Major))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                 ComputeCapability.Minor))
      return Err;

    // Hardware parallelism: SM count times the number of warps that fit when
    // the maximum number of threads is scheduled on each SM.
    uint32_t NumMuliprocessors = 0;
    uint32_t MaxThreadsPerSM = 0;
    uint32_t WarpSize = 0;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                 NumMuliprocessors))
      return Err;
    if (auto Err =
            getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                          MaxThreadsPerSM))
      return Err;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
      return Err;
    HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);

    return Plugin::success();
  }
363
364 Error unloadBinaryImpl(DeviceImageTy *Image) override {
365 assert(Context && "Invalid CUDA context");
366
367 // Each image has its own module.
368 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);
369
370 // Unload the module of the image.
371 if (auto Err = CUDAImage.unloadModule())
372 return Err;
373
374 return Plugin::success();
375 }
376
377 /// Deinitialize the device and release its resources.
378 Error deinitImpl() override {
379 if (Context) {
380 if (auto Err = setContext())
381 return Err;
382 }
383
384 // Deinitialize the stream manager.
385 if (auto Err = CUDAStreamManager.deinit())
386 return Err;
387
388 if (auto Err = CUDAEventManager.deinit())
389 return Err;
390
391 if (Context) {
392 CUresult Res = cuDevicePrimaryCtxRelease(Device);
393 if (auto Err =
394 Plugin::check(Res, "error in cuDevicePrimaryCtxRelease: %s"))
395 return Err;
396 }
397
398 // Invalidate context and device references.
399 Context = nullptr;
400 Device = CU_DEVICE_INVALID;
401
402 return Plugin::success();
403 }
404
405 virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
406 DeviceImageTy &Image) override {
407 // Check for the presence of global destructors at initialization time. This
408 // is required when the image may be deallocated before destructors are run.
409 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
410 if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
411 Image.setPendingGlobalDtors();
412
413 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
414 }
415
416 virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
417 DeviceImageTy &Image) override {
418 if (Image.hasPendingGlobalDtors())
419 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
420 return Plugin::success();
421 }
422
423 Expected<std::unique_ptr<MemoryBuffer>>
424 doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
425 // TODO: We should be able to use the 'nvidia-ptxjitcompiler' interface to
426 // avoid the call to 'ptxas'.
427 SmallString<128> PTXInputFilePath;
428 std::error_code EC = sys::fs::createTemporaryFile(Prefix: "nvptx-pre-link-jit", Suffix: "s",
429 ResultPath&: PTXInputFilePath);
430 if (EC)
431 return Plugin::error(ErrorCode::HOST_IO,
432 "failed to create temporary file for ptxas");
433
434 // Write the file's contents to the output file.
435 Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
436 FileOutputBuffer::create(FilePath: PTXInputFilePath, Size: MB->getBuffer().size());
437 if (!OutputOrErr)
438 return OutputOrErr.takeError();
439 std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
440 llvm::copy(MB->getBuffer(), Output->getBufferStart());
441 if (Error E = Output->commit())
442 return std::move(E);
443
444 SmallString<128> PTXOutputFilePath;
445 EC = sys::fs::createTemporaryFile(Prefix: "nvptx-post-link-jit", Suffix: "cubin",
446 ResultPath&: PTXOutputFilePath);
447 if (EC)
448 return Plugin::error(ErrorCode::HOST_IO,
449 "failed to create temporary file for ptxas");
450
451 // Try to find `ptxas` in the path to compile the PTX to a binary.
452 const auto ErrorOrPath = sys::findProgramByName(Name: "ptxas");
453 if (!ErrorOrPath)
454 return Plugin::error(ErrorCode::HOST_TOOL_NOT_FOUND,
455 "failed to find 'ptxas' on the PATH.");
456
457 std::string Arch = getComputeUnitKind();
458 StringRef Args[] = {*ErrorOrPath,
459 "-m64",
460 "-O2",
461 "--gpu-name",
462 Arch,
463 "--output-file",
464 PTXOutputFilePath,
465 PTXInputFilePath};
466
467 std::string ErrMsg;
468 if (sys::ExecuteAndWait(*ErrorOrPath, Args, std::nullopt, {}, 0, 0,
469 &ErrMsg))
470 return Plugin::error(ErrorCode::ASSEMBLE_FAILURE,
471 "running 'ptxas' failed: %s\n", ErrMsg.c_str());
472
473 auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(PTXOutputFilePath.data());
474 if (!BufferOrErr)
475 return Plugin::error(ErrorCode::HOST_IO,
476 "failed to open temporary file for ptxas");
477
478 // Clean up the temporary files afterwards.
479 if (sys::fs::remove(PTXOutputFilePath))
480 return Plugin::error(ErrorCode::HOST_IO,
481 "failed to remove temporary file for ptxas");
482 if (sys::fs::remove(PTXInputFilePath))
483 return Plugin::error(ErrorCode::HOST_IO,
484 "failed to remove temporary file for ptxas");
485
486 return std::move(*BufferOrErr);
487 }
488
489 /// Allocate and construct a CUDA kernel.
490 Expected<GenericKernelTy &> constructKernel(const char *Name) override {
491 // Allocate and construct the CUDA kernel.
492 CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
493 if (!CUDAKernel)
494 return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
495 "failed to allocate memory for CUDA kernel");
496
497 new (CUDAKernel) CUDAKernelTy(Name);
498
499 return *CUDAKernel;
500 }
501
  /// Set the current context to this device's context. The context was
  /// retained via cuDevicePrimaryCtxRetain() during initImpl().
  Error setContext() override {
    CUresult Res = cuCtxSetCurrent(Context);
    return Plugin::check(Res, "error in cuCtxSetCurrent: %s");
  }

  /// NVIDIA returns the product of the SM count and the number of warps that
  /// fit if the maximum number of threads were scheduled on each SM.
  /// Computed once in initImpl().
  uint64_t getHardwareParallelism() const override {
    return HardwareParallelism;
  }

  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
  bool shouldSetupRPCServer() const override { return true; }

  /// The RPC interface should have enough space for all available parallelism.
  uint64_t requestedRPCPortCount() const override {
    return getHardwareParallelism();
  }
522
523 /// Get the stream of the asynchronous info structure or get a new one.
524 Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
525 // Get the stream (if any) from the async info.
526 Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
527 if (!Stream) {
528 // There was no stream; get an idle one.
529 if (auto Err = CUDAStreamManager.getResource(Stream))
530 return Err;
531
532 // Modify the async info's stream.
533 AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
534 }
535 return Plugin::success();
536 }
537
  /// Getters of CUDA references.
  /// The primary context retained in initImpl().
  CUcontext getCUDAContext() const { return Context; }
  /// The driver device handle obtained via cuDeviceGet().
  CUdevice getCUDADevice() const { return Device; }
541
542 /// Load the binary image into the device and allocate an image object.
543 Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
544 int32_t ImageId) override {
545 if (auto Err = setContext())
546 return std::move(Err);
547
548 // Allocate and initialize the image object.
549 CUDADeviceImageTy *CUDAImage = Plugin.allocate<CUDADeviceImageTy>();
550 new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage);
551
552 // Load the CUDA module.
553 if (auto Err = CUDAImage->loadModule())
554 return std::move(Err);
555
556 return CUDAImage;
557 }
558
559 /// Allocate memory on the device or related to the device.
560 void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
561 if (Size == 0)
562 return nullptr;
563
564 if (auto Err = setContext()) {
565 REPORT("Failure to alloc memory: %s\n", toString(E: std::move(Err)).data());
566 return nullptr;
567 }
568
569 void *MemAlloc = nullptr;
570 CUdeviceptr DevicePtr;
571 CUresult Res;
572
573 switch (Kind) {
574 case TARGET_ALLOC_DEFAULT:
575 case TARGET_ALLOC_DEVICE:
576 Res = cuMemAlloc(&DevicePtr, Size);
577 MemAlloc = (void *)DevicePtr;
578 break;
579 case TARGET_ALLOC_HOST:
580 Res = cuMemAllocHost(&MemAlloc, Size);
581 break;
582 case TARGET_ALLOC_SHARED:
583 Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
584 MemAlloc = (void *)DevicePtr;
585 break;
586 case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
587 CUstream Stream;
588 if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
589 break;
590 if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
591 break;
592 cuStreamSynchronize(Stream);
593 Res = cuStreamDestroy(Stream);
594 MemAlloc = (void *)DevicePtr;
595 }
596 }
597
598 if (auto Err =
599 Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) {
600 REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
601 return nullptr;
602 }
603 return MemAlloc;
604 }
605
606 /// Deallocate memory on the device or related to the device.
607 int free(void *TgtPtr, TargetAllocTy Kind) override {
608 if (TgtPtr == nullptr)
609 return OFFLOAD_SUCCESS;
610
611 if (auto Err = setContext()) {
612 REPORT("Failure to free memory: %s\n", toString(E: std::move(Err)).data());
613 return OFFLOAD_FAIL;
614 }
615
616 CUresult Res;
617 switch (Kind) {
618 case TARGET_ALLOC_DEFAULT:
619 case TARGET_ALLOC_DEVICE:
620 case TARGET_ALLOC_SHARED:
621 Res = cuMemFree((CUdeviceptr)TgtPtr);
622 break;
623 case TARGET_ALLOC_HOST:
624 Res = cuMemFreeHost(TgtPtr);
625 break;
626 case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
627 CUstream Stream;
628 if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
629 break;
630 cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
631 cuStreamSynchronize(Stream);
632 if ((Res = cuStreamDestroy(Stream)))
633 break;
634 }
635 }
636
637 if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) {
638 REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
639 return OFFLOAD_FAIL;
640 }
641 return OFFLOAD_SUCCESS;
642 }
643
644 /// Synchronize current thread with the pending operations on the async info.
645 Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
646 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
647 CUresult Res;
648 Res = cuStreamSynchronize(Stream);
649
650 // Once the stream is synchronized, return it to stream pool and reset
651 // AsyncInfo. This is to make sure the synchronization only works for its
652 // own tasks.
653 AsyncInfo.Queue = nullptr;
654 if (auto Err = CUDAStreamManager.returnResource(Stream))
655 return Err;
656
657 return Plugin::check(Res, "error in cuStreamSynchronize: %s");
658 }
659
  /// CUDA support VA management. Only available from CUDA 11 on, where the
  /// cuMemCreate/cuMemMap family of driver calls exists; otherwise the stubs
  /// at the top of this file would be used.
  bool supportVAManagement() const override {
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11000))
    return true;
#else
    return false;
#endif
  }
668
669 /// Allocates \p RSize bytes (rounded up to page size) and hints the cuda
670 /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
671 /// At return \p RSize contains the actual size
672 Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
673 CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
674 auto IHandle = DeviceMMaps.find(DVAddr);
675 size_t Size = *RSize;
676
677 if (Size == 0)
678 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
679 "memory Map Size must be larger than 0");
680
681 // Check if we have already mapped this address
682 if (IHandle != DeviceMMaps.end())
683 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
684 "address already memory mapped");
685
686 CUmemAllocationProp Prop = {};
687 size_t Granularity = 0;
688
689 size_t Free, Total;
690 CUresult Res = cuMemGetInfo(&Free, &Total);
691 if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
692 return Err;
693
694 if (Size >= Free) {
695 *Addr = nullptr;
696 return Plugin::error(
697 ErrorCode::OUT_OF_RESOURCES,
698 "cannot map memory size larger than the available device memory");
699 }
700
701 // currently NVidia only supports pinned device types
702 Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
703 Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
704
705 Prop.location.id = DeviceId;
706 cuMemGetAllocationGranularity(&Granularity, &Prop,
707 CU_MEM_ALLOC_GRANULARITY_MINIMUM);
708 if (auto Err =
709 Plugin::check(Res, "error in cuMemGetAllocationGranularity: %s"))
710 return Err;
711
712 if (Granularity == 0)
713 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
714 "wrong device Page size");
715
716 // Ceil to page size.
717 Size = utils::roundUp(Size, Granularity);
718
719 // Create a handler of our allocation
720 CUmemGenericAllocationHandle AHandle;
721 Res = cuMemCreate(&AHandle, Size, &Prop, 0);
722 if (auto Err = Plugin::check(Res, "error in cuMemCreate: %s"))
723 return Err;
724
725 CUdeviceptr DevPtr = 0;
726 Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
727 if (auto Err = Plugin::check(Res, "error in cuMemAddressReserve: %s"))
728 return Err;
729
730 Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
731 if (auto Err = Plugin::check(Res, "error in cuMemMap: %s"))
732 return Err;
733
734 CUmemAccessDesc ADesc = {};
735 ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
736 ADesc.location.id = DeviceId;
737 ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
738
739 // Sets address
740 Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
741 if (auto Err = Plugin::check(Res, "error in cuMemSetAccess: %s"))
742 return Err;
743
744 *Addr = reinterpret_cast<void *>(DevPtr);
745 *RSize = Size;
746 DeviceMMaps.insert({DevPtr, AHandle});
747 return Plugin::success();
748 }
749
750 /// De-allocates device memory and Unmaps the Virtual Addr
751 Error memoryVAUnMap(void *VAddr, size_t Size) override {
752 CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
753 auto IHandle = DeviceMMaps.find(DVAddr);
754 // Mapping does not exist
755 if (IHandle == DeviceMMaps.end()) {
756 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
757 "addr is not MemoryMapped");
758 }
759
760 if (IHandle == DeviceMMaps.end())
761 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
762 "addr is not MemoryMapped");
763
764 CUmemGenericAllocationHandle &AllocHandle = IHandle->second;
765
766 CUresult Res = cuMemUnmap(DVAddr, Size);
767 if (auto Err = Plugin::check(Res, "error in cuMemUnmap: %s"))
768 return Err;
769
770 Res = cuMemRelease(AllocHandle);
771 if (auto Err = Plugin::check(Res, "error in cuMemRelease: %s"))
772 return Err;
773
774 Res = cuMemAddressFree(DVAddr, Size);
775 if (auto Err = Plugin::check(Res, "error in cuMemAddressFree: %s"))
776 return Err;
777
778 DeviceMMaps.erase(IHandle);
779 return Plugin::success();
780 }
781
782 /// Query for the completion of the pending operations on the async info.
783 Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
784 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
785 CUresult Res = cuStreamQuery(Stream);
786
787 // Not ready streams must be considered as successful operations.
788 if (Res == CUDA_ERROR_NOT_READY)
789 return Plugin::success();
790
791 // Once the stream is synchronized and the operations completed (or an error
792 // occurs), return it to stream pool and reset AsyncInfo. This is to make
793 // sure the synchronization only works for its own tasks.
794 AsyncInfo.Queue = nullptr;
795 if (auto Err = CUDAStreamManager.returnResource(Stream))
796 return Err;
797
798 return Plugin::check(Res, "error in cuStreamQuery: %s");
799 }
800
  /// "Lock" (pin) a host buffer. Currently a pass-through: the pointer is
  /// returned unchanged and no registration is performed.
  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
    // TODO: Register the buffer as CUDA host memory.
    return HstPtr;
  }

  /// Counterpart of dataLockImpl; nothing to undo since no memory is pinned.
  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }

  /// Report whether the host pointer is pinned. Always false until pinning is
  /// implemented; the out-parameters are left untouched.
  Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
                                 void *&BaseDevAccessiblePtr,
                                 size_t &BaseSize) const override {
    // TODO: Implement pinning feature for CUDA.
    return false;
  }
814
815 /// Submit data to the device (host to device transfer).
816 Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
817 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
818 if (auto Err = setContext())
819 return Err;
820
821 CUstream Stream;
822 if (auto Err = getStream(AsyncInfoWrapper, Stream))
823 return Err;
824
825 CUresult Res = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
826 return Plugin::check(Res, "error in cuMemcpyHtoDAsync: %s");
827 }
828
829 /// Retrieve data from the device (device to host transfer).
830 Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
831 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
832 if (auto Err = setContext())
833 return Err;
834
835 CUstream Stream;
836 if (auto Err = getStream(AsyncInfoWrapper, Stream))
837 return Err;
838
839 CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
840 return Plugin::check(Res, "error in cuMemcpyDtoHAsync: %s");
841 }
842
  /// Exchange data between two devices directly. We may use peer access if
  /// the CUDA devices and driver allow them. (Defined out of line; only the
  /// declaration is visible here.)
  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                         void *DstPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;
848
849 /// Initialize the async info for interoperability purposes.
850 Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
851 if (auto Err = setContext())
852 return Err;
853
854 CUstream Stream;
855 if (auto Err = getStream(AsyncInfoWrapper, Stream))
856 return Err;
857
858 return Plugin::success();
859 }
860
861 /// Initialize the device info for interoperability purposes.
862 Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
863 assert(Context && "Context is null");
864 assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");
865
866 if (auto Err = setContext())
867 return Err;
868
869 if (!DeviceInfo->Context)
870 DeviceInfo->Context = Context;
871
872 if (!DeviceInfo->Device)
873 DeviceInfo->Device = reinterpret_cast<void *>(Device);
874
875 return Plugin::success();
876 }
877
  /// Create an event. The pointed-to storage receives a CUevent handle taken
  /// from the event pool.
  Error createEventImpl(void **EventPtrStorage) override {
    CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
    return CUDAEventManager.getResource(*Event);
  }

  /// Destroy a previously created event by returning it to the event pool.
  Error destroyEventImpl(void *EventPtr) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
    return CUDAEventManager.returnResource(Event);
  }
889
890 /// Record the event.
891 Error recordEventImpl(void *EventPtr,
892 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
893 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
894
895 CUstream Stream;
896 if (auto Err = getStream(AsyncInfoWrapper, Stream))
897 return Err;
898
899 CUresult Res = cuEventRecord(Event, Stream);
900 return Plugin::check(Res, "error in cuEventRecord: %s");
901 }
902
903 /// Make the stream wait on the event.
904 Error waitEventImpl(void *EventPtr,
905 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
906 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
907
908 CUstream Stream;
909 if (auto Err = getStream(AsyncInfoWrapper, Stream))
910 return Err;
911
912 // Do not use CU_EVENT_WAIT_DEFAULT here as it is only available from
913 // specific CUDA version, and defined as 0x0. In previous version, per CUDA
914 // API document, that argument has to be 0x0.
915 CUresult Res = cuStreamWaitEvent(Stream, Event, 0);
916 return Plugin::check(Res, "error in cuStreamWaitEvent: %s");
917 }
918
919 /// Synchronize the current thread with the event.
920 Error syncEventImpl(void *EventPtr) override {
921 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
922 CUresult Res = cuEventSynchronize(Event);
923 return Plugin::check(Res, "error in cuEventSynchronize: %s");
924 }
925
926 /// Print information about the device.
927 Expected<InfoTreeNode> obtainInfoImpl() override {
928 char TmpChar[1000];
929 const char *TmpCharPtr;
930 size_t TmpSt;
931 int TmpInt;
932 InfoTreeNode Info;
933
934 CUresult Res = cuDriverGetVersion(&TmpInt);
935 if (Res == CUDA_SUCCESS)
936 // For consistency with other drivers, store the version as a string
937 // rather than an integer
938 Info.add("CUDA Driver Version", std::to_string(val: TmpInt));
939
940 Info.add("CUDA OpenMP Device Number", DeviceId);
941
942 Res = cuDeviceGetName(TmpChar, 1000, Device);
943 if (Res == CUDA_SUCCESS)
944 Info.add("Device Name", TmpChar);
945
946 Info.add("Vendor Name", "NVIDIA");
947
948 Res = cuDeviceTotalMem(&TmpSt, Device);
949 if (Res == CUDA_SUCCESS)
950 Info.add("Global Memory Size", TmpSt, "bytes");
951
952 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
953 if (Res == CUDA_SUCCESS)
954 Info.add("Number of Multiprocessors", TmpInt);
955
956 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
957 if (Res == CUDA_SUCCESS)
958 Info.add("Concurrent Copy and Execution", (bool)TmpInt);
959
960 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
961 if (Res == CUDA_SUCCESS)
962 Info.add("Total Constant Memory", TmpInt, "bytes");
963
964 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
965 TmpInt);
966 if (Res == CUDA_SUCCESS)
967 Info.add("Max Shared Memory per Block", TmpInt, "bytes");
968
969 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
970 if (Res == CUDA_SUCCESS)
971 Info.add("Registers per Block", TmpInt);
972
973 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
974 if (Res == CUDA_SUCCESS)
975 Info.add("Warp Size", TmpInt);
976
977 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
978 if (Res == CUDA_SUCCESS)
979 Info.add("Maximum Threads per Block", TmpInt);
980
981 auto &MaxBlock = *Info.add("Maximum Block Dimensions", "");
982 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
983 if (Res == CUDA_SUCCESS)
984 MaxBlock.add("x", TmpInt);
985 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
986 if (Res == CUDA_SUCCESS)
987 MaxBlock.add("y", TmpInt);
988 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
989 if (Res == CUDA_SUCCESS)
990 MaxBlock.add("z", TmpInt);
991
992 auto &MaxGrid = *Info.add("Maximum Grid Dimensions", "");
993 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
994 if (Res == CUDA_SUCCESS)
995 MaxGrid.add("x", TmpInt);
996 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
997 if (Res == CUDA_SUCCESS)
998 MaxGrid.add("y", TmpInt);
999 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
1000 if (Res == CUDA_SUCCESS)
1001 MaxGrid.add("z", TmpInt);
1002
1003 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
1004 if (Res == CUDA_SUCCESS)
1005 Info.add("Maximum Memory Pitch", TmpInt, "bytes");
1006
1007 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
1008 if (Res == CUDA_SUCCESS)
1009 Info.add("Texture Alignment", TmpInt, "bytes");
1010
1011 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
1012 if (Res == CUDA_SUCCESS)
1013 Info.add("Clock Rate", TmpInt, "kHz");
1014
1015 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
1016 if (Res == CUDA_SUCCESS)
1017 Info.add("Execution Timeout", (bool)TmpInt);
1018
1019 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
1020 if (Res == CUDA_SUCCESS)
1021 Info.add("Integrated Device", (bool)TmpInt);
1022
1023 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
1024 if (Res == CUDA_SUCCESS)
1025 Info.add("Can Map Host Memory", (bool)TmpInt);
1026
1027 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
1028 if (Res == CUDA_SUCCESS) {
1029 if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1030 TmpCharPtr = "Default";
1031 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1032 TmpCharPtr = "Prohibited";
1033 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1034 TmpCharPtr = "Exclusive process";
1035 else
1036 TmpCharPtr = "Unknown";
1037 Info.add("Compute Mode", TmpCharPtr);
1038 }
1039
1040 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
1041 if (Res == CUDA_SUCCESS)
1042 Info.add("Concurrent Kernels", (bool)TmpInt);
1043
1044 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
1045 if (Res == CUDA_SUCCESS)
1046 Info.add("ECC Enabled", (bool)TmpInt);
1047
1048 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
1049 if (Res == CUDA_SUCCESS)
1050 Info.add("Memory Clock Rate", TmpInt, "kHz");
1051
1052 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
1053 if (Res == CUDA_SUCCESS)
1054 Info.add("Memory Bus Width", TmpInt, "bits");
1055
1056 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
1057 if (Res == CUDA_SUCCESS)
1058 Info.add("L2 Cache Size", TmpInt, "bytes");
1059
1060 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1061 TmpInt);
1062 if (Res == CUDA_SUCCESS)
1063 Info.add("Max Threads Per SMP", TmpInt);
1064
1065 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
1066 if (Res == CUDA_SUCCESS)
1067 Info.add("Async Engines", TmpInt);
1068
1069 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
1070 if (Res == CUDA_SUCCESS)
1071 Info.add("Unified Addressing", (bool)TmpInt);
1072
1073 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
1074 if (Res == CUDA_SUCCESS)
1075 Info.add("Managed Memory", (bool)TmpInt);
1076
1077 Res =
1078 getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
1079 if (Res == CUDA_SUCCESS)
1080 Info.add("Concurrent Managed Memory", (bool)TmpInt);
1081
1082 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
1083 TmpInt);
1084 if (Res == CUDA_SUCCESS)
1085 Info.add("Preemption Supported", (bool)TmpInt);
1086
1087 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
1088 if (Res == CUDA_SUCCESS)
1089 Info.add("Cooperative Launch", (bool)TmpInt);
1090
1091 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
1092 if (Res == CUDA_SUCCESS)
1093 Info.add("Multi-Device Boars", (bool)TmpInt);
1094
1095 Info.add("Compute Capabilities", ComputeCapability.str());
1096
1097 return Info;
1098 }
1099
  /// Whether the generic plugin should set up its own device memory pool.
  /// Opted out: device-side allocations go through CUDA's native malloc.
  virtual bool shouldSetupDeviceMemoryPool() const override {
    /// We use the CUDA malloc for now.
    return false;
  }
1104
  /// Getters and setters for stack and heap sizes.
  /// Read the per-thread stack size limit (bytes) from the CUDA context.
  Error getDeviceStackSize(uint64_t &Value) override {
    return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
  }
  /// Set the per-thread stack size limit (bytes) on the CUDA context.
  Error setDeviceStackSize(uint64_t Value) override {
    return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
  }
  /// Read the device malloc heap size limit (bytes) from the CUDA context.
  Error getDeviceHeapSize(uint64_t &Value) override {
    return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
  }
  /// Set the device malloc heap size limit (bytes) on the CUDA context.
  Error setDeviceHeapSize(uint64_t Value) override {
    return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
  }
  /// Total device memory in bytes, straight from cuDeviceTotalMem.
  Error getDeviceMemorySize(uint64_t &Value) override {
    CUresult Res = cuDeviceTotalMem(&Value, Device);
    return Plugin::check(Res, "error in getDeviceMemorySize %s");
  }
1122
  /// CUDA-specific functions for getting and setting context limits.
  /// Set a CUcontext resource limit (e.g. stack or malloc-heap size).
  Error setCtxLimit(CUlimit Kind, uint64_t Value) {
    CUresult Res = cuCtxSetLimit(Kind, Value);
    return Plugin::check(Res, "error in cuCtxSetLimit: %s");
  }
  /// Read a CUcontext resource limit into \p Value.
  Error getCtxLimit(CUlimit Kind, uint64_t &Value) {
    CUresult Res = cuCtxGetLimit(&Value, Kind);
    return Plugin::check(Res, "error in cuCtxGetLimit: %s");
  }
1132
  /// CUDA-specific function to get device attributes.
  /// Query a device attribute into \p Value; \p Kind must be a valid
  /// CUdevice_attribute enumerator.
  Error getDeviceAttr(uint32_t Kind, uint32_t &Value) {
    // TODO: Warn if the new value is larger than the old.
    // NOTE(review): the TODO above looks copy-pasted from a setter; this is a
    // pure query with no "old" value to compare -- confirm and drop.
    CUresult Res =
        cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
    return Plugin::check(Res, "error in cuDeviceGetAttribute: %s");
  }
1140
  /// Raw variant of getDeviceAttr: returns the CUresult unwrapped so callers
  /// can silently skip attributes the driver does not support.
  CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
    return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
  }
1144
  /// See GenericDeviceTy::getComputeUnitKind().
  /// Returns the compute-capability string, e.g. "sm_70".
  std::string getComputeUnitKind() const override {
    return ComputeCapability.str();
  }
1149
  /// Returns the clock frequency for the given NVPTX device.
  /// Hard-coded to 1 GHz -- presumably the tick rate of the device's
  /// nanosecond timer rather than the SM clock; TODO confirm against callers.
  uint64_t getClockFrequency() const override { return 1000000000; }
1152
1153private:
1154 using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
1155 using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
1156
1157 Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
1158 bool IsCtor) {
1159 const char *KernelName = IsCtor ? "nvptx$device$init" : "nvptx$device$fini";
1160 // Perform a quick check for the named kernel in the image. The kernel
1161 // should be created by the 'nvptx-lower-ctor-dtor' pass.
1162 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
1163 if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
1164 return Plugin::success();
1165
1166 // The Nvidia backend cannot handle creating the ctor / dtor array
1167 // automatically so we must create it ourselves. The backend will emit
1168 // several globals that contain function pointers we can call. These are
1169 // prefixed with a known name due to Nvidia's lack of section support.
1170 auto ELFObjOrErr = Handler.getELFObjectFile(Image);
1171 if (!ELFObjOrErr)
1172 return ELFObjOrErr.takeError();
1173
1174 // Search for all symbols that contain a constructor or destructor.
1175 SmallVector<std::pair<StringRef, uint16_t>> Funcs;
1176 for (ELFSymbolRef Sym : (*ELFObjOrErr)->symbols()) {
1177 auto NameOrErr = Sym.getName();
1178 if (!NameOrErr)
1179 return NameOrErr.takeError();
1180
1181 if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
1182 : "__fini_array_object_"))
1183 continue;
1184
1185 uint16_t Priority;
1186 if (NameOrErr->rsplit('_').second.getAsInteger(10, Priority))
1187 return Plugin::error(ErrorCode::INVALID_BINARY,
1188 "invalid priority for constructor or destructor");
1189
1190 Funcs.emplace_back(*NameOrErr, Priority);
1191 }
1192
1193 // Sort the created array to be in priority order.
1194 llvm::sort(C&: Funcs, Comp: [=](auto X, auto Y) { return X.second < Y.second; });
1195
1196 // Allocate a buffer to store all of the known constructor / destructor
1197 // functions in so we can iterate them on the device.
1198 void *Buffer =
1199 allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE);
1200 if (!Buffer)
1201 return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
1202 "failed to allocate memory for global buffer");
1203
1204 auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
1205 auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();
1206
1207 SmallVector<void *> FunctionPtrs(Funcs.size());
1208 std::size_t Idx = 0;
1209 for (auto [Name, Priority] : Funcs) {
1210 GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]);
1211 if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
1212 return Err;
1213 }
1214
1215 // Copy the local buffer to the device.
1216 if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(),
1217 FunctionPtrs.size() * sizeof(void *), nullptr))
1218 return Err;
1219
1220 // Copy the created buffer to the appropriate symbols so the kernel can
1221 // iterate through them.
1222 GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
1223 sizeof(void *), &GlobalPtrStart);
1224 if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
1225 return Err;
1226
1227 GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
1228 sizeof(void *), &GlobalPtrStop);
1229 if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
1230 return Err;
1231
1232 CUDAKernelTy CUDAKernel(KernelName);
1233
1234 if (auto Err = CUDAKernel.init(*this, Image))
1235 return Err;
1236
1237 AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
1238
1239 KernelArgsTy KernelArgs = {};
1240 uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
1241 if (auto Err = CUDAKernel.launchImpl(
1242 *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
1243 KernelLaunchParamsTy{}, AsyncInfoWrapper))
1244 return Err;
1245
1246 Error Err = Plugin::success();
1247 AsyncInfoWrapper.finalize(Err);
1248
1249 if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
1250 return Plugin::error(ErrorCode::UNKNOWN,
1251 "failed to free memory for global buffer");
1252
1253 return Err;
1254 }
1255
  /// Stream manager providing pooled CUDA streams for async operations.
  CUDAStreamManagerTy CUDAStreamManager;

  /// Event manager providing pooled CUDA events.
  CUDAEventManagerTy CUDAEventManager;

  /// The device's context. This context should be set before performing
  /// operations on the device.
  CUcontext Context = nullptr;

  /// The CUDA device handle.
  CUdevice Device = CU_DEVICE_INVALID;

  /// The memory-mapped addresses and their allocation handles.
  std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;
1271
1272 /// The compute capability of the corresponding CUDA device.
1273 struct ComputeCapabilityTy {
1274 uint32_t Major;
1275 uint32_t Minor;
1276 std::string str() const {
1277 return "sm_" + std::to_string(val: Major * 10 + Minor);
1278 }
1279 } ComputeCapability;
1280
1281 /// The maximum number of warps that can be resident on all the SMs
1282 /// simultaneously.
1283 uint32_t HardwareParallelism = 0;
1284};
1285
1286Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
1287 uint32_t NumThreads[3], uint32_t NumBlocks[3],
1288 KernelArgsTy &KernelArgs,
1289 KernelLaunchParamsTy LaunchParams,
1290 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
1291 CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
1292
1293 CUstream Stream;
1294 if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
1295 return Err;
1296
1297 uint32_t MaxDynCGroupMem =
1298 std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
1299
1300 void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
1301 CU_LAUNCH_PARAM_BUFFER_SIZE,
1302 reinterpret_cast<void *>(&LaunchParams.Size),
1303 CU_LAUNCH_PARAM_END};
1304
1305 // If we are running an RPC server we want to wake up the server thread
1306 // whenever there is a kernel running and let it sleep otherwise.
1307 if (GenericDevice.getRPCServer())
1308 GenericDevice.Plugin.getRPCServer().Thread->notify();
1309
1310 // In case we require more memory than the current limit.
1311 if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) {
1312 CUresult AttrResult = cuFuncSetAttribute(
1313 Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
1314 return Plugin::check(
1315 AttrResult,
1316 "Error in cuLaunchKernel while setting the memory limits: %s");
1317 MaxDynCGroupMemLimit = MaxDynCGroupMem;
1318 }
1319
1320 CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
1321 NumThreads[0], NumThreads[1], NumThreads[2],
1322 MaxDynCGroupMem, Stream, nullptr, Config);
1323
1324 // Register a callback to indicate when the kernel is complete.
1325 if (GenericDevice.getRPCServer())
1326 cuLaunchHostFunc(
1327 Stream,
1328 [](void *Data) {
1329 GenericPluginTy &Plugin = *reinterpret_cast<GenericPluginTy *>(Data);
1330 Plugin.getRPCServer().Thread->finish();
1331 },
1332 &GenericDevice.Plugin);
1333
1334 return Plugin::check(Res, "error in cuLaunchKernel for '%s': %s", getName());
1335}
1336
1337/// Class implementing the CUDA-specific functionalities of the global handler.
1338class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
1339public:
1340 /// Get the metadata of a global from the device. The name and size of the
1341 /// global is read from DeviceGlobal and the address of the global is written
1342 /// to DeviceGlobal.
1343 Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
1344 DeviceImageTy &Image,
1345 GlobalTy &DeviceGlobal) override {
1346 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
1347
1348 const char *GlobalName = DeviceGlobal.getName().data();
1349
1350 size_t CUSize;
1351 CUdeviceptr CUPtr;
1352 CUresult Res =
1353 cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
1354 if (auto Err = Plugin::check(Res, "error in cuModuleGetGlobal for '%s': %s",
1355 GlobalName))
1356 return Err;
1357
1358 if (DeviceGlobal.getSize() && CUSize != DeviceGlobal.getSize())
1359 return Plugin::error(
1360 ErrorCode::INVALID_BINARY,
1361 "failed to load global '%s' due to size mismatch (%zu != %zu)",
1362 GlobalName, CUSize, (size_t)DeviceGlobal.getSize());
1363
1364 DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
1365 DeviceGlobal.setSize(CUSize);
1366
1367 return Plugin::success();
1368 }
1369};
1370
/// Class implementing the CUDA-specific functionalities of the plugin.
struct CUDAPluginTy final : public GenericPluginTy {
  /// Create a CUDA plugin.
  CUDAPluginTy() : GenericPluginTy(getTripleArch()) {}

  /// This class should not be copied.
  CUDAPluginTy(const CUDAPluginTy &) = delete;
  CUDAPluginTy(CUDAPluginTy &&) = delete;

  /// Initialize the plugin and return the number of devices.
  ///
  /// Reports zero devices (rather than a hard error) when the CUDA library
  /// or devices are simply absent, so the runtime can fall back to the host.
  Expected<int32_t> initImpl() override {
    CUresult Res = cuInit(0);
    if (Res == CUDA_ERROR_INVALID_HANDLE) {
      // Cannot call cuGetErrorString if dlsym failed.
      DP("Failed to load CUDA shared library\n");
      return 0;
    }

    if (Res == CUDA_ERROR_NO_DEVICE) {
      // Do not initialize if there are no devices.
      DP("There are no devices supporting CUDA.\n");
      return 0;
    }

    // Any other failure of cuInit is a genuine error.
    if (auto Err = Plugin::check(Res, "error in cuInit: %s"))
      return std::move(Err);

    // Get the number of devices.
    int NumDevices;
    Res = cuDeviceGetCount(&NumDevices);
    if (auto Err = Plugin::check(Res, "error in cuDeviceGetCount: %s"))
      return std::move(Err);

    // Do not initialize if there are no devices.
    if (NumDevices == 0)
      DP("There are no devices supporting CUDA.\n");

    return NumDevices;
  }

  /// Deinitialize the plugin.
  Error deinitImpl() override { return Plugin::success(); }

  /// Creates a CUDA device to use for offloading.
  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
                                int32_t NumDevices) override {
    return new CUDADeviceTy(Plugin, DeviceId, NumDevices);
  }

  /// Creates a CUDA global handler.
  GenericGlobalHandlerTy *createGlobalHandler() override {
    return new CUDAGlobalHandlerTy();
  }

  /// Get the ELF code for recognizing the compatible image binary.
  uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }

  /// Target triple architecture handled by this plugin.
  Triple::ArchType getTripleArch() const override {
    // TODO: I think we can drop the support for 32-bit NVPTX devices.
    return Triple::nvptx64;
  }

  /// Plugin name as configured at build time.
  const char *getName() const override { return GETNAME(TARGET_NAME); }

  /// Check whether the image is compatible with a CUDA device.
  ///
  /// Decodes the image's sm_<major><minor> value from the ELF header flags
  /// and compares it against the device's compute capability.
  Expected<bool> isELFCompatible(uint32_t DeviceId,
                                 StringRef Image) const override {
    auto ElfOrErr =
        ELF64LEObjectFile::create(MemoryBufferRef(Image, /*Identifier=*/""),
                                  /*InitContent=*/false);
    if (!ElfOrErr)
      return ElfOrErr.takeError();

    // Get the numeric value for the image's `sm_` value.
    // The SM encoding in e_flags differs between CUDA ELF ABI versions; the
    // ABI version byte in e_ident selects the correct decoding.
    const auto Header = ElfOrErr->getELFFile().getHeader();
    unsigned SM =
        Header.e_ident[ELF::EI_ABIVERSION] == ELF::ELFABIVERSION_CUDA_V1
            ? Header.e_flags & ELF::EF_CUDA_SM
            : (Header.e_flags & ELF::EF_CUDA_SM_MASK) >> ELF::EF_CUDA_SM_OFFSET;

    CUdevice Device;
    CUresult Res = cuDeviceGet(&Device, DeviceId);
    if (auto Err = Plugin::check(Res, "error in cuDeviceGet: %s"))
      return std::move(Err);

    int32_t Major, Minor;
    Res = cuDeviceGetAttribute(
        &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
    if (auto Err = Plugin::check(Res, "error in cuDeviceGetAttribute: %s"))
      return std::move(Err);

    Res = cuDeviceGetAttribute(
        &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
    if (auto Err = Plugin::check(Res, "error in cuDeviceGetAttribute: %s"))
      return std::move(Err);

    int32_t ImageMajor = SM / 10;
    int32_t ImageMinor = SM % 10;

    // A cubin generated for a certain compute capability is supported to
    // run on any GPU with the same major revision and same or higher minor
    // revision.
    return Major == ImageMajor && Minor >= ImageMinor;
  }
};
1476
/// Copy \p Size bytes from \p SrcPtr on this device to \p DstPtr on
/// \p DstGenericDevice. Tries CUDA peer-to-peer copies first (establishing
/// the peer link lazily, cached in PeerAccesses) and falls back to an
/// ordinary device-to-device async memcpy.
Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
                                     GenericDeviceTy &DstGenericDevice,
                                     void *DstPtr, int64_t Size,
                                     AsyncInfoWrapperTy &AsyncInfoWrapper) {
  if (auto Err = setContext())
    return Err;

  CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);

  CUresult Res;
  int32_t DstDeviceId = DstDevice.DeviceId;
  CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
  CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;

  int CanAccessPeer = 0;
  if (DeviceId != DstDeviceId) {
    // Make sure the lock is released before performing the copies.
    // (The lock_guard's scope ends with this if-block, so only the
    // PeerAccesses state machine below runs under the mutex.)
    std::lock_guard<std::mutex> Lock(PeerAccessesLock);

    switch (PeerAccesses[DstDeviceId]) {
    case PeerAccessState::AVAILABLE:
      // Peer link already established earlier.
      CanAccessPeer = 1;
      break;
    case PeerAccessState::UNAVAILABLE:
      // A previous attempt determined P2P is not possible.
      CanAccessPeer = 0;
      break;
    case PeerAccessState::PENDING:
      // Check whether the source device can access the destination device.
      Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s"))
        return Err;

      if (CanAccessPeer) {
        Res = cuCtxEnablePeerAccess(DstDevice.Context, 0);
        if (Res == CUDA_ERROR_TOO_MANY_PEERS) {
          // Resources may be exhausted due to many P2P links.
          CanAccessPeer = 0;
          DP("Too many P2P so fall back to D2D memcpy");
        } else if (auto Err =
                       Plugin::check(Res, "error in cuCtxEnablePeerAccess: %s"))
          return Err;
      }
      // Cache the outcome so the probe only happens once per device pair.
      PeerAccesses[DstDeviceId] = (CanAccessPeer)
                                      ? PeerAccessState::AVAILABLE
                                      : PeerAccessState::UNAVAILABLE;
    }
  }

  CUstream Stream;
  if (auto Err = getStream(AsyncInfoWrapper, Stream))
    return Err;

  if (CanAccessPeer) {
    // TODO: Should we fallback to D2D if peer access fails?
    Res = cuMemcpyPeerAsync(CUDstPtr, Context, CUSrcPtr, DstDevice.Context,
                            Size, Stream);
    return Plugin::check(Res, "error in cuMemcpyPeerAsync: %s");
  }

  // Fallback to D2D copy.
  Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream);
  return Plugin::check(Res, "error in cuMemcpyDtoDAsync: %s");
}
1540
1541template <typename... ArgsTy>
1542static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
1543 CUresult ResultCode = static_cast<CUresult>(Code);
1544 if (ResultCode == CUDA_SUCCESS)
1545 return Plugin::success();
1546
1547 const char *Desc = "Unknown error";
1548 CUresult Ret = cuGetErrorString(ResultCode, &Desc);
1549 if (Ret != CUDA_SUCCESS)
1550 REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
1551
1552 // TODO: Add more entries to this switch
1553 ErrorCode OffloadErrCode;
1554 switch (ResultCode) {
1555 case CUDA_ERROR_NOT_FOUND:
1556 OffloadErrCode = ErrorCode::NOT_FOUND;
1557 break;
1558 default:
1559 OffloadErrCode = ErrorCode::UNKNOWN;
1560 }
1561
1562 // TODO: Create a map for CUDA error codes to Offload error codes
1563 return Plugin::error(OffloadErrCode, ErrFmt, Args..., Desc);
1564}
1565
1566} // namespace plugin
1567} // namespace target
1568} // namespace omp
1569} // namespace llvm
1570
extern "C" {
/// C entry point used by the offload runtime to instantiate the CUDA plugin.
/// Ownership of the returned object passes to the caller.
llvm::omp::target::plugin::GenericPluginTy *createPlugin_cuda() {
  return new llvm::omp::target::plugin::CUDAPluginTy();
}
}
1576

source code of offload/plugins-nextgen/cuda/src/rtl.cpp