1//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// RTL NextGen for CUDA machine
10//
11//===----------------------------------------------------------------------===//
12
13#include <cassert>
14#include <cstddef>
15#include <cuda.h>
16#include <string>
17#include <unordered_map>
18
19#include "Shared/APITypes.h"
20#include "Shared/Debug.h"
21#include "Shared/Environment.h"
22
23#include "GlobalHandler.h"
24#include "OpenMP/OMPT/Callback.h"
25#include "PluginInterface.h"
26#include "Utils/ELF.h"
27
28#include "llvm/BinaryFormat/ELF.h"
29#include "llvm/Frontend/OpenMP/OMPConstants.h"
30#include "llvm/Frontend/OpenMP/OMPGridValues.h"
31#include "llvm/Support/Error.h"
32#include "llvm/Support/FileOutputBuffer.h"
33#include "llvm/Support/FileSystem.h"
34#include "llvm/Support/Program.h"
35
36using namespace error;
37
38namespace llvm {
39namespace omp {
40namespace target {
41namespace plugin {
42
43/// Forward declarations for all specialized data structures.
44struct CUDAKernelTy;
45struct CUDADeviceTy;
46struct CUDAPluginTy;
47
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11000))
/// Forward declarations for all Virtual Memory Management
/// related data structures and functions. This is necessary
/// for older cuda versions. These stubs must never be reached at runtime:
/// supportVAManagement() reports 'false' below CUDA 11, so callers are
/// expected to avoid this path. They now return CUDA_ERROR_NOT_SUPPORTED
/// instead of falling off the end of a non-void function, which was
/// undefined behavior.
typedef void *CUmemGenericAllocationHandle;
typedef void *CUmemAllocationProp;
typedef void *CUmemAccessDesc;
typedef void *CUmemAllocationGranularity_flags;
CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
                             CUdeviceptr addr, unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
                  CUmemGenericAllocationHandle handle,
                  unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
                     const CUmemAllocationProp *prop,
                     unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
                        const CUmemAccessDesc *desc, size_t count) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult
cuMemGetAllocationGranularity(size_t *granularity,
                              const CUmemAllocationProp *prop,
                              CUmemAllocationGranularity_flags option) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif
71
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
// Forward declarations of asynchronous memory management functions. This is
// necessary for older versions of CUDA. The stubs report
// CUDA_ERROR_NOT_SUPPORTED so that callers (e.g. the
// TARGET_ALLOC_DEVICE_NON_BLOCKING paths) fail cleanly; previously these
// non-void functions fell off the end, which is undefined behavior.
CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) {
  *ptr = 0;
  return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif
79
80/// Class implementing the CUDA device images properties.
81struct CUDADeviceImageTy : public DeviceImageTy {
82 /// Create the CUDA image with the id and the target image pointer.
83 CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
84 const __tgt_device_image *TgtImage)
85 : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {}
86
87 /// Load the image as a CUDA module.
88 Error loadModule() {
89 assert(!Module && "Module already loaded");
90
91 CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
92 if (auto Err = Plugin::check(Res, "error in cuModuleLoadDataEx: %s"))
93 return Err;
94
95 return Plugin::success();
96 }
97
98 /// Unload the CUDA module corresponding to the image.
99 Error unloadModule() {
100 assert(Module && "Module not loaded");
101
102 CUresult Res = cuModuleUnload(Module);
103 if (auto Err = Plugin::check(Res, "error in cuModuleUnload: %s"))
104 return Err;
105
106 Module = nullptr;
107
108 return Plugin::success();
109 }
110
111 /// Getter of the CUDA module.
112 CUmodule getModule() const { return Module; }
113
114private:
115 /// The CUDA module that loaded the image.
116 CUmodule Module;
117};
118
119/// Class implementing the CUDA kernel functionalities which derives from the
120/// generic kernel class.
121struct CUDAKernelTy : public GenericKernelTy {
122 /// Create a CUDA kernel with a name and an execution mode.
123 CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}
124
125 /// Initialize the CUDA kernel.
126 Error initImpl(GenericDeviceTy &GenericDevice,
127 DeviceImageTy &Image) override {
128 CUresult Res;
129 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
130
131 // Retrieve the function pointer of the kernel.
132 Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
133 if (auto Err = Plugin::check(Res, "error in cuModuleGetFunction('%s'): %s",
134 getName()))
135 return Err;
136
137 // Check that the function pointer is valid.
138 if (!Func)
139 return Plugin::error(ErrorCode::INVALID_BINARY,
140 "invalid function for kernel %s", getName());
141
142 int MaxThreads;
143 Res = cuFuncGetAttribute(&MaxThreads,
144 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
145 if (auto Err = Plugin::check(Res, "error in cuFuncGetAttribute: %s"))
146 return Err;
147
148 // The maximum number of threads cannot exceed the maximum of the kernel.
149 MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
150
151 return Plugin::success();
152 }
153
154 /// Launch the CUDA kernel function.
155 Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
156 uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
157 KernelLaunchParamsTy LaunchParams,
158 AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
159
160private:
161 /// The CUDA kernel function to execute.
162 CUfunction Func;
163 /// The maximum amount of dynamic shared memory per thread group. By default,
164 /// this is set to 48 KB.
165 mutable uint32_t MaxDynCGroupMemLimit = 49152;
166};
167
168/// Class wrapping a CUDA stream reference. These are the objects handled by the
169/// Stream Manager for the CUDA plugin.
170struct CUDAStreamRef final : public GenericDeviceResourceRef {
171 /// The underlying handle type for streams.
172 using HandleTy = CUstream;
173
174 /// Create an empty reference to an invalid stream.
175 CUDAStreamRef() : Stream(nullptr) {}
176
177 /// Create a reference to an existing stream.
178 CUDAStreamRef(HandleTy Stream) : Stream(Stream) {}
179
180 /// Create a new stream and save the reference. The reference must be empty
181 /// before calling to this function.
182 Error create(GenericDeviceTy &Device) override {
183 if (Stream)
184 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
185 "creating an existing stream");
186
187 CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
188 if (auto Err = Plugin::check(Res, "error in cuStreamCreate: %s"))
189 return Err;
190
191 return Plugin::success();
192 }
193
194 /// Destroy the referenced stream and invalidate the reference. The reference
195 /// must be to a valid stream before calling to this function.
196 Error destroy(GenericDeviceTy &Device) override {
197 if (!Stream)
198 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
199 "destroying an invalid stream");
200
201 CUresult Res = cuStreamDestroy(Stream);
202 if (auto Err = Plugin::check(Res, "error in cuStreamDestroy: %s"))
203 return Err;
204
205 Stream = nullptr;
206 return Plugin::success();
207 }
208
209 /// Get the underlying CUDA stream.
210 operator HandleTy() const { return Stream; }
211
212private:
213 /// The reference to the CUDA stream.
214 HandleTy Stream;
215};
216
217/// Class wrapping a CUDA event reference. These are the objects handled by the
218/// Event Manager for the CUDA plugin.
219struct CUDAEventRef final : public GenericDeviceResourceRef {
220 /// The underlying handle type for events.
221 using HandleTy = CUevent;
222
223 /// Create an empty reference to an invalid event.
224 CUDAEventRef() : Event(nullptr) {}
225
226 /// Create a reference to an existing event.
227 CUDAEventRef(HandleTy Event) : Event(Event) {}
228
229 /// Create a new event and save the reference. The reference must be empty
230 /// before calling to this function.
231 Error create(GenericDeviceTy &Device) override {
232 if (Event)
233 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
234 "creating an existing event");
235
236 CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT);
237 if (auto Err = Plugin::check(Res, "error in cuEventCreate: %s"))
238 return Err;
239
240 return Plugin::success();
241 }
242
243 /// Destroy the referenced event and invalidate the reference. The reference
244 /// must be to a valid event before calling to this function.
245 Error destroy(GenericDeviceTy &Device) override {
246 if (!Event)
247 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
248 "destroying an invalid event");
249
250 CUresult Res = cuEventDestroy(Event);
251 if (auto Err = Plugin::check(Res, "error in cuEventDestroy: %s"))
252 return Err;
253
254 Event = nullptr;
255 return Plugin::success();
256 }
257
258 /// Get the underlying CUevent.
259 operator HandleTy() const { return Event; }
260
261private:
262 /// The reference to the CUDA event.
263 HandleTy Event;
264};
265
/// Class implementing the CUDA device functionalities which derives from the
/// generic device class.
struct CUDADeviceTy : public GenericDeviceTy {
  // Create a CUDA device with a device id and the default CUDA grid values.
  // The stream and event managers are bound to this device here but are only
  // populated later in initImpl().
  CUDADeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices)
      : GenericDeviceTy(Plugin, DeviceId, NumDevices, NVPTXGridValues),
        CUDAStreamManager(*this), CUDAEventManager(*this) {}

  // Driver resources are released explicitly in deinitImpl(); nothing to do
  // in the destructor.
  ~CUDADeviceTy() {}
  /// Initialize the device, its resources and get its properties.
  ///
  /// Acquires the device handle, configures and retains the primary context,
  /// initializes the stream/event pools, and queries grid limits, compute
  /// capability and hardware parallelism.
  Error initImpl(GenericPluginTy &Plugin) override {
    // Resolve the CUDA driver handle for this device number.
    CUresult Res = cuDeviceGet(&Device, DeviceId);
    if (auto Err = Plugin::check(Res, "error in cuDeviceGet: %s"))
      return Err;

    // Query the current flags of the primary context and set its flags if
    // it is inactive.
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (auto Err =
            Plugin::check(Res, "error in cuDevicePrimaryCtxGetState: %s"))
      return Err;

    if (FormerPrimaryCtxIsActive) {
      // Another component already activated the primary context; do not alter
      // its scheduling flags, only warn when they differ from the preferred
      // blocking-sync mode.
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
             "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is inactive, set its flags to "
           "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (auto Err =
              Plugin::check(Res, "error in cuDevicePrimaryCtxSetFlags: %s"))
        return Err;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected. The matching release happens in deinitImpl().
    Res = cuDevicePrimaryCtxRetain(&Context, Device);
    if (auto Err = Plugin::check(Res, "error in cuDevicePrimaryCtxRetain: %s"))
      return Err;

    // Make the retained context current before creating streams/events in it.
    if (auto Err = setContext())
      return Err;

    // Initialize stream pool.
    if (auto Err = CUDAStreamManager.init(OMPX_InitialNumStreams))
      return Err;

    // Initialize event pool.
    if (auto Err = CUDAEventManager.init(OMPX_InitialNumEvents))
      return Err;

    // Query attributes to determine number of threads/block and blocks/grid.
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                 GridValues.GV_Max_Teams))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                 GridValues.GV_Max_WG_Size))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                 GridValues.GV_Warp_Size))
      return Err;

    // Cache the device compute capability (major/minor).
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                 ComputeCapability.Major))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                 ComputeCapability.Minor))
      return Err;

    // Hardware parallelism: SM count times the number of warps that fit when
    // the maximum number of threads is scheduled on each SM.
    uint32_t NumMuliprocessors = 0;
    uint32_t MaxThreadsPerSM = 0;
    uint32_t WarpSize = 0;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                 NumMuliprocessors))
      return Err;
    if (auto Err =
            getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                          MaxThreadsPerSM))
      return Err;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
      return Err;
    HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);

    return Plugin::success();
  }
363
364 Error unloadBinaryImpl(DeviceImageTy *Image) override {
365 assert(Context && "Invalid CUDA context");
366
367 // Each image has its own module.
368 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);
369
370 // Unload the module of the image.
371 if (auto Err = CUDAImage.unloadModule())
372 return Err;
373
374 return Plugin::success();
375 }
376
377 /// Deinitialize the device and release its resources.
378 Error deinitImpl() override {
379 if (Context) {
380 if (auto Err = setContext())
381 return Err;
382 }
383
384 // Deinitialize the stream manager.
385 if (auto Err = CUDAStreamManager.deinit())
386 return Err;
387
388 if (auto Err = CUDAEventManager.deinit())
389 return Err;
390
391 if (Context) {
392 CUresult Res = cuDevicePrimaryCtxRelease(Device);
393 if (auto Err =
394 Plugin::check(Res, "error in cuDevicePrimaryCtxRelease: %s"))
395 return Err;
396 }
397
398 // Invalidate context and device references.
399 Context = nullptr;
400 Device = CU_DEVICE_INVALID;
401
402 return Plugin::success();
403 }
404
405 virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
406 DeviceImageTy &Image) override {
407 // Check for the presence of global destructors at initialization time. This
408 // is required when the image may be deallocated before destructors are run.
409 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
410 if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
411 Image.setPendingGlobalDtors();
412
413 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
414 }
415
416 virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
417 DeviceImageTy &Image) override {
418 if (Image.hasPendingGlobalDtors())
419 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
420 return Plugin::success();
421 }
422
423 Expected<std::unique_ptr<MemoryBuffer>>
424 doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
425 // TODO: We should be able to use the 'nvidia-ptxjitcompiler' interface to
426 // avoid the call to 'ptxas'.
427 SmallString<128> PTXInputFilePath;
428 std::error_code EC = sys::fs::createTemporaryFile(Prefix: "nvptx-pre-link-jit", Suffix: "s",
429 ResultPath&: PTXInputFilePath);
430 if (EC)
431 return Plugin::error(ErrorCode::HOST_IO,
432 "failed to create temporary file for ptxas");
433
434 // Write the file's contents to the output file.
435 Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
436 FileOutputBuffer::create(FilePath: PTXInputFilePath, Size: MB->getBuffer().size());
437 if (!OutputOrErr)
438 return OutputOrErr.takeError();
439 std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
440 llvm::copy(MB->getBuffer(), Output->getBufferStart());
441 if (Error E = Output->commit())
442 return std::move(E);
443
444 SmallString<128> PTXOutputFilePath;
445 EC = sys::fs::createTemporaryFile(Prefix: "nvptx-post-link-jit", Suffix: "cubin",
446 ResultPath&: PTXOutputFilePath);
447 if (EC)
448 return Plugin::error(ErrorCode::HOST_IO,
449 "failed to create temporary file for ptxas");
450
451 // Try to find `ptxas` in the path to compile the PTX to a binary.
452 const auto ErrorOrPath = sys::findProgramByName(Name: "ptxas");
453 if (!ErrorOrPath)
454 return Plugin::error(ErrorCode::HOST_TOOL_NOT_FOUND,
455 "failed to find 'ptxas' on the PATH.");
456
457 std::string Arch = getComputeUnitKind();
458 StringRef Args[] = {*ErrorOrPath,
459 "-m64",
460 "-O2",
461 "--gpu-name",
462 Arch,
463 "--output-file",
464 PTXOutputFilePath,
465 PTXInputFilePath};
466
467 std::string ErrMsg;
468 if (sys::ExecuteAndWait(*ErrorOrPath, Args, std::nullopt, {}, 0, 0,
469 &ErrMsg))
470 return Plugin::error(ErrorCode::ASSEMBLE_FAILURE,
471 "running 'ptxas' failed: %s\n", ErrMsg.c_str());
472
473 auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(PTXOutputFilePath.data());
474 if (!BufferOrErr)
475 return Plugin::error(ErrorCode::HOST_IO,
476 "failed to open temporary file for ptxas");
477
478 // Clean up the temporary files afterwards.
479 if (sys::fs::remove(PTXOutputFilePath))
480 return Plugin::error(ErrorCode::HOST_IO,
481 "failed to remove temporary file for ptxas");
482 if (sys::fs::remove(PTXInputFilePath))
483 return Plugin::error(ErrorCode::HOST_IO,
484 "failed to remove temporary file for ptxas");
485
486 return std::move(*BufferOrErr);
487 }
488
489 /// Allocate and construct a CUDA kernel.
490 Expected<GenericKernelTy &> constructKernel(const char *Name) override {
491 // Allocate and construct the CUDA kernel.
492 CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
493 if (!CUDAKernel)
494 return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
495 "failed to allocate memory for CUDA kernel");
496
497 new (CUDAKernel) CUDAKernelTy(Name);
498
499 return *CUDAKernel;
500 }
501
  /// Set the current context to this device's context. The context was
  /// retained via cuDevicePrimaryCtxRetain() during initImpl().
  Error setContext() override {
    CUresult Res = cuCtxSetCurrent(Context);
    return Plugin::check(Res, "error in cuCtxSetCurrent: %s");
  }

  /// NVIDIA returns the product of the SM count and the number of warps that
  /// fit if the maximum number of threads were scheduled on each SM.
  /// Computed once in initImpl().
  uint64_t getHardwareParallelism() const override {
    return HardwareParallelism;
  }

  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
  bool shouldSetupRPCServer() const override { return true; }

  /// The RPC interface should have enough space for all available parallelism.
  uint64_t requestedRPCPortCount() const override {
    return getHardwareParallelism();
  }
522
523 /// Get the stream of the asynchronous info structure or get a new one.
524 Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
525 // Get the stream (if any) from the async info.
526 Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
527 if (!Stream) {
528 // There was no stream; get an idle one.
529 if (auto Err = CUDAStreamManager.getResource(Stream))
530 return Err;
531
532 // Modify the async info's stream.
533 AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
534 }
535 return Plugin::success();
536 }
537
  /// Getters of CUDA references.
  /// The primary context retained in initImpl().
  CUcontext getCUDAContext() const { return Context; }
  /// The driver device handle obtained via cuDeviceGet().
  CUdevice getCUDADevice() const { return Device; }
541
542 /// Load the binary image into the device and allocate an image object.
543 Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
544 int32_t ImageId) override {
545 if (auto Err = setContext())
546 return std::move(Err);
547
548 // Allocate and initialize the image object.
549 CUDADeviceImageTy *CUDAImage = Plugin.allocate<CUDADeviceImageTy>();
550 new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage);
551
552 // Load the CUDA module.
553 if (auto Err = CUDAImage->loadModule())
554 return std::move(Err);
555
556 return CUDAImage;
557 }
558
559 /// Allocate memory on the device or related to the device.
560 void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
561 if (Size == 0)
562 return nullptr;
563
564 if (auto Err = setContext()) {
565 REPORT("Failure to alloc memory: %s\n", toString(E: std::move(Err)).data());
566 return nullptr;
567 }
568
569 void *MemAlloc = nullptr;
570 CUdeviceptr DevicePtr;
571 CUresult Res;
572
573 switch (Kind) {
574 case TARGET_ALLOC_DEFAULT:
575 case TARGET_ALLOC_DEVICE:
576 Res = cuMemAlloc(&DevicePtr, Size);
577 MemAlloc = (void *)DevicePtr;
578 break;
579 case TARGET_ALLOC_HOST:
580 Res = cuMemAllocHost(&MemAlloc, Size);
581 break;
582 case TARGET_ALLOC_SHARED:
583 Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
584 MemAlloc = (void *)DevicePtr;
585 break;
586 case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
587 CUstream Stream;
588 if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
589 break;
590 if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
591 break;
592 cuStreamSynchronize(Stream);
593 Res = cuStreamDestroy(Stream);
594 MemAlloc = (void *)DevicePtr;
595 }
596 }
597
598 if (auto Err =
599 Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) {
600 REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
601 return nullptr;
602 }
603 return MemAlloc;
604 }
605
606 /// Deallocate memory on the device or related to the device.
607 int free(void *TgtPtr, TargetAllocTy Kind) override {
608 if (TgtPtr == nullptr)
609 return OFFLOAD_SUCCESS;
610
611 if (auto Err = setContext()) {
612 REPORT("Failure to free memory: %s\n", toString(E: std::move(Err)).data());
613 return OFFLOAD_FAIL;
614 }
615
616 CUresult Res;
617 switch (Kind) {
618 case TARGET_ALLOC_DEFAULT:
619 case TARGET_ALLOC_DEVICE:
620 case TARGET_ALLOC_SHARED:
621 Res = cuMemFree((CUdeviceptr)TgtPtr);
622 break;
623 case TARGET_ALLOC_HOST:
624 Res = cuMemFreeHost(TgtPtr);
625 break;
626 case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
627 CUstream Stream;
628 if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
629 break;
630 cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
631 cuStreamSynchronize(Stream);
632 if ((Res = cuStreamDestroy(Stream)))
633 break;
634 }
635 }
636
637 if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) {
638 REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
639 return OFFLOAD_FAIL;
640 }
641 return OFFLOAD_SUCCESS;
642 }
643
644 /// Synchronize current thread with the pending operations on the async info.
645 Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
646 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
647 CUresult Res;
648 Res = cuStreamSynchronize(Stream);
649
650 // Once the stream is synchronized, return it to stream pool and reset
651 // AsyncInfo. This is to make sure the synchronization only works for its
652 // own tasks.
653 AsyncInfo.Queue = nullptr;
654 if (auto Err = CUDAStreamManager.returnResource(Stream))
655 return Err;
656
657 return Plugin::check(Res, "error in cuStreamSynchronize: %s");
658 }
659
  /// CUDA support VA management. Only available from CUDA 11 on, where the
  /// cuMemCreate/cuMemMap family of driver calls exists; otherwise the stubs
  /// at the top of this file would be used.
  bool supportVAManagement() const override {
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11000))
    return true;
#else
    return false;
#endif
  }
668
669 /// Allocates \p RSize bytes (rounded up to page size) and hints the cuda
670 /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
671 /// At return \p RSize contains the actual size
672 Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
673 CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
674 auto IHandle = DeviceMMaps.find(DVAddr);
675 size_t Size = *RSize;
676
677 if (Size == 0)
678 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
679 "memory Map Size must be larger than 0");
680
681 // Check if we have already mapped this address
682 if (IHandle != DeviceMMaps.end())
683 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
684 "address already memory mapped");
685
686 CUmemAllocationProp Prop = {};
687 size_t Granularity = 0;
688
689 size_t Free, Total;
690 CUresult Res = cuMemGetInfo(&Free, &Total);
691 if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
692 return Err;
693
694 if (Size >= Free) {
695 *Addr = nullptr;
696 return Plugin::error(
697 ErrorCode::OUT_OF_RESOURCES,
698 "cannot map memory size larger than the available device memory");
699 }
700
701 // currently NVidia only supports pinned device types
702 Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
703 Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
704
705 Prop.location.id = DeviceId;
706 cuMemGetAllocationGranularity(&Granularity, &Prop,
707 CU_MEM_ALLOC_GRANULARITY_MINIMUM);
708 if (auto Err =
709 Plugin::check(Res, "error in cuMemGetAllocationGranularity: %s"))
710 return Err;
711
712 if (Granularity == 0)
713 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
714 "wrong device Page size");
715
716 // Ceil to page size.
717 Size = utils::roundUp(Size, Granularity);
718
719 // Create a handler of our allocation
720 CUmemGenericAllocationHandle AHandle;
721 Res = cuMemCreate(&AHandle, Size, &Prop, 0);
722 if (auto Err = Plugin::check(Res, "error in cuMemCreate: %s"))
723 return Err;
724
725 CUdeviceptr DevPtr = 0;
726 Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
727 if (auto Err = Plugin::check(Res, "error in cuMemAddressReserve: %s"))
728 return Err;
729
730 Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
731 if (auto Err = Plugin::check(Res, "error in cuMemMap: %s"))
732 return Err;
733
734 CUmemAccessDesc ADesc = {};
735 ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
736 ADesc.location.id = DeviceId;
737 ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
738
739 // Sets address
740 Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
741 if (auto Err = Plugin::check(Res, "error in cuMemSetAccess: %s"))
742 return Err;
743
744 *Addr = reinterpret_cast<void *>(DevPtr);
745 *RSize = Size;
746 DeviceMMaps.insert({DevPtr, AHandle});
747 return Plugin::success();
748 }
749
750 /// De-allocates device memory and Unmaps the Virtual Addr
751 Error memoryVAUnMap(void *VAddr, size_t Size) override {
752 CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
753 auto IHandle = DeviceMMaps.find(DVAddr);
754 // Mapping does not exist
755 if (IHandle == DeviceMMaps.end()) {
756 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
757 "addr is not MemoryMapped");
758 }
759
760 if (IHandle == DeviceMMaps.end())
761 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
762 "addr is not MemoryMapped");
763
764 CUmemGenericAllocationHandle &AllocHandle = IHandle->second;
765
766 CUresult Res = cuMemUnmap(DVAddr, Size);
767 if (auto Err = Plugin::check(Res, "error in cuMemUnmap: %s"))
768 return Err;
769
770 Res = cuMemRelease(AllocHandle);
771 if (auto Err = Plugin::check(Res, "error in cuMemRelease: %s"))
772 return Err;
773
774 Res = cuMemAddressFree(DVAddr, Size);
775 if (auto Err = Plugin::check(Res, "error in cuMemAddressFree: %s"))
776 return Err;
777
778 DeviceMMaps.erase(IHandle);
779 return Plugin::success();
780 }
781
782 /// Query for the completion of the pending operations on the async info.
783 Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
784 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
785 CUresult Res = cuStreamQuery(Stream);
786
787 // Not ready streams must be considered as successful operations.
788 if (Res == CUDA_ERROR_NOT_READY)
789 return Plugin::success();
790
791 // Once the stream is synchronized and the operations completed (or an error
792 // occurs), return it to stream pool and reset AsyncInfo. This is to make
793 // sure the synchronization only works for its own tasks.
794 AsyncInfo.Queue = nullptr;
795 if (auto Err = CUDAStreamManager.returnResource(Stream))
796 return Err;
797
798 return Plugin::check(Res, "error in cuStreamQuery: %s");
799 }
800
  /// "Lock" (pin) a host buffer. Currently a pass-through: the pointer is
  /// returned unchanged and no registration is performed.
  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
    // TODO: Register the buffer as CUDA host memory.
    return HstPtr;
  }

  /// Counterpart of dataLockImpl; nothing to undo since no memory is pinned.
  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }

  /// Report whether the host pointer is pinned. Always false until pinning is
  /// implemented; the out-parameters are left untouched.
  Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
                                 void *&BaseDevAccessiblePtr,
                                 size_t &BaseSize) const override {
    // TODO: Implement pinning feature for CUDA.
    return false;
  }
814
815 /// Submit data to the device (host to device transfer).
816 Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
817 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
818 if (auto Err = setContext())
819 return Err;
820
821 CUstream Stream;
822 if (auto Err = getStream(AsyncInfoWrapper, Stream))
823 return Err;
824
825 CUresult Res = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
826 return Plugin::check(Res, "error in cuMemcpyHtoDAsync: %s");
827 }
828
829 /// Retrieve data from the device (device to host transfer).
830 Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
831 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
832 if (auto Err = setContext())
833 return Err;
834
835 CUstream Stream;
836 if (auto Err = getStream(AsyncInfoWrapper, Stream))
837 return Err;
838
839 CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
840 return Plugin::check(Res, "error in cuMemcpyDtoHAsync: %s");
841 }
842
  /// Exchange data between two devices directly. We may use peer access if
  /// the CUDA devices and driver allow them. (Defined out of line; only the
  /// declaration is visible here.)
  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                         void *DstPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;
848
849 /// Initialize the async info for interoperability purposes.
850 Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
851 if (auto Err = setContext())
852 return Err;
853
854 CUstream Stream;
855 if (auto Err = getStream(AsyncInfoWrapper, Stream))
856 return Err;
857
858 return Plugin::success();
859 }
860
861 /// Initialize the device info for interoperability purposes.
862 Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
863 assert(Context && "Context is null");
864 assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");
865
866 if (auto Err = setContext())
867 return Err;
868
869 if (!DeviceInfo->Context)
870 DeviceInfo->Context = Context;
871
872 if (!DeviceInfo->Device)
873 DeviceInfo->Device = reinterpret_cast<void *>(Device);
874
875 return Plugin::success();
876 }
877
  /// Create an event. The pointed-to storage receives a CUevent handle taken
  /// from the event pool.
  Error createEventImpl(void **EventPtrStorage) override {
    CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
    return CUDAEventManager.getResource(*Event);
  }

  /// Destroy a previously created event by returning it to the event pool.
  Error destroyEventImpl(void *EventPtr) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
    return CUDAEventManager.returnResource(Event);
  }
889
890 /// Record the event.
891 Error recordEventImpl(void *EventPtr,
892 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
893 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
894
895 CUstream Stream;
896 if (auto Err = getStream(AsyncInfoWrapper, Stream))
897 return Err;
898
899 CUresult Res = cuEventRecord(Event, Stream);
900 return Plugin::check(Res, "error in cuEventRecord: %s");
901 }
902
903 /// Make the stream wait on the event.
904 Error waitEventImpl(void *EventPtr,
905 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
906 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
907
908 CUstream Stream;
909 if (auto Err = getStream(AsyncInfoWrapper, Stream))
910 return Err;
911
912 // Do not use CU_EVENT_WAIT_DEFAULT here as it is only available from
913 // specific CUDA version, and defined as 0x0. In previous version, per CUDA
914 // API document, that argument has to be 0x0.
915 CUresult Res = cuStreamWaitEvent(Stream, Event, 0);
916 return Plugin::check(Res, "error in cuStreamWaitEvent: %s");
917 }
918
919 /// Synchronize the current thread with the event.
920 Error syncEventImpl(void *EventPtr) override {
921 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
922 CUresult Res = cuEventSynchronize(Event);
923 return Plugin::check(Res, "error in cuEventSynchronize: %s");
924 }
925
926 /// Print information about the device.
927 Expected<InfoTreeNode> obtainInfoImpl() override {
928 char TmpChar[1000];
929 const char *TmpCharPtr;
930 size_t TmpSt;
931 int TmpInt;
932 InfoTreeNode Info;
933
934 CUresult Res = cuDriverGetVersion(&TmpInt);
935 if (Res == CUDA_SUCCESS)
936 // For consistency with other drivers, store the version as a string
937 // rather than an integer
938 Info.add("CUDA Driver Version", std::to_string(val: TmpInt));
939
940 Info.add("CUDA OpenMP Device Number", DeviceId);
941
942 Res = cuDeviceGetName(TmpChar, 1000, Device);
943 if (Res == CUDA_SUCCESS)
944 Info.add("Device Name", TmpChar);
945
946 Info.add("Vendor Name", "NVIDIA");
947
948 Res = cuDeviceTotalMem(&TmpSt, Device);
949 if (Res == CUDA_SUCCESS)
950 Info.add("Global Memory Size", TmpSt, "bytes");
951
952 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
953 if (Res == CUDA_SUCCESS)
954 Info.add("Number of Multiprocessors", TmpInt);
955
956 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
957 if (Res == CUDA_SUCCESS)
958 Info.add("Concurrent Copy and Execution", (bool)TmpInt);
959
960 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
961 if (Res == CUDA_SUCCESS)
962 Info.add("Total Constant Memory", TmpInt, "bytes");
963
964 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
965 TmpInt);
966 if (Res == CUDA_SUCCESS)
967 Info.add("Max Shared Memory per Block", TmpInt, "bytes");
968
969 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
970 if (Res == CUDA_SUCCESS)
971 Info.add("Registers per Block", TmpInt);
972
973 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
974 if (Res == CUDA_SUCCESS)
975 Info.add("Warp Size", TmpInt);
976
977 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
978 if (Res == CUDA_SUCCESS)
979 Info.add("Maximum Threads per Block", TmpInt);
980
981 auto &MaxBlock = *Info.add("Maximum Block Dimensions", "");
982 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
983 if (Res == CUDA_SUCCESS)
984 MaxBlock.add("x", TmpInt);
985 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
986 if (Res == CUDA_SUCCESS)
987 MaxBlock.add("y", TmpInt);
988 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
989 if (Res == CUDA_SUCCESS)
990 MaxBlock.add("z", TmpInt);
991
992 auto &MaxGrid = *Info.add("Maximum Grid Dimensions", "");
993 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
994 if (Res == CUDA_SUCCESS)
995 MaxGrid.add("x", TmpInt);
996 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
997 if (Res == CUDA_SUCCESS)
998 MaxGrid.add("y", TmpInt);
999 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
1000 if (Res == CUDA_SUCCESS)
1001 MaxGrid.add("z", TmpInt);
1002
1003 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
1004 if (Res == CUDA_SUCCESS)
1005 Info.add("Maximum Memory Pitch", TmpInt, "bytes");
1006
1007 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
1008 if (Res == CUDA_SUCCESS)
1009 Info.add("Texture Alignment", TmpInt, "bytes");
1010
1011 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
1012 if (Res == CUDA_SUCCESS)
1013 Info.add("Clock Rate", TmpInt, "kHz");
1014
1015 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
1016 if (Res == CUDA_SUCCESS)
1017 Info.add("Execution Timeout", (bool)TmpInt);
1018
1019 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
1020 if (Res == CUDA_SUCCESS)
1021 Info.add("Integrated Device", (bool)TmpInt);
1022
1023 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
1024 if (Res == CUDA_SUCCESS)
1025 Info.add("Can Map Host Memory", (bool)TmpInt);
1026
1027 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
1028 if (Res == CUDA_SUCCESS) {
1029 if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1030 TmpCharPtr = "Default";
1031 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1032 TmpCharPtr = "Prohibited";
1033 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1034 TmpCharPtr = "Exclusive process";
1035 else
1036 TmpCharPtr = "Unknown";
1037 Info.add("Compute Mode", TmpCharPtr);
1038 }
1039
1040 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
1041 if (Res == CUDA_SUCCESS)
1042 Info.add("Concurrent Kernels", (bool)TmpInt);
1043
1044 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
1045 if (Res == CUDA_SUCCESS)
1046 Info.add("ECC Enabled", (bool)TmpInt);
1047
1048 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
1049 if (Res == CUDA_SUCCESS)
1050 Info.add("Memory Clock Rate", TmpInt, "kHz");
1051
1052 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
1053 if (Res == CUDA_SUCCESS)
1054 Info.add("Memory Bus Width", TmpInt, "bits");
1055
1056 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
1057 if (Res == CUDA_SUCCESS)
1058 Info.add("L2 Cache Size", TmpInt, "bytes");
1059
1060 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1061 TmpInt);
1062 if (Res == CUDA_SUCCESS)
1063 Info.add("Max Threads Per SMP", TmpInt);
1064
1065 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
1066 if (Res == CUDA_SUCCESS)
1067 Info.add("Async Engines", TmpInt);
1068
1069 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
1070 if (Res == CUDA_SUCCESS)
1071 Info.add("Unified Addressing", (bool)TmpInt);
1072
1073 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
1074 if (Res == CUDA_SUCCESS)
1075 Info.add("Managed Memory", (bool)TmpInt);
1076
1077 Res =
1078 getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
1079 if (Res == CUDA_SUCCESS)
1080 Info.add("Concurrent Managed Memory", (bool)TmpInt);
1081
1082 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
1083 TmpInt);
1084 if (Res == CUDA_SUCCESS)
1085 Info.add("Preemption Supported", (bool)TmpInt);
1086
1087 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
1088 if (Res == CUDA_SUCCESS)
1089 Info.add("Cooperative Launch", (bool)TmpInt);
1090
1091 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
1092 if (Res == CUDA_SUCCESS)
1093 Info.add("Multi-Device Boars", (bool)TmpInt);
1094
1095 Info.add("Compute Capabilities", ComputeCapability.str());
1096
1097 return Info;
1098 }
1099
  /// Whether the generic plugin should set up its own device memory pool.
  /// Opted out: device-side allocations go through CUDA's native malloc.
  virtual bool shouldSetupDeviceMemoryPool() const override {
    /// We use the CUDA malloc for now.
    return false;
  }
1104
  /// Getters and setters for stack and heap sizes.
  /// Read the per-thread stack size limit (bytes) from the CUDA context.
  Error getDeviceStackSize(uint64_t &Value) override {
    return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
  }
  /// Set the per-thread stack size limit (bytes) on the CUDA context.
  Error setDeviceStackSize(uint64_t Value) override {
    return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
  }
  /// Read the device malloc heap size limit (bytes) from the CUDA context.
  Error getDeviceHeapSize(uint64_t &Value) override {
    return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
  }
  /// Set the device malloc heap size limit (bytes) on the CUDA context.
  Error setDeviceHeapSize(uint64_t Value) override {
    return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
  }
  /// Total device memory in bytes, straight from cuDeviceTotalMem.
  Error getDeviceMemorySize(uint64_t &Value) override {
    CUresult Res = cuDeviceTotalMem(&Value, Device);
    return Plugin::check(Res, "error in getDeviceMemorySize %s");
  }
1122
  /// CUDA-specific functions for getting and setting context limits.
  /// Set a CUcontext resource limit (e.g. stack or malloc-heap size).
  Error setCtxLimit(CUlimit Kind, uint64_t Value) {
    CUresult Res = cuCtxSetLimit(Kind, Value);
    return Plugin::check(Res, "error in cuCtxSetLimit: %s");
  }
  /// Read a CUcontext resource limit into \p Value.
  Error getCtxLimit(CUlimit Kind, uint64_t &Value) {
    CUresult Res = cuCtxGetLimit(&Value, Kind);
    return Plugin::check(Res, "error in cuCtxGetLimit: %s");
  }
1132
  /// CUDA-specific function to get device attributes.
  /// Query a device attribute into \p Value; \p Kind must be a valid
  /// CUdevice_attribute enumerator.
  Error getDeviceAttr(uint32_t Kind, uint32_t &Value) {
    // TODO: Warn if the new value is larger than the old.
    // NOTE(review): the TODO above looks copy-pasted from a setter; this is a
    // pure query with no "old" value to compare -- confirm and drop.
    CUresult Res =
        cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
    return Plugin::check(Res, "error in cuDeviceGetAttribute: %s");
  }
1140
  /// Raw variant of getDeviceAttr: returns the CUresult unwrapped so callers
  /// can silently skip attributes the driver does not support.
  CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
    return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
  }
1144
  /// See GenericDeviceTy::getComputeUnitKind().
  /// Returns the compute-capability string, e.g. "sm_70".
  std::string getComputeUnitKind() const override {
    return ComputeCapability.str();
  }
1149
  /// Returns the clock frequency for the given NVPTX device.
  /// Hard-coded to 1 GHz -- presumably the tick rate of the device's
  /// nanosecond timer rather than the SM clock; TODO confirm against callers.
  uint64_t getClockFrequency() const override { return 1000000000; }
1152
1153private:
1154 using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
1155 using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
1156
1157 Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
1158 bool IsCtor) {
1159 const char *KernelName = IsCtor ? "nvptx$device$init" : "nvptx$device$fini";
1160 // Perform a quick check for the named kernel in the image. The kernel
1161 // should be created by the 'nvptx-lower-ctor-dtor' pass.
1162 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
1163 if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
1164 return Plugin::success();
1165
1166 // The Nvidia backend cannot handle creating the ctor / dtor array
1167 // automatically so we must create it ourselves. The backend will emit
1168 // several globals that contain function pointers we can call. These are
1169 // prefixed with a known name due to Nvidia's lack of section support.
1170 auto ELFObjOrErr = Handler.getELFObjectFile(Image);
1171 if (!ELFObjOrErr)
1172 return ELFObjOrErr.takeError();
1173
1174 // Search for all symbols that contain a constructor or destructor.
1175 SmallVector<std::pair<StringRef, uint16_t>> Funcs;
1176 for (ELFSymbolRef Sym : (*ELFObjOrErr)->symbols()) {
1177 auto NameOrErr = Sym.getName();
1178 if (!NameOrErr)
1179 return NameOrErr.takeError();
1180
1181 if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
1182 : "__fini_array_object_"))
1183 continue;
1184
1185 uint16_t Priority;
1186 if (NameOrErr->rsplit('_').second.getAsInteger(10, Priority))
1187 return Plugin::error(ErrorCode::INVALID_BINARY,
1188 "invalid priority for constructor or destructor");
1189
1190 Funcs.emplace_back(*NameOrErr, Priority);
1191 }
1192
1193 // Sort the created array to be in priority order.
1194 llvm::sort(C&: Funcs, Comp: [=](auto X, auto Y) { return X.second < Y.second; });
1195
1196 // Allocate a buffer to store all of the known constructor / destructor
1197 // functions in so we can iterate them on the device.
1198 void *Buffer =
1199 allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE);
1200 if (!Buffer)
1201 return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
1202 "failed to allocate memory for global buffer");
1203
1204 auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
1205 auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();
1206
1207 SmallVector<void *> FunctionPtrs(Funcs.size());
1208 std::size_t Idx = 0;
1209 for (auto [Name, Priority] : Funcs) {
1210 GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]);
1211 if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
1212 return Err;
1213 }
1214
1215 // Copy the local buffer to the device.
1216 if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(),
1217 FunctionPtrs.size() * sizeof(void *), nullptr))
1218 return Err;
1219
1220 // Copy the created buffer to the appropriate symbols so the kernel can
1221 // iterate through them.
1222 GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
1223 sizeof(void *), &GlobalPtrStart);
1224 if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
1225 return Err;
1226
1227 GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
1228 sizeof(void *), &GlobalPtrStop);
1229 if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
1230 return Err;
1231
1232 CUDAKernelTy CUDAKernel(KernelName);
1233
1234 if (auto Err = CUDAKernel.init(*this, Image))
1235 return Err;
1236
1237 AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
1238
1239 KernelArgsTy KernelArgs = {};
1240 uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
1241 if (auto Err = CUDAKernel.launchImpl(
1242 *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
1243 KernelLaunchParamsTy{}, AsyncInfoWrapper))
1244 return Err;
1245
1246 Error Err = Plugin::success();
1247 AsyncInfoWrapper.finalize(Err);
1248
1249 if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
1250 return Plugin::error(ErrorCode::UNKNOWN,
1251 "failed to free memory for global buffer");
1252
1253 return Err;
1254 }
1255
  /// Stream manager providing pooled CUDA streams for async operations.
  CUDAStreamManagerTy CUDAStreamManager;

  /// Event manager providing pooled CUDA events.
  CUDAEventManagerTy CUDAEventManager;

  /// The device's context. This context should be set before performing
  /// operations on the device.
  CUcontext Context = nullptr;

  /// The CUDA device handle.
  CUdevice Device = CU_DEVICE_INVALID;

  /// The memory-mapped addresses and their allocation handles.
  std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;
1271
1272 /// The compute capability of the corresponding CUDA device.
1273 struct ComputeCapabilityTy {
1274 uint32_t Major;
1275 uint32_t Minor;
1276 std::string str() const {
1277 return "sm_" + std::to_string(val: Major * 10 + Minor);
1278 }
1279 } ComputeCapability;
1280
1281 /// The maximum number of warps that can be resident on all the SMs
1282 /// simultaneously.
1283 uint32_t HardwareParallelism = 0;
1284};
1285
1286Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
1287 uint32_t NumThreads[3], uint32_t NumBlocks[3],
1288 KernelArgsTy &KernelArgs,
1289 KernelLaunchParamsTy LaunchParams,
1290 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
1291 CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
1292
1293 CUstream Stream;
1294 if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
1295 return Err;
1296
1297 uint32_t MaxDynCGroupMem =
1298 std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
1299
1300 void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
1301 CU_LAUNCH_PARAM_BUFFER_SIZE,
1302 reinterpret_cast<void *>(&LaunchParams.Size),
1303 CU_LAUNCH_PARAM_END};
1304
1305 // If we are running an RPC server we want to wake up the server thread
1306 // whenever there is a kernel running and let it sleep otherwise.
1307 if (GenericDevice.getRPCServer())
1308 GenericDevice.Plugin.getRPCServer().Thread->notify();
1309
1310 // In case we require more memory than the current limit.
1311 if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) {
1312 CUresult AttrResult = cuFuncSetAttribute(
1313 Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
1314 return Plugin::check(
1315 AttrResult,
1316 "Error in cuLaunchKernel while setting the memory limits: %s");
1317 MaxDynCGroupMemLimit = MaxDynCGroupMem;
1318 }
1319
1320 CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
1321 NumThreads[0], NumThreads[1], NumThreads[2],
1322 MaxDynCGroupMem, Stream, nullptr, Config);
1323
1324 // Register a callback to indicate when the kernel is complete.
1325 if (GenericDevice.getRPCServer())
1326 cuLaunchHostFunc(
1327 Stream,
1328 [](void *Data) {
1329 GenericPluginTy &Plugin = *reinterpret_cast<GenericPluginTy *>(Data);
1330 Plugin.getRPCServer().Thread->finish();
1331 },
1332 &GenericDevice.Plugin);
1333
1334 return Plugin::check(Res, "error in cuLaunchKernel for '%s': %s", getName());
1335}
1336
1337/// Class implementing the CUDA-specific functionalities of the global handler.
1338class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
1339public:
1340 /// Get the metadata of a global from the device. The name and size of the
1341 /// global is read from DeviceGlobal and the address of the global is written
1342 /// to DeviceGlobal.
1343 Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
1344 DeviceImageTy &Image,
1345 GlobalTy &DeviceGlobal) override {
1346 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
1347
1348 const char *GlobalName = DeviceGlobal.getName().data();
1349
1350 size_t CUSize;
1351 CUdeviceptr CUPtr;
1352 CUresult Res =
1353 cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
1354 if (auto Err = Plugin::check(Res, "error in cuModuleGetGlobal for '%s': %s",
1355 GlobalName))
1356 return Err;
1357
1358 if (DeviceGlobal.getSize() && CUSize != DeviceGlobal.getSize())
1359 return Plugin::error(
1360 ErrorCode::INVALID_BINARY,
1361 "failed to load global '%s' due to size mismatch (%zu != %zu)",
1362 GlobalName, CUSize, (size_t)DeviceGlobal.getSize());
1363
1364 DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
1365 DeviceGlobal.setSize(CUSize);
1366
1367 return Plugin::success();
1368 }
1369};
1370
/// Class implementing the CUDA-specific functionalities of the plugin.
struct CUDAPluginTy final : public GenericPluginTy {
  /// Create a CUDA plugin.
  CUDAPluginTy() : GenericPluginTy(getTripleArch()) {}

  /// This class should not be copied.
  CUDAPluginTy(const CUDAPluginTy &) = delete;
  CUDAPluginTy(CUDAPluginTy &&) = delete;

  /// Initialize the plugin and return the number of devices.
  ///
  /// Reports zero devices (rather than a hard error) when the CUDA library
  /// or devices are simply absent, so the runtime can fall back to the host.
  Expected<int32_t> initImpl() override {
    CUresult Res = cuInit(0);
    if (Res == CUDA_ERROR_INVALID_HANDLE) {
      // Cannot call cuGetErrorString if dlsym failed.
      DP("Failed to load CUDA shared library\n");
      return 0;
    }

    if (Res == CUDA_ERROR_NO_DEVICE) {
      // Do not initialize if there are no devices.
      DP("There are no devices supporting CUDA.\n");
      return 0;
    }

    // Any other failure of cuInit is a genuine error.
    if (auto Err = Plugin::check(Res, "error in cuInit: %s"))
      return std::move(Err);

    // Get the number of devices.
    int NumDevices;
    Res = cuDeviceGetCount(&NumDevices);
    if (auto Err = Plugin::check(Res, "error in cuDeviceGetCount: %s"))
      return std::move(Err);

    // Do not initialize if there are no devices.
    if (NumDevices == 0)
      DP("There are no devices supporting CUDA.\n");

    return NumDevices;
  }

  /// Deinitialize the plugin.
  Error deinitImpl() override { return Plugin::success(); }

  /// Creates a CUDA device to use for offloading.
  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
                                int32_t NumDevices) override {
    return new CUDADeviceTy(Plugin, DeviceId, NumDevices);
  }

  /// Creates a CUDA global handler.
  GenericGlobalHandlerTy *createGlobalHandler() override {
    return new CUDAGlobalHandlerTy();
  }

  /// Get the ELF code for recognizing the compatible image binary.
  uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }

  /// Target triple architecture handled by this plugin.
  Triple::ArchType getTripleArch() const override {
    // TODO: I think we can drop the support for 32-bit NVPTX devices.
    return Triple::nvptx64;
  }

  /// Plugin name as configured at build time.
  const char *getName() const override { return GETNAME(TARGET_NAME); }

  /// Check whether the image is compatible with a CUDA device.
  ///
  /// Decodes the image's sm_<major><minor> value from the ELF header flags
  /// and compares it against the device's compute capability.
  Expected<bool> isELFCompatible(uint32_t DeviceId,
                                 StringRef Image) const override {
    auto ElfOrErr =
        ELF64LEObjectFile::create(MemoryBufferRef(Image, /*Identifier=*/""),
                                  /*InitContent=*/false);
    if (!ElfOrErr)
      return ElfOrErr.takeError();

    // Get the numeric value for the image's `sm_` value.
    // The SM encoding in e_flags differs between CUDA ELF ABI versions; the
    // ABI version byte in e_ident selects the correct decoding.
    const auto Header = ElfOrErr->getELFFile().getHeader();
    unsigned SM =
        Header.e_ident[ELF::EI_ABIVERSION] == ELF::ELFABIVERSION_CUDA_V1
            ? Header.e_flags & ELF::EF_CUDA_SM
            : (Header.e_flags & ELF::EF_CUDA_SM_MASK) >> ELF::EF_CUDA_SM_OFFSET;

    CUdevice Device;
    CUresult Res = cuDeviceGet(&Device, DeviceId);
    if (auto Err = Plugin::check(Res, "error in cuDeviceGet: %s"))
      return std::move(Err);

    int32_t Major, Minor;
    Res = cuDeviceGetAttribute(
        &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
    if (auto Err = Plugin::check(Res, "error in cuDeviceGetAttribute: %s"))
      return std::move(Err);

    Res = cuDeviceGetAttribute(
        &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
    if (auto Err = Plugin::check(Res, "error in cuDeviceGetAttribute: %s"))
      return std::move(Err);

    int32_t ImageMajor = SM / 10;
    int32_t ImageMinor = SM % 10;

    // A cubin generated for a certain compute capability is supported to
    // run on any GPU with the same major revision and same or higher minor
    // revision.
    return Major == ImageMajor && Minor >= ImageMinor;
  }
};
1476
/// Copy \p Size bytes from \p SrcPtr on this device to \p DstPtr on
/// \p DstGenericDevice. Tries CUDA peer-to-peer copies first (establishing
/// the peer link lazily, cached in PeerAccesses) and falls back to an
/// ordinary device-to-device async memcpy.
Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
                                     GenericDeviceTy &DstGenericDevice,
                                     void *DstPtr, int64_t Size,
                                     AsyncInfoWrapperTy &AsyncInfoWrapper) {
  if (auto Err = setContext())
    return Err;

  CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);

  CUresult Res;
  int32_t DstDeviceId = DstDevice.DeviceId;
  CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
  CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;

  int CanAccessPeer = 0;
  if (DeviceId != DstDeviceId) {
    // Make sure the lock is released before performing the copies.
    // (The lock_guard's scope ends with this if-block, so only the
    // PeerAccesses state machine below runs under the mutex.)
    std::lock_guard<std::mutex> Lock(PeerAccessesLock);

    switch (PeerAccesses[DstDeviceId]) {
    case PeerAccessState::AVAILABLE:
      // Peer link already established earlier.
      CanAccessPeer = 1;
      break;
    case PeerAccessState::UNAVAILABLE:
      // A previous attempt determined P2P is not possible.
      CanAccessPeer = 0;
      break;
    case PeerAccessState::PENDING:
      // Check whether the source device can access the destination device.
      Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s"))
        return Err;

      if (CanAccessPeer) {
        Res = cuCtxEnablePeerAccess(DstDevice.Context, 0);
        if (Res == CUDA_ERROR_TOO_MANY_PEERS) {
          // Resources may be exhausted due to many P2P links.
          CanAccessPeer = 0;
          DP("Too many P2P so fall back to D2D memcpy");
        } else if (auto Err =
                       Plugin::check(Res, "error in cuCtxEnablePeerAccess: %s"))
          return Err;
      }
      // Cache the outcome so the probe only happens once per device pair.
      PeerAccesses[DstDeviceId] = (CanAccessPeer)
                                      ? PeerAccessState::AVAILABLE
                                      : PeerAccessState::UNAVAILABLE;
    }
  }

  CUstream Stream;
  if (auto Err = getStream(AsyncInfoWrapper, Stream))
    return Err;

  if (CanAccessPeer) {
    // TODO: Should we fallback to D2D if peer access fails?
    Res = cuMemcpyPeerAsync(CUDstPtr, Context, CUSrcPtr, DstDevice.Context,
                            Size, Stream);
    return Plugin::check(Res, "error in cuMemcpyPeerAsync: %s");
  }

  // Fallback to D2D copy.
  Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream);
  return Plugin::check(Res, "error in cuMemcpyDtoDAsync: %s");
}
1540
1541template <typename... ArgsTy>
1542static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
1543 CUresult ResultCode = static_cast<CUresult>(Code);
1544 if (ResultCode == CUDA_SUCCESS)
1545 return Plugin::success();
1546
1547 const char *Desc = "Unknown error";
1548 CUresult Ret = cuGetErrorString(ResultCode, &Desc);
1549 if (Ret != CUDA_SUCCESS)
1550 REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
1551
1552 // TODO: Add more entries to this switch
1553 ErrorCode OffloadErrCode;
1554 switch (ResultCode) {
1555 case CUDA_ERROR_NOT_FOUND:
1556 OffloadErrCode = ErrorCode::NOT_FOUND;
1557 break;
1558 default:
1559 OffloadErrCode = ErrorCode::UNKNOWN;
1560 }
1561
1562 // TODO: Create a map for CUDA error codes to Offload error codes
1563 return Plugin::error(OffloadErrCode, ErrFmt, Args..., Desc);
1564}
1565
1566} // namespace plugin
1567} // namespace target
1568} // namespace omp
1569} // namespace llvm
1570
extern "C" {
/// C entry point used by the offload runtime to instantiate the CUDA plugin.
/// Ownership of the returned object passes to the caller.
llvm::omp::target::plugin::GenericPluginTy *createPlugin_cuda() {
  return new llvm::omp::target::plugin::CUDAPluginTy();
}
}
1576

source code of offload/plugins-nextgen/cuda/src/rtl.cpp