1//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// RTL NextGen for CUDA machine
10//
11//===----------------------------------------------------------------------===//
12
13#include <cassert>
14#include <cstddef>
15#include <cuda.h>
16#include <string>
17#include <unordered_map>
18
19#include "Shared/APITypes.h"
20#include "Shared/Debug.h"
21#include "Shared/Environment.h"
22
23#include "GlobalHandler.h"
24#include "OpenMP/OMPT/Callback.h"
25#include "PluginInterface.h"
26#include "Utils/ELF.h"
27
28#include "llvm/BinaryFormat/ELF.h"
29#include "llvm/Frontend/OpenMP/OMPConstants.h"
30#include "llvm/Frontend/OpenMP/OMPGridValues.h"
31#include "llvm/Support/Error.h"
32#include "llvm/Support/FileOutputBuffer.h"
33#include "llvm/Support/FileSystem.h"
34#include "llvm/Support/Program.h"
35
36using namespace error;
37
38namespace llvm {
39namespace omp {
40namespace target {
41namespace plugin {
42
43/// Forward declarations for all specialized data structures.
44struct CUDAKernelTy;
45struct CUDADeviceTy;
46struct CUDAPluginTy;
47
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11000))
/// Stub definitions for all Virtual Memory Management related data structures
/// and functions. They only exist to keep the code compiling with older CUDA
/// versions; the plugin reports VA management as unsupported there, so they
/// should never be reached at runtime.
typedef void *CUmemGenericAllocationHandle;
typedef void *CUmemAllocationProp;
typedef void *CUmemAccessDesc;
typedef void *CUmemAllocationGranularity_flags;
CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
                             CUdeviceptr addr, unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
                  CUmemGenericAllocationHandle handle,
                  unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
                     const CUmemAllocationProp *prop,
                     unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
                        const CUmemAccessDesc *desc, size_t count) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult
cuMemGetAllocationGranularity(size_t *granularity,
                              const CUmemAllocationProp *prop,
                              CUmemAllocationGranularity_flags option) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif

#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
// Stub definitions of the asynchronous memory management functions for older
// CUDA versions. Returning an error here lets the regular error handling
// report the missing feature instead of falling off a non-void function.
CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) {
  *ptr = 0;
  return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif
79
80/// Class implementing the CUDA device images properties.
81struct CUDADeviceImageTy : public DeviceImageTy {
82 /// Create the CUDA image with the id and the target image pointer.
83 CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
84 const __tgt_device_image *TgtImage)
85 : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {}
86
87 /// Load the image as a CUDA module.
88 Error loadModule() {
89 assert(!Module && "Module already loaded");
90
91 CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
92 if (auto Err = Plugin::check(Res, "error in cuModuleLoadDataEx: %s"))
93 return Err;
94
95 return Plugin::success();
96 }
97
98 /// Unload the CUDA module corresponding to the image.
99 Error unloadModule() {
100 assert(Module && "Module not loaded");
101
102 CUresult Res = cuModuleUnload(Module);
103 if (auto Err = Plugin::check(Res, "error in cuModuleUnload: %s"))
104 return Err;
105
106 Module = nullptr;
107
108 return Plugin::success();
109 }
110
111 /// Getter of the CUDA module.
112 CUmodule getModule() const { return Module; }
113
114private:
115 /// The CUDA module that loaded the image.
116 CUmodule Module;
117};
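
// Illustrative sketch (not part of the plugin): the intended lifecycle of a
// CUDADeviceImageTy is to load its module once after construction and unload
// it again during device teardown. Local names below are hypothetical:
//
//   CUDADeviceImageTy Image(/*ImageId=*/0, Device, TgtImage);
//   if (Error Err = Image.loadModule())
//     return Err;                      // cuModuleLoadDataEx failed.
//   CUmodule M = Image.getModule();    // Valid until unloadModule().
//   ...
//   if (Error Err = Image.unloadModule())
//     return Err;                      // cuModuleUnload failed.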
118
119/// Class implementing the CUDA kernel functionalities which derives from the
120/// generic kernel class.
121struct CUDAKernelTy : public GenericKernelTy {
122 /// Create a CUDA kernel with a name and an execution mode.
123 CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}
124
125 /// Initialize the CUDA kernel.
126 Error initImpl(GenericDeviceTy &GenericDevice,
127 DeviceImageTy &Image) override {
128 CUresult Res;
129 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
130
131 // Retrieve the function pointer of the kernel.
132 Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
133 if (auto Err = Plugin::check(Res, "error in cuModuleGetFunction('%s'): %s",
134 getName()))
135 return Err;
136
137 // Check that the function pointer is valid.
138 if (!Func)
139 return Plugin::error(ErrorCode::INVALID_BINARY,
140 "invalid function for kernel %s", getName());
141
142 int MaxThreads;
143 Res = cuFuncGetAttribute(&MaxThreads,
144 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
145 if (auto Err = Plugin::check(Res, "error in cuFuncGetAttribute: %s"))
146 return Err;
147
148 // The maximum number of threads cannot exceed the maximum of the kernel.
149 MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
150
151 return Plugin::success();
152 }
153
154 /// Launch the CUDA kernel function.
155 Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
156 uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
157 KernelLaunchParamsTy LaunchParams,
158 AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
159
160private:
161 /// The CUDA kernel function to execute.
162 CUfunction Func;
163};
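
// Illustrative sketch (not part of the plugin): a CUDAKernelTy is initialized
// against a loaded image, which resolves its CUfunction and clamps the thread
// limit, and is then launched through the generic kernel interface. Local
// names below are hypothetical:
//
//   CUDAKernelTy Kernel("__omp_offloading_main");
//   if (Error Err = Kernel.init(Device, Image))   // Calls initImpl().
//     return Err;
//   // launchImpl() is reached via the generic launch path, which packs the
//   // kernel arguments into the KernelLaunchParamsTy it passes along.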
164
165/// Class wrapping a CUDA stream reference. These are the objects handled by the
166/// Stream Manager for the CUDA plugin.
167struct CUDAStreamRef final : public GenericDeviceResourceRef {
168 /// The underlying handle type for streams.
169 using HandleTy = CUstream;
170
171 /// Create an empty reference to an invalid stream.
172 CUDAStreamRef() : Stream(nullptr) {}
173
174 /// Create a reference to an existing stream.
175 CUDAStreamRef(HandleTy Stream) : Stream(Stream) {}
176
  /// Create a new stream and save the reference. The reference must be empty
  /// before calling this function.
179 Error create(GenericDeviceTy &Device) override {
180 if (Stream)
181 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
182 "creating an existing stream");
183
184 CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
185 if (auto Err = Plugin::check(Res, "error in cuStreamCreate: %s"))
186 return Err;
187
188 return Plugin::success();
189 }
190
  /// Destroy the referenced stream and invalidate the reference. The reference
  /// must refer to a valid stream before calling this function.
193 Error destroy(GenericDeviceTy &Device) override {
194 if (!Stream)
195 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
196 "destroying an invalid stream");
197
198 CUresult Res = cuStreamDestroy(Stream);
199 if (auto Err = Plugin::check(Res, "error in cuStreamDestroy: %s"))
200 return Err;
201
202 Stream = nullptr;
203 return Plugin::success();
204 }
205
206 /// Get the underlying CUDA stream.
207 operator HandleTy() const { return Stream; }
208
209private:
210 /// The reference to the CUDA stream.
211 HandleTy Stream;
212};
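
// Illustrative sketch (not part of the plugin): resource references follow the
// create()/destroy() protocol driven by GenericDeviceResourceManagerTy (see
// CUDAStreamManagerTy below). Used directly, the pattern would be:
//
//   CUDAStreamRef Ref;                   // Empty reference.
//   if (Error Err = Ref.create(Device))  // cuStreamCreate, non-blocking.
//     return Err;
//   CUstream Stream = Ref;               // Implicit conversion to the handle.
//   if (Error Err = Ref.destroy(Device)) // cuStreamDestroy.
//     return Err;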
213
214/// Class wrapping a CUDA event reference. These are the objects handled by the
215/// Event Manager for the CUDA plugin.
216struct CUDAEventRef final : public GenericDeviceResourceRef {
217 /// The underlying handle type for events.
218 using HandleTy = CUevent;
219
220 /// Create an empty reference to an invalid event.
221 CUDAEventRef() : Event(nullptr) {}
222
223 /// Create a reference to an existing event.
224 CUDAEventRef(HandleTy Event) : Event(Event) {}
225
  /// Create a new event and save the reference. The reference must be empty
  /// before calling this function.
228 Error create(GenericDeviceTy &Device) override {
229 if (Event)
230 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
231 "creating an existing event");
232
233 CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT);
234 if (auto Err = Plugin::check(Res, "error in cuEventCreate: %s"))
235 return Err;
236
237 return Plugin::success();
238 }
239
  /// Destroy the referenced event and invalidate the reference. The reference
  /// must refer to a valid event before calling this function.
242 Error destroy(GenericDeviceTy &Device) override {
243 if (!Event)
244 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
245 "destroying an invalid event");
246
247 CUresult Res = cuEventDestroy(Event);
248 if (auto Err = Plugin::check(Res, "error in cuEventDestroy: %s"))
249 return Err;
250
251 Event = nullptr;
252 return Plugin::success();
253 }
254
255 /// Get the underlying CUevent.
256 operator HandleTy() const { return Event; }
257
258private:
259 /// The reference to the CUDA event.
260 HandleTy Event;
261};
262
263/// Class implementing the CUDA device functionalities which derives from the
264/// generic device class.
265struct CUDADeviceTy : public GenericDeviceTy {
266 // Create a CUDA device with a device id and the default CUDA grid values.
267 CUDADeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices)
268 : GenericDeviceTy(Plugin, DeviceId, NumDevices, NVPTXGridValues),
269 CUDAStreamManager(*this), CUDAEventManager(*this) {}
270
271 ~CUDADeviceTy() {}
272
273 /// Initialize the device, its resources and get its properties.
274 Error initImpl(GenericPluginTy &Plugin) override {
275 CUresult Res = cuDeviceGet(&Device, DeviceId);
276 if (auto Err = Plugin::check(Res, "error in cuDeviceGet: %s"))
277 return Err;
278
279 // Query the current flags of the primary context and set its flags if
280 // it is inactive.
281 unsigned int FormerPrimaryCtxFlags = 0;
282 int FormerPrimaryCtxIsActive = 0;
283 Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
284 &FormerPrimaryCtxIsActive);
285 if (auto Err =
286 Plugin::check(Res, "error in cuDevicePrimaryCtxGetState: %s"))
287 return Err;
288
289 if (FormerPrimaryCtxIsActive) {
290 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
291 "The primary context is active, no change to its flags\n");
292 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
293 CU_CTX_SCHED_BLOCKING_SYNC)
294 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
295 "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
296 } else {
297 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
298 "The primary context is inactive, set its flags to "
299 "CU_CTX_SCHED_BLOCKING_SYNC\n");
300 Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
301 if (auto Err =
302 Plugin::check(Res, "error in cuDevicePrimaryCtxSetFlags: %s"))
303 return Err;
304 }
305
306 // Retain the per device primary context and save it to use whenever this
307 // device is selected.
308 Res = cuDevicePrimaryCtxRetain(&Context, Device);
309 if (auto Err = Plugin::check(Res, "error in cuDevicePrimaryCtxRetain: %s"))
310 return Err;
311
312 if (auto Err = setContext())
313 return Err;
314
315 // Initialize stream pool.
316 if (auto Err = CUDAStreamManager.init(OMPX_InitialNumStreams))
317 return Err;
318
319 // Initialize event pool.
320 if (auto Err = CUDAEventManager.init(OMPX_InitialNumEvents))
321 return Err;
322
323 // Query attributes to determine number of threads/block and blocks/grid.
324 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
325 GridValues.GV_Max_Teams))
326 return Err;
327
328 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
329 GridValues.GV_Max_WG_Size))
330 return Err;
331
332 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE,
333 GridValues.GV_Warp_Size))
334 return Err;
335
336 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
337 ComputeCapability.Major))
338 return Err;
339
340 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
341 ComputeCapability.Minor))
342 return Err;
343
    uint32_t NumMultiprocessors = 0;
    uint32_t MaxThreadsPerSM = 0;
    uint32_t WarpSize = 0;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                 NumMultiprocessors))
      return Err;
    if (auto Err =
            getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                          MaxThreadsPerSM))
      return Err;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
      return Err;
    HardwareParallelism = NumMultiprocessors * (MaxThreadsPerSM / WarpSize);
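    // For example (illustrative figures, not queried from a real device): a
    // GPU with 108 SMs, 2048 max threads per SM, and a warp size of 32 yields
    // 108 * (2048 / 32) = 6912 resident warps of hardware parallelism.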
357
358 return Plugin::success();
359 }
360
361 /// Deinitialize the device and release its resources.
362 Error deinitImpl() override {
363 if (Context) {
364 if (auto Err = setContext())
365 return Err;
366 }
367
368 // Deinitialize the stream manager.
369 if (auto Err = CUDAStreamManager.deinit())
370 return Err;
371
372 if (auto Err = CUDAEventManager.deinit())
373 return Err;
374
375 // Close modules if necessary.
376 if (!LoadedImages.empty()) {
377 assert(Context && "Invalid CUDA context");
378
379 // Each image has its own module.
380 for (DeviceImageTy *Image : LoadedImages) {
381 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);
382
383 // Unload the module of the image.
384 if (auto Err = CUDAImage.unloadModule())
385 return Err;
386 }
387 }
388
389 if (Context) {
390 CUresult Res = cuDevicePrimaryCtxRelease(Device);
391 if (auto Err =
392 Plugin::check(Res, "error in cuDevicePrimaryCtxRelease: %s"))
393 return Err;
394 }
395
396 // Invalidate context and device references.
397 Context = nullptr;
398 Device = CU_DEVICE_INVALID;
399
400 return Plugin::success();
401 }
402
403 virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
404 DeviceImageTy &Image) override {
405 // Check for the presence of global destructors at initialization time. This
406 // is required when the image may be deallocated before destructors are run.
407 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
408 if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
409 Image.setPendingGlobalDtors();
410
411 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
412 }
413
414 virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
415 DeviceImageTy &Image) override {
416 if (Image.hasPendingGlobalDtors())
417 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
418 return Plugin::success();
419 }
420
421 Expected<std::unique_ptr<MemoryBuffer>>
422 doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
423 // TODO: We should be able to use the 'nvidia-ptxjitcompiler' interface to
424 // avoid the call to 'ptxas'.
425 SmallString<128> PTXInputFilePath;
426 std::error_code EC = sys::fs::createTemporaryFile("nvptx-pre-link-jit", "s",
427 PTXInputFilePath);
428 if (EC)
429 return Plugin::error(ErrorCode::HOST_IO,
430 "failed to create temporary file for ptxas");
431
432 // Write the file's contents to the output file.
433 Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
434 FileOutputBuffer::create(PTXInputFilePath, MB->getBuffer().size());
435 if (!OutputOrErr)
436 return OutputOrErr.takeError();
437 std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
438 llvm::copy(MB->getBuffer(), Output->getBufferStart());
439 if (Error E = Output->commit())
440 return std::move(E);
441
442 SmallString<128> PTXOutputFilePath;
443 EC = sys::fs::createTemporaryFile("nvptx-post-link-jit", "cubin",
444 PTXOutputFilePath);
445 if (EC)
446 return Plugin::error(ErrorCode::HOST_IO,
447 "failed to create temporary file for ptxas");
448
449 // Try to find `ptxas` in the path to compile the PTX to a binary.
450 const auto ErrorOrPath = sys::findProgramByName("ptxas");
451 if (!ErrorOrPath)
452 return Plugin::error(ErrorCode::HOST_TOOL_NOT_FOUND,
453 "failed to find 'ptxas' on the PATH.");
454
455 std::string Arch = getComputeUnitKind();
456 StringRef Args[] = {*ErrorOrPath,
457 "-m64",
458 "-O2",
459 "--gpu-name",
460 Arch,
461 "--output-file",
462 PTXOutputFilePath,
463 PTXInputFilePath};
464
465 std::string ErrMsg;
466 if (sys::ExecuteAndWait(*ErrorOrPath, Args, std::nullopt, {}, 0, 0,
467 &ErrMsg))
468 return Plugin::error(ErrorCode::ASSEMBLE_FAILURE,
469 "running 'ptxas' failed: %s\n", ErrMsg.c_str());
470
471 auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(PTXOutputFilePath.data());
472 if (!BufferOrErr)
473 return Plugin::error(ErrorCode::HOST_IO,
474 "failed to open temporary file for ptxas");
475
476 // Clean up the temporary files afterwards.
477 if (sys::fs::remove(PTXOutputFilePath))
478 return Plugin::error(ErrorCode::HOST_IO,
479 "failed to remove temporary file for ptxas");
480 if (sys::fs::remove(PTXInputFilePath))
481 return Plugin::error(ErrorCode::HOST_IO,
482 "failed to remove temporary file for ptxas");
483
484 return std::move(*BufferOrErr);
485 }
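
  // Illustrative sketch (not part of the plugin): the ExecuteAndWait call
  // above is equivalent to invoking ptxas by hand on the temporary files,
  // e.g. (the file names and the sm_70 architecture are hypothetical):
  //
  //   ptxas -m64 -O2 --gpu-name sm_70 \
  //         --output-file nvptx-post-link-jit-XXXXXX.cubin \
  //         nvptx-pre-link-jit-XXXXXX.s
  //
  // The resulting cubin is read back into a MemoryBuffer and later consumed
  // by cuModuleLoadDataEx through the regular image loading path.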
486
487 /// Allocate and construct a CUDA kernel.
488 Expected<GenericKernelTy &> constructKernel(const char *Name) override {
489 // Allocate and construct the CUDA kernel.
490 CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
491 if (!CUDAKernel)
492 return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
493 "failed to allocate memory for CUDA kernel");
494
495 new (CUDAKernel) CUDAKernelTy(Name);
496
497 return *CUDAKernel;
498 }
499
500 /// Set the current context to this device's context.
501 Error setContext() override {
502 CUresult Res = cuCtxSetCurrent(Context);
503 return Plugin::check(Res, "error in cuCtxSetCurrent: %s");
504 }
505
506 /// NVIDIA returns the product of the SM count and the number of warps that
507 /// fit if the maximum number of threads were scheduled on each SM.
508 uint64_t getHardwareParallelism() const override {
509 return HardwareParallelism;
510 }
511
512 /// We want to set up the RPC server for host services to the GPU if it is
513 /// available.
514 bool shouldSetupRPCServer() const override { return true; }
515
516 /// The RPC interface should have enough space for all available parallelism.
517 uint64_t requestedRPCPortCount() const override {
518 return getHardwareParallelism();
519 }
520
521 /// Get the stream of the asynchronous info structure or get a new one.
522 Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
523 // Get the stream (if any) from the async info.
524 Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
525 if (!Stream) {
526 // There was no stream; get an idle one.
527 if (auto Err = CUDAStreamManager.getResource(Stream))
528 return Err;
529
530 // Modify the async info's stream.
531 AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
532 }
533 return Plugin::success();
534 }
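
  // Illustrative sketch (not part of the plugin): device operations obtain
  // their stream lazily through getStream() so that all work attached to one
  // AsyncInfoWrapperTy is ordered on the same stream. A typical caller mirrors
  // dataSubmitImpl() below:
  //
  //   CUstream Stream;
  //   if (auto Err = getStream(AsyncInfoWrapper, Stream))
  //     return Err;
  //   CUresult Res = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size,
  //                                    Stream);
  //   return Plugin::check(Res, "error in cuMemcpyHtoDAsync: %s");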
535
536 /// Getters of CUDA references.
537 CUcontext getCUDAContext() const { return Context; }
538 CUdevice getCUDADevice() const { return Device; }
539
540 /// Load the binary image into the device and allocate an image object.
541 Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
542 int32_t ImageId) override {
543 if (auto Err = setContext())
544 return std::move(Err);
545
546 // Allocate and initialize the image object.
547 CUDADeviceImageTy *CUDAImage = Plugin.allocate<CUDADeviceImageTy>();
548 new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage);
549
550 // Load the CUDA module.
551 if (auto Err = CUDAImage->loadModule())
552 return std::move(Err);
553
554 return CUDAImage;
555 }
556
557 /// Allocate memory on the device or related to the device.
558 void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
559 if (Size == 0)
560 return nullptr;
561
562 if (auto Err = setContext()) {
563 REPORT("Failure to alloc memory: %s\n", toString(E: std::move(Err)).data());
564 return nullptr;
565 }
566
567 void *MemAlloc = nullptr;
568 CUdeviceptr DevicePtr;
569 CUresult Res;
570
571 switch (Kind) {
572 case TARGET_ALLOC_DEFAULT:
573 case TARGET_ALLOC_DEVICE:
574 Res = cuMemAlloc(&DevicePtr, Size);
575 MemAlloc = (void *)DevicePtr;
576 break;
577 case TARGET_ALLOC_HOST:
578 Res = cuMemAllocHost(&MemAlloc, Size);
579 break;
580 case TARGET_ALLOC_SHARED:
581 Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
582 MemAlloc = (void *)DevicePtr;
583 break;
584 case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
585 CUstream Stream;
586 if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
587 break;
588 if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
589 break;
590 cuStreamSynchronize(Stream);
591 Res = cuStreamDestroy(Stream);
592 MemAlloc = (void *)DevicePtr;
593 }
594 }
595
596 if (auto Err =
597 Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) {
598 REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
599 return nullptr;
600 }
601 return MemAlloc;
602 }
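
  // At a glance, the allocation kinds above map onto the driver as follows:
  //   TARGET_ALLOC_DEFAULT / TARGET_ALLOC_DEVICE -> cuMemAlloc
  //   TARGET_ALLOC_HOST                          -> cuMemAllocHost
  //   TARGET_ALLOC_SHARED                        -> cuMemAllocManaged
  //   TARGET_ALLOC_DEVICE_NON_BLOCKING           -> cuMemAllocAsync on a
  //                                                 temporary stream
  // free() below releases each of them with the matching cuMemFree* call.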
603
604 /// Deallocate memory on the device or related to the device.
605 int free(void *TgtPtr, TargetAllocTy Kind) override {
606 if (TgtPtr == nullptr)
607 return OFFLOAD_SUCCESS;
608
609 if (auto Err = setContext()) {
610 REPORT("Failure to free memory: %s\n", toString(E: std::move(Err)).data());
611 return OFFLOAD_FAIL;
612 }
613
614 CUresult Res;
615 switch (Kind) {
616 case TARGET_ALLOC_DEFAULT:
617 case TARGET_ALLOC_DEVICE:
618 case TARGET_ALLOC_SHARED:
619 Res = cuMemFree((CUdeviceptr)TgtPtr);
620 break;
621 case TARGET_ALLOC_HOST:
622 Res = cuMemFreeHost(TgtPtr);
623 break;
624 case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
625 CUstream Stream;
626 if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
627 break;
628 cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
629 cuStreamSynchronize(Stream);
630 if ((Res = cuStreamDestroy(Stream)))
631 break;
632 }
633 }
634
635 if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) {
636 REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
637 return OFFLOAD_FAIL;
638 }
639 return OFFLOAD_SUCCESS;
640 }
641
642 /// Synchronize current thread with the pending operations on the async info.
643 Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
644 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
645 CUresult Res;
646 Res = cuStreamSynchronize(Stream);
647
648 // Once the stream is synchronized, return it to stream pool and reset
649 // AsyncInfo. This is to make sure the synchronization only works for its
650 // own tasks.
651 AsyncInfo.Queue = nullptr;
652 if (auto Err = CUDAStreamManager.returnResource(Stream))
653 return Err;
654
655 return Plugin::check(Res, "error in cuStreamSynchronize: %s");
656 }
657
  /// Whether CUDA supports VA management; only available from CUDA 11.0 on.
659 bool supportVAManagement() const override {
660#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11000))
661 return true;
662#else
663 return false;
664#endif
665 }
666
  /// Allocates \p RSize bytes (rounded up to the page size) and hints the CUDA
  /// driver to map it at \p VAddr. The obtained address is stored in \p Addr.
  /// On return, \p RSize contains the actual (rounded-up) size of the mapping.
670 Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
671 CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
672 auto IHandle = DeviceMMaps.find(DVAddr);
673 size_t Size = *RSize;
674
    if (Size == 0)
      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                           "memory map size must be larger than 0");
678
679 // Check if we have already mapped this address
680 if (IHandle != DeviceMMaps.end())
681 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
682 "address already memory mapped");
683
684 CUmemAllocationProp Prop = {};
685 size_t Granularity = 0;
686
687 size_t Free, Total;
688 CUresult Res = cuMemGetInfo(&Free, &Total);
689 if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
690 return Err;
691
692 if (Size >= Free) {
693 *Addr = nullptr;
694 return Plugin::error(
695 ErrorCode::OUT_OF_RESOURCES,
696 "cannot map memory size larger than the available device memory");
697 }
698
    // Currently, NVIDIA only supports pinned device allocations.
    Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;

    Prop.location.id = DeviceId;
    Res = cuMemGetAllocationGranularity(&Granularity, &Prop,
                                        CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (auto Err =
            Plugin::check(Res, "error in cuMemGetAllocationGranularity: %s"))
      return Err;

    if (Granularity == 0)
      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                           "invalid device page size (granularity is zero)");
713
714 // Ceil to page size.
715 Size = utils::roundUp(Size, Granularity);
716
717 // Create a handler of our allocation
718 CUmemGenericAllocationHandle AHandle;
719 Res = cuMemCreate(&AHandle, Size, &Prop, 0);
720 if (auto Err = Plugin::check(Res, "error in cuMemCreate: %s"))
721 return Err;
722
723 CUdeviceptr DevPtr = 0;
724 Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
725 if (auto Err = Plugin::check(Res, "error in cuMemAddressReserve: %s"))
726 return Err;
727
728 Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
729 if (auto Err = Plugin::check(Res, "error in cuMemMap: %s"))
730 return Err;
731
732 CUmemAccessDesc ADesc = {};
733 ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
734 ADesc.location.id = DeviceId;
735 ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
736
    // Grant the device read/write access to the mapped range.
738 Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
739 if (auto Err = Plugin::check(Res, "error in cuMemSetAccess: %s"))
740 return Err;
741
742 *Addr = reinterpret_cast<void *>(DevPtr);
743 *RSize = Size;
744 DeviceMMaps.insert({DevPtr, AHandle});
745 return Plugin::success();
746 }
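
  // Illustrative sketch (not part of the plugin): the driver call sequence
  // performed above for a successful mapping is, in order:
  //
  //   cuMemGetAllocationGranularity(...);    // Page size to round Size up to.
  //   cuMemCreate(&AHandle, Size, &Prop, 0); // Physical allocation.
  //   cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0); // Reserve the VA.
  //   cuMemMap(DevPtr, Size, 0, AHandle, 0);   // Bind physical to virtual.
  //   cuMemSetAccess(DevPtr, Size, &ADesc, 1); // Grant device access.
  //
  // memoryVAUnMap() below reverses the sequence with cuMemUnmap, cuMemRelease
  // and cuMemAddressFree.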
747
  /// Deallocates the device memory and unmaps the virtual address.
749 Error memoryVAUnMap(void *VAddr, size_t Size) override {
750 CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
751 auto IHandle = DeviceMMaps.find(DVAddr);
    // Mapping does not exist.
    if (IHandle == DeviceMMaps.end())
      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                           "addr is not MemoryMapped");
761
762 CUmemGenericAllocationHandle &AllocHandle = IHandle->second;
763
764 CUresult Res = cuMemUnmap(DVAddr, Size);
765 if (auto Err = Plugin::check(Res, "error in cuMemUnmap: %s"))
766 return Err;
767
768 Res = cuMemRelease(AllocHandle);
769 if (auto Err = Plugin::check(Res, "error in cuMemRelease: %s"))
770 return Err;
771
772 Res = cuMemAddressFree(DVAddr, Size);
773 if (auto Err = Plugin::check(Res, "error in cuMemAddressFree: %s"))
774 return Err;
775
776 DeviceMMaps.erase(IHandle);
777 return Plugin::success();
778 }
779
780 /// Query for the completion of the pending operations on the async info.
781 Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
782 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
783 CUresult Res = cuStreamQuery(Stream);
784
785 // Not ready streams must be considered as successful operations.
786 if (Res == CUDA_ERROR_NOT_READY)
787 return Plugin::success();
788
789 // Once the stream is synchronized and the operations completed (or an error
790 // occurs), return it to stream pool and reset AsyncInfo. This is to make
791 // sure the synchronization only works for its own tasks.
792 AsyncInfo.Queue = nullptr;
793 if (auto Err = CUDAStreamManager.returnResource(Stream))
794 return Err;
795
796 return Plugin::check(Res, "error in cuStreamQuery: %s");
797 }
798
799 Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
800 // TODO: Register the buffer as CUDA host memory.
801 return HstPtr;
802 }
803
804 Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
805
806 Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
807 void *&BaseDevAccessiblePtr,
808 size_t &BaseSize) const override {
809 // TODO: Implement pinning feature for CUDA.
810 return false;
811 }
812
813 /// Submit data to the device (host to device transfer).
814 Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
815 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
816 if (auto Err = setContext())
817 return Err;
818
819 CUstream Stream;
820 if (auto Err = getStream(AsyncInfoWrapper, Stream))
821 return Err;
822
823 CUresult Res = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
824 return Plugin::check(Res, "error in cuMemcpyHtoDAsync: %s");
825 }
826
827 /// Retrieve data from the device (device to host transfer).
828 Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
829 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
830 if (auto Err = setContext())
831 return Err;
832
833 CUstream Stream;
834 if (auto Err = getStream(AsyncInfoWrapper, Stream))
835 return Err;
836
837 CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
838 return Plugin::check(Res, "error in cuMemcpyDtoHAsync: %s");
839 }
840
841 /// Exchange data between two devices directly. We may use peer access if
842 /// the CUDA devices and driver allow them.
843 Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
844 void *DstPtr, int64_t Size,
845 AsyncInfoWrapperTy &AsyncInfoWrapper) override;
846
847 /// Initialize the async info for interoperability purposes.
848 Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
849 if (auto Err = setContext())
850 return Err;
851
852 CUstream Stream;
853 if (auto Err = getStream(AsyncInfoWrapper, Stream))
854 return Err;
855
856 return Plugin::success();
857 }
858
859 /// Initialize the device info for interoperability purposes.
860 Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
861 assert(Context && "Context is null");
862 assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");
863
864 if (auto Err = setContext())
865 return Err;
866
867 if (!DeviceInfo->Context)
868 DeviceInfo->Context = Context;
869
870 if (!DeviceInfo->Device)
871 DeviceInfo->Device = reinterpret_cast<void *>(Device);
872
873 return Plugin::success();
874 }
875
876 /// Create an event.
877 Error createEventImpl(void **EventPtrStorage) override {
878 CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
879 return CUDAEventManager.getResource(*Event);
880 }
881
882 /// Destroy a previously created event.
883 Error destroyEventImpl(void *EventPtr) override {
884 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
885 return CUDAEventManager.returnResource(Event);
886 }
887
888 /// Record the event.
889 Error recordEventImpl(void *EventPtr,
890 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
891 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
892
893 CUstream Stream;
894 if (auto Err = getStream(AsyncInfoWrapper, Stream))
895 return Err;
896
897 CUresult Res = cuEventRecord(Event, Stream);
898 return Plugin::check(Res, "error in cuEventRecord: %s");
899 }
900
901 /// Make the stream wait on the event.
902 Error waitEventImpl(void *EventPtr,
903 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
904 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
905
906 CUstream Stream;
907 if (auto Err = getStream(AsyncInfoWrapper, Stream))
908 return Err;
909
    // Do not use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version onwards and is defined as 0x0. In previous
    // versions, per the CUDA API documentation, that argument has to be 0x0.
913 CUresult Res = cuStreamWaitEvent(Stream, Event, 0);
914 return Plugin::check(Res, "error in cuStreamWaitEvent: %s");
915 }
916
917 /// Synchronize the current thread with the event.
918 Error syncEventImpl(void *EventPtr) override {
919 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
920 CUresult Res = cuEventSynchronize(Event);
921 return Plugin::check(Res, "error in cuEventSynchronize: %s");
922 }
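
  // Illustrative sketch (not part of the plugin): together, the event hooks
  // above implement the usual CUDA cross-stream ordering pattern, e.g. with
  // hypothetical producer/consumer streams:
  //
  //   cuEventRecord(Event, ProducerStream);        // recordEventImpl()
  //   cuStreamWaitEvent(ConsumerStream, Event, 0); // waitEventImpl()
  //   cuEventSynchronize(Event);                   // syncEventImpl(), host
  //
  // The event handles themselves come from the CUDAEventManager pool instead
  // of being created and destroyed ad hoc.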
923
924 /// Print information about the device.
925 Error obtainInfoImpl(InfoQueueTy &Info) override {
926 char TmpChar[1000];
927 const char *TmpCharPtr;
928 size_t TmpSt;
929 int TmpInt;
930
931 CUresult Res = cuDriverGetVersion(&TmpInt);
932 if (Res == CUDA_SUCCESS)
933 Info.add("CUDA Driver Version", TmpInt);
934
935 Info.add("CUDA OpenMP Device Number", DeviceId);
936
937 Res = cuDeviceGetName(TmpChar, 1000, Device);
938 if (Res == CUDA_SUCCESS)
939 Info.add("Device Name", TmpChar);
940
941 Res = cuDeviceTotalMem(&TmpSt, Device);
942 if (Res == CUDA_SUCCESS)
943 Info.add("Global Memory Size", TmpSt, "bytes");
944
945 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
946 if (Res == CUDA_SUCCESS)
947 Info.add("Number of Multiprocessors", TmpInt);
948
949 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
950 if (Res == CUDA_SUCCESS)
951 Info.add("Concurrent Copy and Execution", (bool)TmpInt);
952
953 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
954 if (Res == CUDA_SUCCESS)
955 Info.add("Total Constant Memory", TmpInt, "bytes");
956
957 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
958 TmpInt);
959 if (Res == CUDA_SUCCESS)
960 Info.add("Max Shared Memory per Block", TmpInt, "bytes");
961
962 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
963 if (Res == CUDA_SUCCESS)
964 Info.add("Registers per Block", TmpInt);
965
966 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
967 if (Res == CUDA_SUCCESS)
968 Info.add("Warp Size", TmpInt);
969
970 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
971 if (Res == CUDA_SUCCESS)
972 Info.add("Maximum Threads per Block", TmpInt);
973
974 Info.add("Maximum Block Dimensions", "");
975 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
976 if (Res == CUDA_SUCCESS)
977 Info.add<InfoLevel2>("x", TmpInt);
978 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
979 if (Res == CUDA_SUCCESS)
980 Info.add<InfoLevel2>("y", TmpInt);
981 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
982 if (Res == CUDA_SUCCESS)
983 Info.add<InfoLevel2>("z", TmpInt);
984
985 Info.add("Maximum Grid Dimensions", "");
986 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
987 if (Res == CUDA_SUCCESS)
988 Info.add<InfoLevel2>("x", TmpInt);
989 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
990 if (Res == CUDA_SUCCESS)
991 Info.add<InfoLevel2>("y", TmpInt);
992 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
993 if (Res == CUDA_SUCCESS)
994 Info.add<InfoLevel2>("z", TmpInt);
995
996 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
997 if (Res == CUDA_SUCCESS)
998 Info.add("Maximum Memory Pitch", TmpInt, "bytes");
999
1000 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
1001 if (Res == CUDA_SUCCESS)
1002 Info.add("Texture Alignment", TmpInt, "bytes");
1003
1004 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
1005 if (Res == CUDA_SUCCESS)
1006 Info.add("Clock Rate", TmpInt, "kHz");
1007
1008 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
1009 if (Res == CUDA_SUCCESS)
1010 Info.add("Execution Timeout", (bool)TmpInt);
1011
1012 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
1013 if (Res == CUDA_SUCCESS)
1014 Info.add("Integrated Device", (bool)TmpInt);
1015
1016 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
1017 if (Res == CUDA_SUCCESS)
1018 Info.add("Can Map Host Memory", (bool)TmpInt);
1019
1020 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
1021 if (Res == CUDA_SUCCESS) {
1022 if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1023 TmpCharPtr = "Default";
1024 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1025 TmpCharPtr = "Prohibited";
1026 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1027 TmpCharPtr = "Exclusive process";
1028 else
1029 TmpCharPtr = "Unknown";
1030 Info.add("Compute Mode", TmpCharPtr);
1031 }
1032
1033 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
1034 if (Res == CUDA_SUCCESS)
1035 Info.add("Concurrent Kernels", (bool)TmpInt);
1036
1037 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
1038 if (Res == CUDA_SUCCESS)
1039 Info.add("ECC Enabled", (bool)TmpInt);
1040
1041 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
1042 if (Res == CUDA_SUCCESS)
1043 Info.add("Memory Clock Rate", TmpInt, "kHz");
1044
1045 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
1046 if (Res == CUDA_SUCCESS)
1047 Info.add("Memory Bus Width", TmpInt, "bits");
1048
1049 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
1050 if (Res == CUDA_SUCCESS)
1051 Info.add("L2 Cache Size", TmpInt, "bytes");
1052
1053 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1054 TmpInt);
1055 if (Res == CUDA_SUCCESS)
1056 Info.add("Max Threads Per SMP", TmpInt);
1057
1058 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
1059 if (Res == CUDA_SUCCESS)
1060 Info.add("Async Engines", TmpInt);
1061
1062 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
1063 if (Res == CUDA_SUCCESS)
1064 Info.add("Unified Addressing", (bool)TmpInt);
1065
1066 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
1067 if (Res == CUDA_SUCCESS)
1068 Info.add("Managed Memory", (bool)TmpInt);
1069
1070 Res =
1071 getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
1072 if (Res == CUDA_SUCCESS)
1073 Info.add("Concurrent Managed Memory", (bool)TmpInt);
1074
1075 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
1076 TmpInt);
1077 if (Res == CUDA_SUCCESS)
1078 Info.add("Preemption Supported", (bool)TmpInt);
1079
1080 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
1081 if (Res == CUDA_SUCCESS)
1082 Info.add("Cooperative Launch", (bool)TmpInt);
1083
1084 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
1085 if (Res == CUDA_SUCCESS)
1086 Info.add("Multi-Device Boars", (bool)TmpInt);
1087
1088 Info.add("Compute Capabilities", ComputeCapability.str());
1089
1090 return Plugin::success();
1091 }
1092
1093 virtual bool shouldSetupDeviceMemoryPool() const override {
1094 /// We use the CUDA malloc for now.
1095 return false;
1096 }
1097
1098 /// Getters and setters for stack and heap sizes.
1099 Error getDeviceStackSize(uint64_t &Value) override {
1100 return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
1101 }
1102 Error setDeviceStackSize(uint64_t Value) override {
1103 return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
1104 }
1105 Error getDeviceHeapSize(uint64_t &Value) override {
1106 return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
1107 }
1108 Error setDeviceHeapSize(uint64_t Value) override {
1109 return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
1110 }
1111 Error getDeviceMemorySize(uint64_t &Value) override {
1112 CUresult Res = cuDeviceTotalMem(&Value, Device);
1113 return Plugin::check(Res, "error in getDeviceMemorySize %s");
1114 }
1115
1116 /// CUDA-specific functions for getting and setting context limits.
1117 Error setCtxLimit(CUlimit Kind, uint64_t Value) {
1118 CUresult Res = cuCtxSetLimit(Kind, Value);
1119 return Plugin::check(Res, "error in cuCtxSetLimit: %s");
1120 }
1121 Error getCtxLimit(CUlimit Kind, uint64_t &Value) {
1122 CUresult Res = cuCtxGetLimit(&Value, Kind);
1123 return Plugin::check(Res, "error in cuCtxGetLimit: %s");
1124 }
1125
1126 /// CUDA-specific function to get device attributes.
1127 Error getDeviceAttr(uint32_t Kind, uint32_t &Value) {
1128 // TODO: Warn if the new value is larger than the old.
1129 CUresult Res =
1130 cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
1131 return Plugin::check(Res, "error in cuDeviceGetAttribute: %s");
1132 }
1133
1134 CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
1135 return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
1136 }
1137
1138 /// See GenericDeviceTy::getComputeUnitKind().
1139 std::string getComputeUnitKind() const override {
1140 return ComputeCapability.str();
1141 }
1142
1143 /// Returns the clock frequency for the given NVPTX device.
1144 uint64_t getClockFrequency() const override { return 1000000000; }
1145
1146private:
1147 using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
1148 using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
1149
1150 Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
1151 bool IsCtor) {
1152 const char *KernelName = IsCtor ? "nvptx$device$init" : "nvptx$device$fini";
1153 // Perform a quick check for the named kernel in the image. The kernel
1154 // should be created by the 'nvptx-lower-ctor-dtor' pass.
1155 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
1156 if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
1157 return Plugin::success();
1158
1159 // The Nvidia backend cannot handle creating the ctor / dtor array
1160 // automatically so we must create it ourselves. The backend will emit
1161 // several globals that contain function pointers we can call. These are
1162 // prefixed with a known name due to Nvidia's lack of section support.
1163 auto ELFObjOrErr = Handler.getELFObjectFile(Image);
1164 if (!ELFObjOrErr)
1165 return ELFObjOrErr.takeError();
1166
1167 // Search for all symbols that contain a constructor or destructor.
1168 SmallVector<std::pair<StringRef, uint16_t>> Funcs;
1169 for (ELFSymbolRef Sym : (*ELFObjOrErr)->symbols()) {
1170 auto NameOrErr = Sym.getName();
1171 if (!NameOrErr)
1172 return NameOrErr.takeError();
1173
1174 if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
1175 : "__fini_array_object_"))
1176 continue;
1177
1178 uint16_t Priority;
1179 if (NameOrErr->rsplit('_').second.getAsInteger(10, Priority))
1180 return Plugin::error(ErrorCode::INVALID_BINARY,
1181 "invalid priority for constructor or destructor");
1182
1183 Funcs.emplace_back(*NameOrErr, Priority);
1184 }
1185
1186 // Sort the created array to be in priority order.
1187 llvm::sort(Funcs, [=](auto X, auto Y) { return X.second < Y.second; });
1188
1189 // Allocate a buffer to store all of the known constructor / destructor
1190 // functions in so we can iterate them on the device.
1191 void *Buffer =
1192 allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE);
1193 if (!Buffer)
1194 return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
1195 "failed to allocate memory for global buffer");
1196
1197 auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
1198 auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();
1199
1200 SmallVector<void *> FunctionPtrs(Funcs.size());
1201 std::size_t Idx = 0;
1202 for (auto [Name, Priority] : Funcs) {
1203 GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]);
1204 if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
1205 return Err;
1206 }
1207
1208 // Copy the local buffer to the device.
1209 if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(),
1210 FunctionPtrs.size() * sizeof(void *), nullptr))
1211 return Err;
1212
1213 // Copy the created buffer to the appropriate symbols so the kernel can
1214 // iterate through them.
1215 GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
1216 sizeof(void *), &GlobalPtrStart);
1217 if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
1218 return Err;
1219
1220 GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
1221 sizeof(void *), &GlobalPtrStop);
1222 if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
1223 return Err;
1224
1225 CUDAKernelTy CUDAKernel(KernelName);
1226
1227 if (auto Err = CUDAKernel.init(*this, Image))
1228 return Err;
1229
1230 AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
1231
1232 KernelArgsTy KernelArgs = {};
1233 uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
1234 if (auto Err = CUDAKernel.launchImpl(
1235 *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
1236 KernelLaunchParamsTy{}, AsyncInfoWrapper))
1237 return Err;
1238
1239 Error Err = Plugin::success();
1240 AsyncInfoWrapper.finalize(Err);
1241
1242 if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
1243 return Plugin::error(ErrorCode::UNKNOWN,
1244 "failed to free memory for global buffer");
1245
1246 return Err;
1247 }
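
  // Illustrative sketch (not part of the plugin): after the copies above, the
  // device sees a contiguous array of function pointers bracketed by the two
  // globals that the nvptx$device$init / nvptx$device$fini kernel walks:
  //
  //   __init_array_start -> [ &ctor_a, &ctor_b, ... ]  (sorted by priority)
  //   __init_array_end   -> one past the last entry
  //
  // (the __fini_array_* pair serves destructors). The names ctor_a/ctor_b are
  // placeholders; the real entries come from the __init_array_object_* and
  // __fini_array_object_* globals collected from the ELF image.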
1248
1249 /// Stream manager for CUDA streams.
1250 CUDAStreamManagerTy CUDAStreamManager;
1251
1252 /// Event manager for CUDA events.
1253 CUDAEventManagerTy CUDAEventManager;
1254
1255 /// The device's context. This context should be set before performing
1256 /// operations on the device.
1257 CUcontext Context = nullptr;
1258
1259 /// The CUDA device handler.
1260 CUdevice Device = CU_DEVICE_INVALID;
1261
1262 /// The memory mapped addresses and their handles
1263 std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;
1264
1265 /// The compute capability of the corresponding CUDA device.
1266 struct ComputeCapabilityTy {
1267 uint32_t Major;
1268 uint32_t Minor;
1269 std::string str() const {
1270 return "sm_" + std::to_string(val: Major * 10 + Minor);
1271 }
1272 } ComputeCapability;
1273
1274 /// The maximum number of warps that can be resident on all the SMs
1275 /// simultaneously.
1276 uint32_t HardwareParallelism = 0;
1277};
1278
1279Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
1280 uint32_t NumThreads[3], uint32_t NumBlocks[3],
1281 KernelArgsTy &KernelArgs,
1282 KernelLaunchParamsTy LaunchParams,
1283 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
1284 CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
1285
1286 CUstream Stream;
1287 if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
1288 return Err;
1289
1290 uint32_t MaxDynCGroupMem =
1291 std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
1292
1293 void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
1294 CU_LAUNCH_PARAM_BUFFER_SIZE,
1295 reinterpret_cast<void *>(&LaunchParams.Size),
1296 CU_LAUNCH_PARAM_END};
1297
1298 // If we are running an RPC server we want to wake up the server thread
1299 // whenever there is a kernel running and let it sleep otherwise.
1300 if (GenericDevice.getRPCServer())
1301 GenericDevice.Plugin.getRPCServer().Thread->notify();
1302
1303 CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
1304 NumThreads[0], NumThreads[1], NumThreads[2],
1305 MaxDynCGroupMem, Stream, nullptr, Config);
1306
1307 // Register a callback to indicate when the kernel is complete.
1308 if (GenericDevice.getRPCServer())
1309 cuLaunchHostFunc(
1310 Stream,
1311 [](void *Data) {
1312 GenericPluginTy &Plugin = *reinterpret_cast<GenericPluginTy *>(Data);
1313 Plugin.getRPCServer().Thread->finish();
1314 },
1315 &GenericDevice.Plugin);
1316
1317 return Plugin::check(Res, "error in cuLaunchKernel for '%s': %s", getName());
1318}
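
// Illustrative sketch (not part of the plugin): the Config array above uses
// the driver's "extra" parameter mechanism rather than a per-argument pointer
// array; conceptually the launch looks like this (placeholder names):
//
//   void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, PackedArgs,
//                     CU_LAUNCH_PARAM_BUFFER_SIZE,    &PackedArgsSize,
//                     CU_LAUNCH_PARAM_END};
//   cuLaunchKernel(Func, GridX, GridY, GridZ, BlockX, BlockY, BlockZ,
//                  SharedMemBytes, Stream, /*kernelParams=*/nullptr, Config);
//
// PackedArgs and PackedArgsSize correspond to LaunchParams.Data and
// LaunchParams.Size prepared by the generic plugin layer.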
1319
1320/// Class implementing the CUDA-specific functionalities of the global handler.
1321class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
1322public:
1323 /// Get the metadata of a global from the device. The name and size of the
1324 /// global is read from DeviceGlobal and the address of the global is written
1325 /// to DeviceGlobal.
1326 Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
1327 DeviceImageTy &Image,
1328 GlobalTy &DeviceGlobal) override {
1329 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
1330
1331 const char *GlobalName = DeviceGlobal.getName().data();
1332
1333 size_t CUSize;
1334 CUdeviceptr CUPtr;
1335 CUresult Res =
1336 cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
1337 if (auto Err = Plugin::check(Res, "error in cuModuleGetGlobal for '%s': %s",
1338 GlobalName))
1339 return Err;
1340
1341 if (CUSize != DeviceGlobal.getSize())
1342 return Plugin::error(
1343 ErrorCode::INVALID_BINARY,
1344 "failed to load global '%s' due to size mismatch (%zu != %zu)",
1345 GlobalName, CUSize, (size_t)DeviceGlobal.getSize());
1346
1347 DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
1348 return Plugin::success();
1349 }
1350};
1351
1352/// Class implementing the CUDA-specific functionalities of the plugin.
1353struct CUDAPluginTy final : public GenericPluginTy {
1354 /// Create a CUDA plugin.
1355 CUDAPluginTy() : GenericPluginTy(getTripleArch()) {}
1356
1357 /// This class should not be copied.
1358 CUDAPluginTy(const CUDAPluginTy &) = delete;
1359 CUDAPluginTy(CUDAPluginTy &&) = delete;
1360
1361 /// Initialize the plugin and return the number of devices.
1362 Expected<int32_t> initImpl() override {
1363 CUresult Res = cuInit(0);
1364 if (Res == CUDA_ERROR_INVALID_HANDLE) {
1365 // Cannot call cuGetErrorString if dlsym failed.
1366 DP("Failed to load CUDA shared library\n");
1367 return 0;
1368 }
1369
1370 if (Res == CUDA_ERROR_NO_DEVICE) {
1371 // Do not initialize if there are no devices.
1372 DP("There are no devices supporting CUDA.\n");
1373 return 0;
1374 }
1375
1376 if (auto Err = Plugin::check(Res, "error in cuInit: %s"))
1377 return std::move(Err);
1378
1379 // Get the number of devices.
1380 int NumDevices;
1381 Res = cuDeviceGetCount(&NumDevices);
1382 if (auto Err = Plugin::check(Res, "error in cuDeviceGetCount: %s"))
1383 return std::move(Err);
1384
1385 // Do not initialize if there are no devices.
1386 if (NumDevices == 0)
1387 DP("There are no devices supporting CUDA.\n");
1388
1389 return NumDevices;
1390 }
1391
1392 /// Deinitialize the plugin.
1393 Error deinitImpl() override { return Plugin::success(); }
1394
1395 /// Creates a CUDA device to use for offloading.
1396 GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
1397 int32_t NumDevices) override {
1398 return new CUDADeviceTy(Plugin, DeviceId, NumDevices);
1399 }
1400
1401 /// Creates a CUDA global handler.
1402 GenericGlobalHandlerTy *createGlobalHandler() override {
1403 return new CUDAGlobalHandlerTy();
1404 }
1405
1406 /// Get the ELF code for recognizing the compatible image binary.
1407 uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }
1408
1409 Triple::ArchType getTripleArch() const override {
1410 // TODO: I think we can drop the support for 32-bit NVPTX devices.
1411 return Triple::nvptx64;
1412 }
1413
1414 const char *getName() const override { return GETNAME(TARGET_NAME); }
1415
1416 /// Check whether the image is compatible with a CUDA device.
1417 Expected<bool> isELFCompatible(uint32_t DeviceId,
1418 StringRef Image) const override {
1419 auto ElfOrErr =
1420 ELF64LEObjectFile::create(MemoryBufferRef(Image, /*Identifier=*/""),
1421 /*InitContent=*/false);
1422 if (!ElfOrErr)
1423 return ElfOrErr.takeError();
1424
1425 // Get the numeric value for the image's `sm_` value.
1426 auto SM = ElfOrErr->getPlatformFlags() & ELF::EF_CUDA_SM;
1427
1428 CUdevice Device;
1429 CUresult Res = cuDeviceGet(&Device, DeviceId);
1430 if (auto Err = Plugin::check(Res, "error in cuDeviceGet: %s"))
1431 return std::move(Err);
1432
1433 int32_t Major, Minor;
1434 Res = cuDeviceGetAttribute(
1435 &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
1436 if (auto Err = Plugin::check(Res, "error in cuDeviceGetAttribute: %s"))
1437 return std::move(Err);
1438
1439 Res = cuDeviceGetAttribute(
1440 &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
1441 if (auto Err = Plugin::check(Res, "error in cuDeviceGetAttribute: %s"))
1442 return std::move(Err);
1443
1444 int32_t ImageMajor = SM / 10;
1445 int32_t ImageMinor = SM % 10;
1446
    // A cubin generated for a given compute capability is only guaranteed to
    // run on GPUs with the same major revision and an equal or higher minor
    // revision.
1450 return Major == ImageMajor && Minor >= ImageMinor;
1451 }
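
  // Worked example (illustrative): an image built for sm_70 has ImageMajor = 7
  // and ImageMinor = 0. It is accepted on an sm_75 device (7 == 7 and 5 >= 0)
  // but rejected on an sm_80 device (8 != 7), matching the rule above.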
1452};
1453
1454Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
1455 GenericDeviceTy &DstGenericDevice,
1456 void *DstPtr, int64_t Size,
1457 AsyncInfoWrapperTy &AsyncInfoWrapper) {
1458 if (auto Err = setContext())
1459 return Err;
1460
1461 CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);
1462
1463 CUresult Res;
1464 int32_t DstDeviceId = DstDevice.DeviceId;
1465 CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
1466 CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;
1467
1468 int CanAccessPeer = 0;
1469 if (DeviceId != DstDeviceId) {
1470 // Make sure the lock is released before performing the copies.
1471 std::lock_guard<std::mutex> Lock(PeerAccessesLock);
1472
1473 switch (PeerAccesses[DstDeviceId]) {
1474 case PeerAccessState::AVAILABLE:
1475 CanAccessPeer = 1;
1476 break;
1477 case PeerAccessState::UNAVAILABLE:
1478 CanAccessPeer = 0;
1479 break;
1480 case PeerAccessState::PENDING:
1481 // Check whether the source device can access the destination device.
1482 Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
1483 if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s"))
1484 return Err;
1485
1486 if (CanAccessPeer) {
1487 Res = cuCtxEnablePeerAccess(DstDevice.Context, 0);
1488 if (Res == CUDA_ERROR_TOO_MANY_PEERS) {
1489 // Resources may be exhausted due to many P2P links.
1490 CanAccessPeer = 0;
1491 DP("Too many P2P so fall back to D2D memcpy");
1492 } else if (auto Err =
1493 Plugin::check(Res, "error in cuCtxEnablePeerAccess: %s"))
1494 return Err;
1495 }
1496 PeerAccesses[DstDeviceId] = (CanAccessPeer)
1497 ? PeerAccessState::AVAILABLE
1498 : PeerAccessState::UNAVAILABLE;
1499 }
1500 }
1501
1502 CUstream Stream;
1503 if (auto Err = getStream(AsyncInfoWrapper, Stream))
1504 return Err;
1505
1506 if (CanAccessPeer) {
1507 // TODO: Should we fallback to D2D if peer access fails?
1508 Res = cuMemcpyPeerAsync(CUDstPtr, Context, CUSrcPtr, DstDevice.Context,
1509 Size, Stream);
1510 return Plugin::check(Res, "error in cuMemcpyPeerAsync: %s");
1511 }
1512
1513 // Fallback to D2D copy.
1514 Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream);
1515 return Plugin::check(Res, "error in cuMemcpyDtoDAsync: %s");
1516}
1517
1518template <typename... ArgsTy>
1519static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
1520 CUresult ResultCode = static_cast<CUresult>(Code);
1521 if (ResultCode == CUDA_SUCCESS)
1522 return Plugin::success();
1523
1524 const char *Desc = "Unknown error";
1525 CUresult Ret = cuGetErrorString(ResultCode, &Desc);
1526 if (Ret != CUDA_SUCCESS)
1527 REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
1528
1529 // TODO: Add more entries to this switch
1530 ErrorCode OffloadErrCode;
1531 switch (ResultCode) {
1532 case CUDA_ERROR_NOT_FOUND:
1533 OffloadErrCode = ErrorCode::NOT_FOUND;
1534 break;
1535 default:
1536 OffloadErrCode = ErrorCode::UNKNOWN;
1537 }
1538
1539 // TODO: Create a map for CUDA error codes to Offload error codes
1540 return Plugin::error(OffloadErrCode, ErrFmt, Args..., Desc);
1541}
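
// Illustrative sketch (not part of the plugin): every CUDA driver call in this
// file funnels its CUresult through Plugin::check so that the trailing '%s' in
// the format string is filled with the driver's error description, e.g.:
//
//   CUresult Res = cuCtxSetCurrent(Context);
//   if (auto Err = Plugin::check(Res, "error in cuCtxSetCurrent: %s"))
//     return Err;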
1542
1543} // namespace plugin
1544} // namespace target
1545} // namespace omp
1546} // namespace llvm
1547
1548extern "C" {
1549llvm::omp::target::plugin::GenericPluginTy *createPlugin_cuda() {
1550 return new llvm::omp::target::plugin::CUDAPluginTy();
1551}
1552}