1//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// RTL NextGen for CUDA machine
10//
11//===----------------------------------------------------------------------===//
12
13#include <cassert>
14#include <cstddef>
15#include <cuda.h>
16#include <string>
17#include <unordered_map>
18
19#include "Shared/Debug.h"
20#include "Shared/Environment.h"
21
22#include "GlobalHandler.h"
23#include "OpenMP/OMPT/Callback.h"
24#include "PluginInterface.h"
25#include "Utils/ELF.h"
26
27#include "llvm/BinaryFormat/ELF.h"
28#include "llvm/Frontend/OpenMP/OMPConstants.h"
29#include "llvm/Frontend/OpenMP/OMPGridValues.h"
30#include "llvm/Support/Error.h"
31#include "llvm/Support/FileOutputBuffer.h"
32#include "llvm/Support/FileSystem.h"
33#include "llvm/Support/Program.h"
34
35namespace llvm {
36namespace omp {
37namespace target {
38namespace plugin {
39
40/// Forward declarations for all specialized data structures.
41struct CUDAKernelTy;
42struct CUDADeviceTy;
43struct CUDAPluginTy;
44
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11000))
/// Forward declarations for all Virtual Memory Management
/// related data structures and functions. These stubs are necessary to
/// compile against older CUDA versions; the VA management entry points are
/// never reached there because supportVAManagement() reports false.
typedef void *CUmemGenericAllocationHandle;
typedef void *CUmemAllocationProp;
typedef void *CUmemAccessDesc;
typedef void *CUmemAllocationGranularity_flags;
CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
                             CUdeviceptr addr, unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
                  CUmemGenericAllocationHandle handle,
                  unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
                     const CUmemAllocationProp *prop,
                     unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
                        const CUmemAccessDesc *desc, size_t count) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult
cuMemGetAllocationGranularity(size_t *granularity,
                              const CUmemAllocationProp *prop,
                              CUmemAllocationGranularity_flags option) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif
68
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
// Forward declarations of asynchronous memory management functions. These
// stubs are necessary to compile against older versions of CUDA and simply
// report failure if they are ever reached.
CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) {
  *ptr = 0;
  return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif
76
77/// Class implementing the CUDA device images properties.
78struct CUDADeviceImageTy : public DeviceImageTy {
79 /// Create the CUDA image with the id and the target image pointer.
80 CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
81 const __tgt_device_image *TgtImage)
82 : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {}
83
84 /// Load the image as a CUDA module.
85 Error loadModule() {
86 assert(!Module && "Module already loaded");
87
88 CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
89 if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
90 return Err;
91
92 return Plugin::success();
93 }
94
95 /// Unload the CUDA module corresponding to the image.
96 Error unloadModule() {
97 assert(Module && "Module not loaded");
98
99 CUresult Res = cuModuleUnload(Module);
100 if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
101 return Err;
102
103 Module = nullptr;
104
105 return Plugin::success();
106 }
107
108 /// Getter of the CUDA module.
109 CUmodule getModule() const { return Module; }
110
111private:
112 /// The CUDA module that loaded the image.
113 CUmodule Module;
114};
115
116/// Class implementing the CUDA kernel functionalities which derives from the
117/// generic kernel class.
118struct CUDAKernelTy : public GenericKernelTy {
119 /// Create a CUDA kernel with a name and an execution mode.
120 CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}
121
122 /// Initialize the CUDA kernel.
123 Error initImpl(GenericDeviceTy &GenericDevice,
124 DeviceImageTy &Image) override {
125 CUresult Res;
126 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
127
128 // Retrieve the function pointer of the kernel.
129 Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
130 if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
131 getName()))
132 return Err;
133
134 // Check that the function pointer is valid.
135 if (!Func)
136 return Plugin::error("Invalid function for kernel %s", getName());
137
138 int MaxThreads;
139 Res = cuFuncGetAttribute(&MaxThreads,
140 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
141 if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
142 return Err;
143
144 // The maximum number of threads cannot exceed the maximum of the kernel.
145 MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
146
147 return Plugin::success();
148 }
149
150 /// Launch the CUDA kernel function.
151 Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
152 uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
153 AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
154
155private:
156 /// The CUDA kernel function to execute.
157 CUfunction Func;
158};
159
160/// Class wrapping a CUDA stream reference. These are the objects handled by the
161/// Stream Manager for the CUDA plugin.
162struct CUDAStreamRef final : public GenericDeviceResourceRef {
163 /// The underlying handle type for streams.
164 using HandleTy = CUstream;
165
166 /// Create an empty reference to an invalid stream.
167 CUDAStreamRef() : Stream(nullptr) {}
168
169 /// Create a reference to an existing stream.
170 CUDAStreamRef(HandleTy Stream) : Stream(Stream) {}
171
  /// Create a new stream and save the reference. The reference must be empty
  /// before calling this function.
174 Error create(GenericDeviceTy &Device) override {
175 if (Stream)
176 return Plugin::error("Creating an existing stream");
177
178 CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
179 if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s"))
180 return Err;
181
182 return Plugin::success();
183 }
184
  /// Destroy the referenced stream and invalidate the reference. The reference
  /// must point to a valid stream before calling this function.
187 Error destroy(GenericDeviceTy &Device) override {
188 if (!Stream)
189 return Plugin::error("Destroying an invalid stream");
190
191 CUresult Res = cuStreamDestroy(Stream);
192 if (auto Err = Plugin::check(Res, "Error in cuStreamDestroy: %s"))
193 return Err;
194
195 Stream = nullptr;
196 return Plugin::success();
197 }
198
199 /// Get the underlying CUDA stream.
200 operator HandleTy() const { return Stream; }
201
202private:
203 /// The reference to the CUDA stream.
204 HandleTy Stream;
205};
206
207/// Class wrapping a CUDA event reference. These are the objects handled by the
208/// Event Manager for the CUDA plugin.
209struct CUDAEventRef final : public GenericDeviceResourceRef {
210 /// The underlying handle type for events.
211 using HandleTy = CUevent;
212
213 /// Create an empty reference to an invalid event.
214 CUDAEventRef() : Event(nullptr) {}
215
216 /// Create a reference to an existing event.
217 CUDAEventRef(HandleTy Event) : Event(Event) {}
218
  /// Create a new event and save the reference. The reference must be empty
  /// before calling this function.
221 Error create(GenericDeviceTy &Device) override {
222 if (Event)
223 return Plugin::error("Creating an existing event");
224
225 CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT);
226 if (auto Err = Plugin::check(Res, "Error in cuEventCreate: %s"))
227 return Err;
228
229 return Plugin::success();
230 }
231
  /// Destroy the referenced event and invalidate the reference. The reference
  /// must point to a valid event before calling this function.
234 Error destroy(GenericDeviceTy &Device) override {
235 if (!Event)
236 return Plugin::error("Destroying an invalid event");
237
238 CUresult Res = cuEventDestroy(Event);
239 if (auto Err = Plugin::check(Res, "Error in cuEventDestroy: %s"))
240 return Err;
241
242 Event = nullptr;
243 return Plugin::success();
244 }
245
246 /// Get the underlying CUevent.
247 operator HandleTy() const { return Event; }
248
249private:
250 /// The reference to the CUDA event.
251 HandleTy Event;
252};
253
254/// Class implementing the CUDA device functionalities which derives from the
255/// generic device class.
256struct CUDADeviceTy : public GenericDeviceTy {
257 // Create a CUDA device with a device id and the default CUDA grid values.
258 CUDADeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices)
259 : GenericDeviceTy(Plugin, DeviceId, NumDevices, NVPTXGridValues),
260 CUDAStreamManager(*this), CUDAEventManager(*this) {}
261
262 ~CUDADeviceTy() {}
263
264 /// Initialize the device, its resources and get its properties.
265 Error initImpl(GenericPluginTy &Plugin) override {
266 CUresult Res = cuDeviceGet(&Device, DeviceId);
267 if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
268 return Err;
269
270 // Query the current flags of the primary context and set its flags if
271 // it is inactive.
272 unsigned int FormerPrimaryCtxFlags = 0;
273 int FormerPrimaryCtxIsActive = 0;
274 Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
275 &FormerPrimaryCtxIsActive);
276 if (auto Err =
277 Plugin::check(Res, "Error in cuDevicePrimaryCtxGetState: %s"))
278 return Err;
279
280 if (FormerPrimaryCtxIsActive) {
281 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
282 "The primary context is active, no change to its flags\n");
283 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
284 CU_CTX_SCHED_BLOCKING_SYNC)
285 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
286 "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
287 } else {
288 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
289 "The primary context is inactive, set its flags to "
290 "CU_CTX_SCHED_BLOCKING_SYNC\n");
291 Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
292 if (auto Err =
293 Plugin::check(Res, "Error in cuDevicePrimaryCtxSetFlags: %s"))
294 return Err;
295 }
296
297 // Retain the per device primary context and save it to use whenever this
298 // device is selected.
299 Res = cuDevicePrimaryCtxRetain(&Context, Device);
300 if (auto Err = Plugin::check(Res, "Error in cuDevicePrimaryCtxRetain: %s"))
301 return Err;
302
303 if (auto Err = setContext())
304 return Err;
305
306 // Initialize stream pool.
307 if (auto Err = CUDAStreamManager.init(OMPX_InitialNumStreams))
308 return Err;
309
310 // Initialize event pool.
311 if (auto Err = CUDAEventManager.init(OMPX_InitialNumEvents))
312 return Err;
313
314 // Query attributes to determine number of threads/block and blocks/grid.
315 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
316 GridValues.GV_Max_Teams))
317 return Err;
318
319 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
320 GridValues.GV_Max_WG_Size))
321 return Err;
322
323 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE,
324 GridValues.GV_Warp_Size))
325 return Err;
326
327 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
328 ComputeCapability.Major))
329 return Err;
330
331 if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
332 ComputeCapability.Minor))
333 return Err;
334
    uint32_t NumMultiprocessors = 0;
    uint32_t MaxThreadsPerSM = 0;
    uint32_t WarpSize = 0;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                 NumMultiprocessors))
      return Err;
    if (auto Err =
            getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                          MaxThreadsPerSM))
      return Err;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
      return Err;
    HardwareParallelism = NumMultiprocessors * (MaxThreadsPerSM / WarpSize);
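    // Illustrative arithmetic with assumed example values: a GPU with 108 SMs,
    // 2048 maximum threads per SM, and a warp size of 32 would report
    // 108 * (2048 / 32) = 6912 as its hardware parallelism (resident warps).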
348
349 return Plugin::success();
350 }
351
352 /// Deinitialize the device and release its resources.
353 Error deinitImpl() override {
354 if (Context) {
355 if (auto Err = setContext())
356 return Err;
357 }
358
359 // Deinitialize the stream manager.
360 if (auto Err = CUDAStreamManager.deinit())
361 return Err;
362
363 if (auto Err = CUDAEventManager.deinit())
364 return Err;
365
366 // Close modules if necessary.
367 if (!LoadedImages.empty()) {
368 assert(Context && "Invalid CUDA context");
369
370 // Each image has its own module.
371 for (DeviceImageTy *Image : LoadedImages) {
372 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);
373
374 // Unload the module of the image.
375 if (auto Err = CUDAImage.unloadModule())
376 return Err;
377 }
378 }
379
380 if (Context) {
381 CUresult Res = cuDevicePrimaryCtxRelease(Device);
382 if (auto Err =
383 Plugin::check(Res, "Error in cuDevicePrimaryCtxRelease: %s"))
384 return Err;
385 }
386
387 // Invalidate context and device references.
388 Context = nullptr;
389 Device = CU_DEVICE_INVALID;
390
391 return Plugin::success();
392 }
393
394 virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
395 DeviceImageTy &Image) override {
    // Check for the presence of global destructors at initialization time.
    // This is required when the image may be deallocated before destructors
    // are run.
398 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
399 if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
400 Image.setPendingGlobalDtors();
401
402 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
403 }
404
405 virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
406 DeviceImageTy &Image) override {
407 if (Image.hasPendingGlobalDtors())
408 return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
409 return Plugin::success();
410 }
411
412 Expected<std::unique_ptr<MemoryBuffer>>
413 doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
414 // TODO: We should be able to use the 'nvidia-ptxjitcompiler' interface to
415 // avoid the call to 'ptxas'.
416 SmallString<128> PTXInputFilePath;
417 std::error_code EC = sys::fs::createTemporaryFile("nvptx-pre-link-jit", "s",
418 PTXInputFilePath);
419 if (EC)
420 return Plugin::error("Failed to create temporary file for ptxas");
421
422 // Write the file's contents to the output file.
423 Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
424 FileOutputBuffer::create(PTXInputFilePath, MB->getBuffer().size());
425 if (!OutputOrErr)
426 return OutputOrErr.takeError();
427 std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
428 llvm::copy(MB->getBuffer(), Output->getBufferStart());
429 if (Error E = Output->commit())
430 return std::move(E);
431
432 SmallString<128> PTXOutputFilePath;
433 EC = sys::fs::createTemporaryFile("nvptx-post-link-jit", "cubin",
434 PTXOutputFilePath);
435 if (EC)
436 return Plugin::error("Failed to create temporary file for ptxas");
437
438 // Try to find `ptxas` in the path to compile the PTX to a binary.
439 const auto ErrorOrPath = sys::findProgramByName("ptxas");
440 if (!ErrorOrPath)
441 return Plugin::error("Failed to find 'ptxas' on the PATH.");
442
443 std::string Arch = getComputeUnitKind();
444 StringRef Args[] = {*ErrorOrPath,
445 "-m64",
446 "-O2",
447 "--gpu-name",
448 Arch,
449 "--output-file",
450 PTXOutputFilePath,
451 PTXInputFilePath};
452
453 std::string ErrMsg;
454 if (sys::ExecuteAndWait(*ErrorOrPath, Args, std::nullopt, {}, 0, 0,
455 &ErrMsg))
456 return Plugin::error("Running 'ptxas' failed: %s\n", ErrMsg.c_str());
457
458 auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(PTXOutputFilePath.data());
459 if (!BufferOrErr)
460 return Plugin::error("Failed to open temporary file for ptxas");
461
462 // Clean up the temporary files afterwards.
463 if (sys::fs::remove(PTXOutputFilePath))
464 return Plugin::error("Failed to remove temporary file for ptxas");
465 if (sys::fs::remove(PTXInputFilePath))
466 return Plugin::error("Failed to remove temporary file for ptxas");
467
468 return std::move(*BufferOrErr);
469 }
470
471 /// Allocate and construct a CUDA kernel.
472 Expected<GenericKernelTy &> constructKernel(const char *Name) override {
473 // Allocate and construct the CUDA kernel.
474 CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
475 if (!CUDAKernel)
476 return Plugin::error("Failed to allocate memory for CUDA kernel");
477
478 new (CUDAKernel) CUDAKernelTy(Name);
479
480 return *CUDAKernel;
481 }
482
483 /// Set the current context to this device's context.
484 Error setContext() override {
485 CUresult Res = cuCtxSetCurrent(Context);
486 return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
487 }
488
489 /// NVIDIA returns the product of the SM count and the number of warps that
490 /// fit if the maximum number of threads were scheduled on each SM.
491 uint64_t getHardwareParallelism() const override {
492 return HardwareParallelism;
493 }
494
  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
497 bool shouldSetupRPCServer() const override {
498 return libomptargetSupportsRPC();
499 }
500
  /// The RPC interface should have enough space for all available parallelism.
502 uint64_t requestedRPCPortCount() const override {
503 return getHardwareParallelism();
504 }
505
  /// Get the stream of the asynchronous info structure or get a new one.
507 Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
508 // Get the stream (if any) from the async info.
509 Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
510 if (!Stream) {
511 // There was no stream; get an idle one.
512 if (auto Err = CUDAStreamManager.getResource(Stream))
513 return Err;
514
515 // Modify the async info's stream.
516 AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
517 }
518 return Plugin::success();
519 }
520
521 /// Getters of CUDA references.
522 CUcontext getCUDAContext() const { return Context; }
523 CUdevice getCUDADevice() const { return Device; }
524
525 /// Load the binary image into the device and allocate an image object.
526 Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
527 int32_t ImageId) override {
528 if (auto Err = setContext())
529 return std::move(Err);
530
531 // Allocate and initialize the image object.
532 CUDADeviceImageTy *CUDAImage = Plugin.allocate<CUDADeviceImageTy>();
533 new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage);
534
535 // Load the CUDA module.
536 if (auto Err = CUDAImage->loadModule())
537 return std::move(Err);
538
539 return CUDAImage;
540 }
541
542 /// Allocate memory on the device or related to the device.
543 void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
544 if (Size == 0)
545 return nullptr;
546
547 if (auto Err = setContext()) {
548 REPORT("Failure to alloc memory: %s\n", toString(E: std::move(Err)).data());
549 return nullptr;
550 }
551
552 void *MemAlloc = nullptr;
553 CUdeviceptr DevicePtr;
554 CUresult Res;
555
556 switch (Kind) {
557 case TARGET_ALLOC_DEFAULT:
558 case TARGET_ALLOC_DEVICE:
559 Res = cuMemAlloc(&DevicePtr, Size);
560 MemAlloc = (void *)DevicePtr;
561 break;
562 case TARGET_ALLOC_HOST:
563 Res = cuMemAllocHost(&MemAlloc, Size);
564 break;
565 case TARGET_ALLOC_SHARED:
566 Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
567 MemAlloc = (void *)DevicePtr;
568 break;
569 case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
570 CUstream Stream;
571 if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
572 break;
573 if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
574 break;
575 cuStreamSynchronize(Stream);
576 Res = cuStreamDestroy(Stream);
577 MemAlloc = (void *)DevicePtr;
578 }
579 }
580
581 if (auto Err =
582 Plugin::check(Res, "Error in cuMemAlloc[Host|Managed]: %s")) {
583 REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
584 return nullptr;
585 }
586 return MemAlloc;
587 }
588
589 /// Deallocate memory on the device or related to the device.
590 int free(void *TgtPtr, TargetAllocTy Kind) override {
591 if (TgtPtr == nullptr)
592 return OFFLOAD_SUCCESS;
593
594 if (auto Err = setContext()) {
595 REPORT("Failure to free memory: %s\n", toString(E: std::move(Err)).data());
596 return OFFLOAD_FAIL;
597 }
598
599 CUresult Res;
600 switch (Kind) {
601 case TARGET_ALLOC_DEFAULT:
602 case TARGET_ALLOC_DEVICE:
603 case TARGET_ALLOC_SHARED:
604 Res = cuMemFree((CUdeviceptr)TgtPtr);
605 break;
606 case TARGET_ALLOC_HOST:
607 Res = cuMemFreeHost(TgtPtr);
608 break;
609 case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
610 CUstream Stream;
611 if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
612 break;
      if ((Res =
               cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream)))
        break;
614 cuStreamSynchronize(Stream);
615 if ((Res = cuStreamDestroy(Stream)))
616 break;
617 }
618 }
619
620 if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
621 REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
622 return OFFLOAD_FAIL;
623 }
624 return OFFLOAD_SUCCESS;
625 }
626
627 /// Synchronize current thread with the pending operations on the async info.
628 Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
629 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
630 CUresult Res;
631 // If we have an RPC server running on this device we will continuously
632 // query it for work rather than blocking.
633 if (!getRPCServer()) {
634 Res = cuStreamSynchronize(Stream);
635 } else {
636 do {
637 Res = cuStreamQuery(Stream);
638 if (auto Err = getRPCServer()->runServer(*this))
639 return Err;
640 } while (Res == CUDA_ERROR_NOT_READY);
641 }
642
643 // Once the stream is synchronized, return it to stream pool and reset
644 // AsyncInfo. This is to make sure the synchronization only works for its
645 // own tasks.
646 AsyncInfo.Queue = nullptr;
647 if (auto Err = CUDAStreamManager.returnResource(Stream))
648 return Err;
649
650 return Plugin::check(Res, "Error in cuStreamSynchronize: %s");
651 }
652
  /// CUDA supports VA management.
654 bool supportVAManagement() const override {
655#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11000))
656 return true;
657#else
658 return false;
659#endif
660 }
661
  /// Allocates \p RSize bytes (rounded up to page size) and hints the CUDA
  /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
  /// On return, \p RSize contains the actual mapped size.
665 Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
666 CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
667 auto IHandle = DeviceMMaps.find(DVAddr);
668 size_t Size = *RSize;
669
670 if (Size == 0)
671 return Plugin::error("Memory Map Size must be larger than 0");
672
673 // Check if we have already mapped this address
674 if (IHandle != DeviceMMaps.end())
675 return Plugin::error("Address already memory mapped");
676
677 CUmemAllocationProp Prop = {};
678 size_t Granularity = 0;
679
680 size_t Free, Total;
681 CUresult Res = cuMemGetInfo(&Free, &Total);
682 if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
683 return Err;
684
685 if (Size >= Free) {
686 *Addr = nullptr;
      return Plugin::error(
          "Cannot map memory size larger than the available device memory");
689 }
690
    // Currently NVIDIA only supports pinned device types.
692 Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
693 Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
694
695 Prop.location.id = DeviceId;
    Res = cuMemGetAllocationGranularity(&Granularity, &Prop,
                                        CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (auto Err =
            Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s"))
      return Err;
701
    if (Granularity == 0)
      return Plugin::error("Wrong device page size");
704
705 // Ceil to page size.
706 Size = roundUp(Size, Granularity);
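    // Illustrative example with assumed numbers: if the reported granularity
    // is 2 MiB, a requested size of 5 MiB is rounded up to 6 MiB before the
    // reservation and mapping below.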
707
    // Create a handle for our allocation.
709 CUmemGenericAllocationHandle AHandle;
710 Res = cuMemCreate(&AHandle, Size, &Prop, 0);
711 if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s"))
712 return Err;
713
714 CUdeviceptr DevPtr = 0;
715 Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
716 if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s"))
717 return Err;
718
719 Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
720 if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s"))
721 return Err;
722
723 CUmemAccessDesc ADesc = {};
724 ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
725 ADesc.location.id = DeviceId;
726 ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
727
    // Set the access permissions of the mapped range.
729 Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
730 if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s"))
731 return Err;
732
733 *Addr = reinterpret_cast<void *>(DevPtr);
734 *RSize = Size;
735 DeviceMMaps.insert({DevPtr, AHandle});
736 return Plugin::success();
737 }
738
  /// Deallocate device memory and unmap the virtual address.
740 Error memoryVAUnMap(void *VAddr, size_t Size) override {
741 CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
742 auto IHandle = DeviceMMaps.find(DVAddr);
    // Mapping does not exist.
    if (IHandle == DeviceMMaps.end())
      return Plugin::error("Addr is not MemoryMapped");
750
751 CUmemGenericAllocationHandle &AllocHandle = IHandle->second;
752
753 CUresult Res = cuMemUnmap(DVAddr, Size);
754 if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s"))
755 return Err;
756
757 Res = cuMemRelease(AllocHandle);
758 if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s"))
759 return Err;
760
761 Res = cuMemAddressFree(DVAddr, Size);
762 if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s"))
763 return Err;
764
765 DeviceMMaps.erase(IHandle);
766 return Plugin::success();
767 }
768
769 /// Query for the completion of the pending operations on the async info.
770 Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
771 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
772 CUresult Res = cuStreamQuery(Stream);
773
774 // Not ready streams must be considered as successful operations.
775 if (Res == CUDA_ERROR_NOT_READY)
776 return Plugin::success();
777
778 // Once the stream is synchronized and the operations completed (or an error
779 // occurs), return it to stream pool and reset AsyncInfo. This is to make
780 // sure the synchronization only works for its own tasks.
781 AsyncInfo.Queue = nullptr;
782 if (auto Err = CUDAStreamManager.returnResource(Stream))
783 return Err;
784
785 return Plugin::check(Res, "Error in cuStreamQuery: %s");
786 }
787
788 Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
789 // TODO: Register the buffer as CUDA host memory.
790 return HstPtr;
791 }
792
793 Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
794
795 Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
796 void *&BaseDevAccessiblePtr,
797 size_t &BaseSize) const override {
798 // TODO: Implement pinning feature for CUDA.
799 return false;
800 }
801
802 /// Submit data to the device (host to device transfer).
803 Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
804 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
805 if (auto Err = setContext())
806 return Err;
807
808 CUstream Stream;
809 if (auto Err = getStream(AsyncInfoWrapper, Stream))
810 return Err;
811
812 CUresult Res = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
813 return Plugin::check(Res, "Error in cuMemcpyHtoDAsync: %s");
814 }
815
816 /// Retrieve data from the device (device to host transfer).
817 Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
818 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
819 if (auto Err = setContext())
820 return Err;
821
822 CUstream Stream;
823 if (auto Err = getStream(AsyncInfoWrapper, Stream))
824 return Err;
825
826 // If there is already pending work on the stream it could be waiting for
827 // someone to check the RPC server.
828 if (auto *RPCServer = getRPCServer()) {
829 CUresult Res = cuStreamQuery(Stream);
830 while (Res == CUDA_ERROR_NOT_READY) {
831 if (auto Err = RPCServer->runServer(*this))
832 return Err;
833 Res = cuStreamQuery(Stream);
834 }
835 }
836
837 CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
838 return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s");
839 }
840
  /// Exchange data between two devices directly. We may use peer access if
  /// the CUDA devices and driver allow it.
843 Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
844 void *DstPtr, int64_t Size,
845 AsyncInfoWrapperTy &AsyncInfoWrapper) override;
846
847 /// Initialize the async info for interoperability purposes.
848 Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
849 if (auto Err = setContext())
850 return Err;
851
852 CUstream Stream;
853 if (auto Err = getStream(AsyncInfoWrapper, Stream))
854 return Err;
855
856 return Plugin::success();
857 }
858
859 /// Initialize the device info for interoperability purposes.
860 Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
861 assert(Context && "Context is null");
862 assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");
863
864 if (auto Err = setContext())
865 return Err;
866
867 if (!DeviceInfo->Context)
868 DeviceInfo->Context = Context;
869
870 if (!DeviceInfo->Device)
871 DeviceInfo->Device = reinterpret_cast<void *>(Device);
872
873 return Plugin::success();
874 }
875
876 /// Create an event.
877 Error createEventImpl(void **EventPtrStorage) override {
878 CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
879 return CUDAEventManager.getResource(*Event);
880 }
881
882 /// Destroy a previously created event.
883 Error destroyEventImpl(void *EventPtr) override {
884 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
885 return CUDAEventManager.returnResource(Event);
886 }
887
888 /// Record the event.
889 Error recordEventImpl(void *EventPtr,
890 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
891 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
892
893 CUstream Stream;
894 if (auto Err = getStream(AsyncInfoWrapper, Stream))
895 return Err;
896
897 CUresult Res = cuEventRecord(Event, Stream);
898 return Plugin::check(Res, "Error in cuEventRecord: %s");
899 }
900
901 /// Make the stream wait on the event.
902 Error waitEventImpl(void *EventPtr,
903 AsyncInfoWrapperTy &AsyncInfoWrapper) override {
904 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
905
906 CUstream Stream;
907 if (auto Err = getStream(AsyncInfoWrapper, Stream))
908 return Err;
909
    // Do not use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version onwards, where it is defined as 0x0. In previous
    // versions, per the CUDA API documentation, that argument has to be 0x0.
913 CUresult Res = cuStreamWaitEvent(Stream, Event, 0);
914 return Plugin::check(Res, "Error in cuStreamWaitEvent: %s");
915 }
916
917 /// Synchronize the current thread with the event.
918 Error syncEventImpl(void *EventPtr) override {
919 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
920 CUresult Res = cuEventSynchronize(Event);
921 return Plugin::check(Res, "Error in cuEventSynchronize: %s");
922 }
923
924 /// Print information about the device.
925 Error obtainInfoImpl(InfoQueueTy &Info) override {
926 char TmpChar[1000];
927 const char *TmpCharPtr;
928 size_t TmpSt;
929 int TmpInt;
930
931 CUresult Res = cuDriverGetVersion(&TmpInt);
932 if (Res == CUDA_SUCCESS)
933 Info.add("CUDA Driver Version", TmpInt);
934
935 Info.add("CUDA OpenMP Device Number", DeviceId);
936
937 Res = cuDeviceGetName(TmpChar, 1000, Device);
938 if (Res == CUDA_SUCCESS)
939 Info.add("Device Name", TmpChar);
940
941 Res = cuDeviceTotalMem(&TmpSt, Device);
942 if (Res == CUDA_SUCCESS)
943 Info.add("Global Memory Size", TmpSt, "bytes");
944
945 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
946 if (Res == CUDA_SUCCESS)
947 Info.add("Number of Multiprocessors", TmpInt);
948
949 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
950 if (Res == CUDA_SUCCESS)
951 Info.add("Concurrent Copy and Execution", (bool)TmpInt);
952
953 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
954 if (Res == CUDA_SUCCESS)
955 Info.add("Total Constant Memory", TmpInt, "bytes");
956
957 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
958 TmpInt);
959 if (Res == CUDA_SUCCESS)
960 Info.add("Max Shared Memory per Block", TmpInt, "bytes");
961
962 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
963 if (Res == CUDA_SUCCESS)
964 Info.add("Registers per Block", TmpInt);
965
966 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
967 if (Res == CUDA_SUCCESS)
968 Info.add("Warp Size", TmpInt);
969
970 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
971 if (Res == CUDA_SUCCESS)
972 Info.add("Maximum Threads per Block", TmpInt);
973
974 Info.add("Maximum Block Dimensions", "");
975 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
976 if (Res == CUDA_SUCCESS)
977 Info.add<InfoLevel2>("x", TmpInt);
978 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
979 if (Res == CUDA_SUCCESS)
980 Info.add<InfoLevel2>("y", TmpInt);
981 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
982 if (Res == CUDA_SUCCESS)
983 Info.add<InfoLevel2>("z", TmpInt);
984
985 Info.add("Maximum Grid Dimensions", "");
986 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
987 if (Res == CUDA_SUCCESS)
988 Info.add<InfoLevel2>("x", TmpInt);
989 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
990 if (Res == CUDA_SUCCESS)
991 Info.add<InfoLevel2>("y", TmpInt);
992 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
993 if (Res == CUDA_SUCCESS)
994 Info.add<InfoLevel2>("z", TmpInt);
995
996 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
997 if (Res == CUDA_SUCCESS)
998 Info.add("Maximum Memory Pitch", TmpInt, "bytes");
999
1000 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
1001 if (Res == CUDA_SUCCESS)
1002 Info.add("Texture Alignment", TmpInt, "bytes");
1003
1004 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
1005 if (Res == CUDA_SUCCESS)
1006 Info.add("Clock Rate", TmpInt, "kHz");
1007
1008 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
1009 if (Res == CUDA_SUCCESS)
1010 Info.add("Execution Timeout", (bool)TmpInt);
1011
1012 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
1013 if (Res == CUDA_SUCCESS)
1014 Info.add("Integrated Device", (bool)TmpInt);
1015
1016 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
1017 if (Res == CUDA_SUCCESS)
1018 Info.add("Can Map Host Memory", (bool)TmpInt);
1019
1020 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
1021 if (Res == CUDA_SUCCESS) {
1022 if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1023 TmpCharPtr = "Default";
1024 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1025 TmpCharPtr = "Prohibited";
1026 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1027 TmpCharPtr = "Exclusive process";
1028 else
1029 TmpCharPtr = "Unknown";
1030 Info.add("Compute Mode", TmpCharPtr);
1031 }
1032
1033 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
1034 if (Res == CUDA_SUCCESS)
1035 Info.add("Concurrent Kernels", (bool)TmpInt);
1036
1037 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
1038 if (Res == CUDA_SUCCESS)
1039 Info.add("ECC Enabled", (bool)TmpInt);
1040
1041 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
1042 if (Res == CUDA_SUCCESS)
1043 Info.add("Memory Clock Rate", TmpInt, "kHz");
1044
1045 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
1046 if (Res == CUDA_SUCCESS)
1047 Info.add("Memory Bus Width", TmpInt, "bits");
1048
1049 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
1050 if (Res == CUDA_SUCCESS)
1051 Info.add("L2 Cache Size", TmpInt, "bytes");
1052
1053 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1054 TmpInt);
1055 if (Res == CUDA_SUCCESS)
1056 Info.add("Max Threads Per SMP", TmpInt);
1057
1058 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
1059 if (Res == CUDA_SUCCESS)
1060 Info.add("Async Engines", TmpInt);
1061
1062 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
1063 if (Res == CUDA_SUCCESS)
1064 Info.add("Unified Addressing", (bool)TmpInt);
1065
1066 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
1067 if (Res == CUDA_SUCCESS)
1068 Info.add("Managed Memory", (bool)TmpInt);
1069
1070 Res =
1071 getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
1072 if (Res == CUDA_SUCCESS)
1073 Info.add("Concurrent Managed Memory", (bool)TmpInt);
1074
1075 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
1076 TmpInt);
1077 if (Res == CUDA_SUCCESS)
1078 Info.add("Preemption Supported", (bool)TmpInt);
1079
1080 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
1081 if (Res == CUDA_SUCCESS)
1082 Info.add("Cooperative Launch", (bool)TmpInt);
1083
1084 Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
1085 if (Res == CUDA_SUCCESS)
1086 Info.add("Multi-Device Boars", (bool)TmpInt);
1087
1088 Info.add("Compute Capabilities", ComputeCapability.str());
1089
1090 return Plugin::success();
1091 }
1092
1093 virtual bool shouldSetupDeviceMemoryPool() const override {
1094 /// We use the CUDA malloc for now.
1095 return false;
1096 }
1097
1098 /// Getters and setters for stack and heap sizes.
1099 Error getDeviceStackSize(uint64_t &Value) override {
1100 return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
1101 }
1102 Error setDeviceStackSize(uint64_t Value) override {
1103 return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
1104 }
1105 Error getDeviceHeapSize(uint64_t &Value) override {
1106 return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
1107 }
1108 Error setDeviceHeapSize(uint64_t Value) override {
1109 return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
1110 }
1111 Error getDeviceMemorySize(uint64_t &Value) override {
1112 CUresult Res = cuDeviceTotalMem(&Value, Device);
1113 return Plugin::check(Res, "Error in getDeviceMemorySize %s");
1114 }
1115
1116 /// CUDA-specific functions for getting and setting context limits.
1117 Error setCtxLimit(CUlimit Kind, uint64_t Value) {
1118 CUresult Res = cuCtxSetLimit(Kind, Value);
1119 return Plugin::check(Res, "Error in cuCtxSetLimit: %s");
1120 }
1121 Error getCtxLimit(CUlimit Kind, uint64_t &Value) {
1122 CUresult Res = cuCtxGetLimit(&Value, Kind);
1123 return Plugin::check(Res, "Error in cuCtxGetLimit: %s");
1124 }
1125
1126 /// CUDA-specific function to get device attributes.
1127 Error getDeviceAttr(uint32_t Kind, uint32_t &Value) {
1128 // TODO: Warn if the new value is larger than the old.
1129 CUresult Res =
1130 cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
1131 return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s");
1132 }
1133
1134 CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
1135 return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
1136 }
1137
1138 /// See GenericDeviceTy::getComputeUnitKind().
1139 std::string getComputeUnitKind() const override {
1140 return ComputeCapability.str();
1141 }
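  // Illustrative example with assumed values: a device reporting compute
  // capability major 8 and minor 6 yields "sm_86" here, which also becomes
  // the '--gpu-name' argument passed to 'ptxas' in doJITPostProcessing above.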
1142
1143 /// Returns the clock frequency for the given NVPTX device.
1144 uint64_t getClockFrequency() const override { return 1000000000; }
1145
1146private:
1147 using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
1148 using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
1149
1150 Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
1151 bool IsCtor) {
1152 const char *KernelName = IsCtor ? "nvptx$device$init" : "nvptx$device$fini";
1153 // Perform a quick check for the named kernel in the image. The kernel
1154 // should be created by the 'nvptx-lower-ctor-dtor' pass.
1155 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
1156 if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
1157 return Plugin::success();
1158
1159 // The Nvidia backend cannot handle creating the ctor / dtor array
1160 // automatically so we must create it ourselves. The backend will emit
1161 // several globals that contain function pointers we can call. These are
1162 // prefixed with a known name due to Nvidia's lack of section support.
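    // For illustration only (hypothetical symbol name): a constructor emitted
    // by that pass might appear as '__init_array_object_foo_cpp_101', and the
    // trailing '101' is parsed below as its priority.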
1163 auto ELFObjOrErr = Handler.getELFObjectFile(Image);
1164 if (!ELFObjOrErr)
1165 return ELFObjOrErr.takeError();
1166
1167 // Search for all symbols that contain a constructor or destructor.
1168 SmallVector<std::pair<StringRef, uint16_t>> Funcs;
1169 for (ELFSymbolRef Sym : (*ELFObjOrErr)->symbols()) {
1170 auto NameOrErr = Sym.getName();
1171 if (!NameOrErr)
1172 return NameOrErr.takeError();
1173
1174 if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
1175 : "__fini_array_object_"))
1176 continue;
1177
1178 uint16_t Priority;
1179 if (NameOrErr->rsplit('_').second.getAsInteger(10, Priority))
1180 return Plugin::error("Invalid priority for constructor or destructor");
1181
1182 Funcs.emplace_back(*NameOrErr, Priority);
1183 }
1184
1185 // Sort the created array to be in priority order.
1186 llvm::sort(Funcs, [=](auto X, auto Y) { return X.second < Y.second; });
1187
1188 // Allocate a buffer to store all of the known constructor / destructor
1189 // functions in so we can iterate them on the device.
1190 void *Buffer =
1191 allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE);
1192 if (!Buffer)
1193 return Plugin::error("Failed to allocate memory for global buffer");
1194
1195 auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
1196 auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();
1197
1198 SmallVector<void *> FunctionPtrs(Funcs.size());
1199 std::size_t Idx = 0;
1200 for (auto [Name, Priority] : Funcs) {
1201 GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]);
1202 if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
1203 return Err;
1204 }
1205
1206 // Copy the local buffer to the device.
1207 if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(),
1208 FunctionPtrs.size() * sizeof(void *), nullptr))
1209 return Err;
1210
1211 // Copy the created buffer to the appropriate symbols so the kernel can
1212 // iterate through them.
1213 GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
1214 sizeof(void *), &GlobalPtrStart);
1215 if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
1216 return Err;
1217
1218 GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
1219 sizeof(void *), &GlobalPtrStop);
1220 if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
1221 return Err;
1222
1223 CUDAKernelTy CUDAKernel(KernelName);
1224
1225 if (auto Err = CUDAKernel.init(*this, Image))
1226 return Err;
1227
1228 AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
1229
1230 KernelArgsTy KernelArgs = {};
1231 if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
1232 /*NumBlocks=*/1ul, KernelArgs, nullptr,
1233 AsyncInfoWrapper))
1234 return Err;
1235
1236 Error Err = Plugin::success();
1237 AsyncInfoWrapper.finalize(Err);
1238
1239 if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
1240 return Plugin::error("Failed to free memory for global buffer");
1241
1242 return Err;
1243 }
1244
1245 /// Stream manager for CUDA streams.
1246 CUDAStreamManagerTy CUDAStreamManager;
1247
1248 /// Event manager for CUDA events.
1249 CUDAEventManagerTy CUDAEventManager;
1250
1251 /// The device's context. This context should be set before performing
1252 /// operations on the device.
1253 CUcontext Context = nullptr;
1254
1255 /// The CUDA device handler.
1256 CUdevice Device = CU_DEVICE_INVALID;
1257
1258 /// The memory mapped addresses and their handles
1259 std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;
1260
1261 /// The compute capability of the corresponding CUDA device.
1262 struct ComputeCapabilityTy {
1263 uint32_t Major;
1264 uint32_t Minor;
1265 std::string str() const {
1266 return "sm_" + std::to_string(val: Major * 10 + Minor);
1267 }
1268 } ComputeCapability;
1269
1270 /// The maximum number of warps that can be resident on all the SMs
1271 /// simultaneously.
1272 uint32_t HardwareParallelism = 0;
1273};
1274
1275Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
1276 uint32_t NumThreads, uint64_t NumBlocks,
1277 KernelArgsTy &KernelArgs, void *Args,
1278 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
1279 CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
1280
1281 CUstream Stream;
1282 if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
1283 return Err;
1284
1285 uint32_t MaxDynCGroupMem =
1286 std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
1287
1288 CUresult Res =
1289 cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
1290 /*gridDimZ=*/1, NumThreads,
1291 /*blockDimY=*/1, /*blockDimZ=*/1, MaxDynCGroupMem, Stream,
1292 (void **)Args, nullptr);
1293 return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
1294}
1295
1296/// Class implementing the CUDA-specific functionalities of the global handler.
1297class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
1298public:
1299 /// Get the metadata of a global from the device. The name and size of the
1300 /// global is read from DeviceGlobal and the address of the global is written
1301 /// to DeviceGlobal.
1302 Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
1303 DeviceImageTy &Image,
1304 GlobalTy &DeviceGlobal) override {
1305 CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
1306
1307 const char *GlobalName = DeviceGlobal.getName().data();
1308
1309 size_t CUSize;
1310 CUdeviceptr CUPtr;
1311 CUresult Res =
1312 cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
1313 if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s",
1314 GlobalName))
1315 return Err;
1316
1317 if (CUSize != DeviceGlobal.getSize())
1318 return Plugin::error(
1319 "Failed to load global '%s' due to size mismatch (%zu != %zu)",
1320 GlobalName, CUSize, (size_t)DeviceGlobal.getSize());
1321
1322 DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
1323 return Plugin::success();
1324 }
1325};
1326
1327/// Class implementing the CUDA-specific functionalities of the plugin.
1328struct CUDAPluginTy final : public GenericPluginTy {
1329 /// Create a CUDA plugin.
1330 CUDAPluginTy() : GenericPluginTy(getTripleArch()) {}
1331
1332 /// This class should not be copied.
1333 CUDAPluginTy(const CUDAPluginTy &) = delete;
1334 CUDAPluginTy(CUDAPluginTy &&) = delete;
1335
1336 /// Initialize the plugin and return the number of devices.
1337 Expected<int32_t> initImpl() override {
1338 CUresult Res = cuInit(0);
1339 if (Res == CUDA_ERROR_INVALID_HANDLE) {
1340 // Cannot call cuGetErrorString if dlsym failed.
1341 DP("Failed to load CUDA shared library\n");
1342 return 0;
1343 }
1344
1345#ifdef OMPT_SUPPORT
1346 ompt::connectLibrary();
1347#endif
1348
1349 if (Res == CUDA_ERROR_NO_DEVICE) {
1350 // Do not initialize if there are no devices.
1351 DP("There are no devices supporting CUDA.\n");
1352 return 0;
1353 }
1354
1355 if (auto Err = Plugin::check(Res, "Error in cuInit: %s"))
1356 return std::move(Err);
1357
1358 // Get the number of devices.
1359 int NumDevices;
1360 Res = cuDeviceGetCount(&NumDevices);
1361 if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s"))
1362 return std::move(Err);
1363
1364 // Do not initialize if there are no devices.
1365 if (NumDevices == 0)
1366 DP("There are no devices supporting CUDA.\n");
1367
1368 return NumDevices;
1369 }
1370
1371 /// Deinitialize the plugin.
1372 Error deinitImpl() override { return Plugin::success(); }
1373
1374 /// Creates a CUDA device to use for offloading.
1375 GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
1376 int32_t NumDevices) override {
1377 return new CUDADeviceTy(Plugin, DeviceId, NumDevices);
1378 }
1379
1380 /// Creates a CUDA global handler.
1381 GenericGlobalHandlerTy *createGlobalHandler() override {
1382 return new CUDAGlobalHandlerTy();
1383 }
1384
1385 /// Get the ELF code for recognizing the compatible image binary.
1386 uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }
1387
1388 Triple::ArchType getTripleArch() const override {
1389 // TODO: I think we can drop the support for 32-bit NVPTX devices.
1390 return Triple::nvptx64;
1391 }
1392
1393 /// Check whether the image is compatible with the available CUDA devices.
1394 Expected<bool> isELFCompatible(StringRef Image) const override {
1395 auto ElfOrErr =
1396 ELF64LEObjectFile::create(MemoryBufferRef(Image, /*Identifier=*/""),
1397 /*InitContent=*/false);
1398 if (!ElfOrErr)
1399 return ElfOrErr.takeError();
1400
1401 // Get the numeric value for the image's `sm_` value.
1402 auto SM = ElfOrErr->getPlatformFlags() & ELF::EF_CUDA_SM;
1403
1404 for (int32_t DevId = 0; DevId < getNumDevices(); ++DevId) {
1405 CUdevice Device;
1406 CUresult Res = cuDeviceGet(&Device, DevId);
1407 if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
1408 return std::move(Err);
1409
1410 int32_t Major, Minor;
1411 Res = cuDeviceGetAttribute(
1412 &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
1413 if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
1414 return std::move(Err);
1415
1416 Res = cuDeviceGetAttribute(
1417 &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
1418 if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
1419 return std::move(Err);
1420
1421 int32_t ImageMajor = SM / 10;
1422 int32_t ImageMinor = SM % 10;
1423
1424 // A cubin generated for a certain compute capability is supported to
1425 // run on any GPU with the same major revision and same or higher minor
1426 // revision.
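      // For example: an image built for sm_70 (major 7, minor 0) is accepted
      // on an sm_75 device, while an sm_75 image is rejected on an sm_70
      // device.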
1427 if (Major != ImageMajor || Minor < ImageMinor)
1428 return false;
1429 }
1430 return true;
1431 }
1432};
1433
1434Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
1435 GenericDeviceTy &DstGenericDevice,
1436 void *DstPtr, int64_t Size,
1437 AsyncInfoWrapperTy &AsyncInfoWrapper) {
1438 if (auto Err = setContext())
1439 return Err;
1440
1441 CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);
1442
1443 CUresult Res;
1444 int32_t DstDeviceId = DstDevice.DeviceId;
1445 CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
1446 CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;
1447
1448 int CanAccessPeer = 0;
1449 if (DeviceId != DstDeviceId) {
1450 // Make sure the lock is released before performing the copies.
1451 std::lock_guard<std::mutex> Lock(PeerAccessesLock);
1452
1453 switch (PeerAccesses[DstDeviceId]) {
1454 case PeerAccessState::AVAILABLE:
1455 CanAccessPeer = 1;
1456 break;
1457 case PeerAccessState::UNAVAILABLE:
1458 CanAccessPeer = 0;
1459 break;
1460 case PeerAccessState::PENDING:
1461 // Check whether the source device can access the destination device.
1462 Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
1463 if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s"))
1464 return Err;
1465
1466 if (CanAccessPeer) {
1467 Res = cuCtxEnablePeerAccess(DstDevice.Context, 0);
1468 if (Res == CUDA_ERROR_TOO_MANY_PEERS) {
1469 // Resources may be exhausted due to many P2P links.
1470 CanAccessPeer = 0;
1471 DP("Too many P2P so fall back to D2D memcpy");
1472 } else if (auto Err =
1473 Plugin::check(Res, "Error in cuCtxEnablePeerAccess: %s"))
1474 return Err;
1475 }
1476 PeerAccesses[DstDeviceId] = (CanAccessPeer)
1477 ? PeerAccessState::AVAILABLE
1478 : PeerAccessState::UNAVAILABLE;
1479 }
1480 }
1481
1482 CUstream Stream;
1483 if (auto Err = getStream(AsyncInfoWrapper, Stream))
1484 return Err;
1485
1486 if (CanAccessPeer) {
1487 // TODO: Should we fallback to D2D if peer access fails?
1488 Res = cuMemcpyPeerAsync(CUDstPtr, Context, CUSrcPtr, DstDevice.Context,
1489 Size, Stream);
1490 return Plugin::check(Res, "Error in cuMemcpyPeerAsync: %s");
1491 }
1492
1493 // Fallback to D2D copy.
1494 Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream);
1495 return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
1496}
1497
1498GenericPluginTy *PluginTy::createPlugin() { return new CUDAPluginTy(); }
1499
1500template <typename... ArgsTy>
1501static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
1502 CUresult ResultCode = static_cast<CUresult>(Code);
1503 if (ResultCode == CUDA_SUCCESS)
1504 return Error::success();
1505
1506 const char *Desc = "Unknown error";
1507 CUresult Ret = cuGetErrorString(ResultCode, &Desc);
1508 if (Ret != CUDA_SUCCESS)
1509 REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
1510
1511 return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
1512 ErrFmt, Args..., Desc);
1513}
1514
1515} // namespace plugin
1516} // namespace target
1517} // namespace omp
1518} // namespace llvm
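
// Illustrative usage sketch (not part of the plugin): a minimal host program
// that, when compiled with OpenMP offloading for nvptx64, exercises the
// loadBinaryImpl, dataSubmitImpl, launchImpl, dataRetrieveImpl, and
// synchronizeImpl paths implemented above.
//
//   #include <cstdio>
//
//   int main() {
//     int X = 0;
//   #pragma omp target map(tofrom : X)
//     X = 42;
//     std::printf("X = %d\n", X); // Expected to print: X = 42
//     return 0;
//   }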
1519
