1 | //===--- cuda/dynamic_cuda/cuda.cpp ------------------------------- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Implements a subset of the CUDA API by calling into the CUDA library via dlopen. |
10 | // The dlopen/dlsym calls are performed as part of the call to cuInit. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "llvm/Support/DynamicLibrary.h" |
15 | |
16 | #include "Shared/Debug.h" |
17 | |
18 | #include "DLWrap.h" |
19 | #include "cuda.h" |
20 | |
21 | #include <memory> |
22 | #include <string> |
23 | #include <unordered_map> |
24 | |
25 | DLWRAP_INITIALIZE() |
26 | |
27 | DLWRAP_INTERNAL(cuInit, 1) |
28 | |
29 | DLWRAP(cuCtxGetDevice, 1) |
30 | DLWRAP(cuDeviceGet, 2) |
31 | DLWRAP(cuDeviceGetAttribute, 3) |
32 | DLWRAP(cuDeviceGetCount, 1) |
33 | DLWRAP(cuFuncGetAttribute, 3) |
34 | |
35 | // Device info |
36 | DLWRAP(cuDeviceGetName, 3) |
37 | DLWRAP(cuDeviceTotalMem, 2) |
38 | DLWRAP(cuDriverGetVersion, 1) |
39 | |
40 | DLWRAP(cuGetErrorString, 2) |
41 | DLWRAP(cuLaunchKernel, 11) |
42 | |
43 | DLWRAP(cuMemAlloc, 2) |
44 | DLWRAP(cuMemAllocHost, 2) |
45 | DLWRAP(cuMemAllocManaged, 3) |
46 | DLWRAP(cuMemAllocAsync, 3) |
47 | |
48 | DLWRAP(cuMemcpyDtoDAsync, 4) |
49 | DLWRAP(cuMemcpyDtoH, 3) |
50 | DLWRAP(cuMemcpyDtoHAsync, 4) |
51 | DLWRAP(cuMemcpyHtoD, 3) |
52 | DLWRAP(cuMemcpyHtoDAsync, 4) |
53 | |
54 | DLWRAP(cuMemFree, 1) |
55 | DLWRAP(cuMemFreeHost, 1) |
56 | DLWRAP(cuMemFreeAsync, 2) |
57 | |
58 | DLWRAP(cuModuleGetFunction, 3) |
59 | DLWRAP(cuModuleGetGlobal, 4) |
60 | |
61 | DLWRAP(cuModuleUnload, 1) |
62 | DLWRAP(cuStreamCreate, 2) |
63 | DLWRAP(cuStreamDestroy, 1) |
64 | DLWRAP(cuStreamSynchronize, 1) |
65 | DLWRAP(cuStreamQuery, 1) |
66 | DLWRAP(cuCtxSetCurrent, 1) |
67 | DLWRAP(cuDevicePrimaryCtxRelease, 1) |
68 | DLWRAP(cuDevicePrimaryCtxGetState, 3) |
69 | DLWRAP(cuDevicePrimaryCtxSetFlags, 2) |
70 | DLWRAP(cuDevicePrimaryCtxRetain, 2) |
71 | DLWRAP(cuModuleLoadDataEx, 5) |
72 | |
73 | DLWRAP(cuDeviceCanAccessPeer, 3) |
74 | DLWRAP(cuCtxEnablePeerAccess, 2) |
75 | DLWRAP(cuMemcpyPeerAsync, 6) |
76 | |
77 | DLWRAP(cuCtxGetLimit, 2) |
78 | DLWRAP(cuCtxSetLimit, 2) |
79 | |
80 | DLWRAP(cuEventCreate, 2) |
81 | DLWRAP(cuEventRecord, 2) |
82 | DLWRAP(cuStreamWaitEvent, 3) |
83 | DLWRAP(cuEventSynchronize, 1) |
84 | DLWRAP(cuEventDestroy, 1) |
85 | |
86 | DLWRAP_FINALIZE() |
87 | |
88 | DLWRAP(cuMemUnmap, 2) |
89 | DLWRAP(cuMemRelease, 1) |
90 | DLWRAP(cuMemAddressFree, 2) |
91 | DLWRAP(cuMemGetInfo, 2) |
92 | DLWRAP(cuMemAddressReserve, 5) |
93 | DLWRAP(cuMemMap, 5) |
94 | DLWRAP(cuMemCreate, 4) |
95 | DLWRAP(cuMemSetAccess, 4) |
96 | DLWRAP(cuMemGetAllocationGranularity, 3) |
97 | |
// Library name passed to the dynamic loader in checkForCUDA(). A build may
// predefine DYNAMIC_CUDA_PATH to point at a different library.
#if !defined(DYNAMIC_CUDA_PATH)
#define DYNAMIC_CUDA_PATH "libcuda.so"
#endif

// Default target name, used to build the debug prefix below.
#if !defined(TARGET_NAME)
#define TARGET_NAME CUDA
#endif

// Prefix for debug messages; presumably consumed by the DP() macro from
// Shared/Debug.h - verify there.
#if !defined(DEBUG_PREFIX)
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
#endif
108 | |
109 | static bool checkForCUDA() { |
110 | // return true if dlopen succeeded and all functions found |
111 | |
112 | // Prefer _v2 versions of functions if found in the library |
113 | std::unordered_map<std::string, const char *> TryFirst = { |
114 | {"cuMemAlloc" , "cuMemAlloc_v2" }, |
115 | {"cuMemFree" , "cuMemFree_v2" }, |
116 | {"cuMemcpyDtoH" , "cuMemcpyDtoH_v2" }, |
117 | {"cuMemcpyHtoD" , "cuMemcpyHtoD_v2" }, |
118 | {"cuStreamDestroy" , "cuStreamDestroy_v2" }, |
119 | {"cuModuleGetGlobal" , "cuModuleGetGlobal_v2" }, |
120 | {"cuMemcpyDtoHAsync" , "cuMemcpyDtoHAsync_v2" }, |
121 | {"cuMemcpyDtoDAsync" , "cuMemcpyDtoDAsync_v2" }, |
122 | {"cuMemcpyHtoDAsync" , "cuMemcpyHtoDAsync_v2" }, |
123 | {"cuDevicePrimaryCtxRelease" , "cuDevicePrimaryCtxRelease_v2" }, |
124 | {"cuDevicePrimaryCtxSetFlags" , "cuDevicePrimaryCtxSetFlags_v2" }, |
125 | }; |
126 | |
127 | const char *CudaLib = DYNAMIC_CUDA_PATH; |
128 | std::string ErrMsg; |
129 | auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>( |
130 | llvm::sys::DynamicLibrary::getPermanentLibrary(CudaLib, &ErrMsg)); |
131 | if (!DynlibHandle->isValid()) { |
132 | DP("Unable to load library '%s': %s!\n" , CudaLib, ErrMsg.c_str()); |
133 | return false; |
134 | } |
135 | |
136 | for (size_t I = 0; I < dlwrap::size(); I++) { |
137 | const char *Sym = dlwrap::symbol(I); |
138 | |
139 | auto It = TryFirst.find(Sym); |
140 | if (It != TryFirst.end()) { |
141 | const char *First = It->second; |
142 | void *P = DynlibHandle->getAddressOfSymbol(First); |
143 | if (P) { |
144 | DP("Implementing %s with dlsym(%s) -> %p\n" , Sym, First, P); |
145 | *dlwrap::pointer(I) = P; |
146 | continue; |
147 | } |
148 | } |
149 | |
150 | void *P = DynlibHandle->getAddressOfSymbol(Sym); |
151 | if (P == nullptr) { |
152 | DP("Unable to find '%s' in '%s'!\n" , Sym, CudaLib); |
153 | return false; |
154 | } |
155 | DP("Implementing %s with dlsym(%s) -> %p\n" , Sym, Sym, P); |
156 | |
157 | *dlwrap::pointer(I) = P; |
158 | } |
159 | |
160 | return true; |
161 | } |
162 | |
163 | CUresult cuInit(unsigned X) { |
164 | // Note: Called exactly once from cuda rtl.cpp in a global constructor so |
165 | // does not need to handle being called repeatedly or concurrently |
166 | if (!checkForCUDA()) { |
167 | return CUDA_ERROR_INVALID_HANDLE; |
168 | } |
169 | return dlwrap_cuInit(X); |
170 | } |
171 | |