//===--- cuda/dynamic_cuda/cuda.cpp ------------------------------ C++ -*-===//
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
// Implements a subset of the CUDA API by calling into the CUDA library via
// dlopen. The dlopen/dlsym calls are done as part of the call to cuInit.
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "llvm/Support/DynamicLibrary.h" |
15 | |
16 | #include "Shared/Debug.h" |
17 | |
18 | #include "DLWrap.h" |
19 | #include "cuda.h" |
20 | |
21 | #include <memory> |
22 | #include <string> |
23 | #include <unordered_map> |
24 | |
25 | DLWRAP_INITIALIZE() |
26 | |
27 | DLWRAP_INTERNAL(cuInit, 1) |
28 | |
29 | DLWRAP(cuCtxGetDevice, 1) |
30 | DLWRAP(cuDeviceGet, 2) |
31 | DLWRAP(cuDeviceGetAttribute, 3) |
32 | DLWRAP(cuDeviceGetCount, 1) |
33 | DLWRAP(cuFuncGetAttribute, 3) |
34 | |
35 | // Device info |
36 | DLWRAP(cuDeviceGetName, 3) |
37 | DLWRAP(cuDeviceTotalMem, 2) |
38 | DLWRAP(cuDriverGetVersion, 1) |
39 | |
40 | DLWRAP(cuGetErrorString, 2) |
41 | DLWRAP(cuLaunchKernel, 11) |
42 | DLWRAP(cuLaunchHostFunc, 3) |
43 | |
44 | DLWRAP(cuMemAlloc, 2) |
45 | DLWRAP(cuMemAllocHost, 2) |
46 | DLWRAP(cuMemAllocManaged, 3) |
47 | DLWRAP(cuMemAllocAsync, 3) |
48 | |
49 | DLWRAP(cuMemcpyDtoDAsync, 4) |
50 | DLWRAP(cuMemcpyDtoH, 3) |
51 | DLWRAP(cuMemcpyDtoHAsync, 4) |
52 | DLWRAP(cuMemcpyHtoD, 3) |
53 | DLWRAP(cuMemcpyHtoDAsync, 4) |
54 | |
55 | DLWRAP(cuMemFree, 1) |
56 | DLWRAP(cuMemFreeHost, 1) |
57 | DLWRAP(cuMemFreeAsync, 2) |
58 | |
59 | DLWRAP(cuModuleGetFunction, 3) |
60 | DLWRAP(cuModuleGetGlobal, 4) |
61 | |
62 | DLWRAP(cuModuleUnload, 1) |
63 | DLWRAP(cuStreamCreate, 2) |
64 | DLWRAP(cuStreamDestroy, 1) |
65 | DLWRAP(cuStreamSynchronize, 1) |
66 | DLWRAP(cuStreamQuery, 1) |
67 | DLWRAP(cuStreamAddCallback, 4) |
68 | DLWRAP(cuCtxSetCurrent, 1) |
69 | DLWRAP(cuDevicePrimaryCtxRelease, 1) |
70 | DLWRAP(cuDevicePrimaryCtxGetState, 3) |
71 | DLWRAP(cuDevicePrimaryCtxSetFlags, 2) |
72 | DLWRAP(cuDevicePrimaryCtxRetain, 2) |
73 | DLWRAP(cuModuleLoadDataEx, 5) |
74 | |
75 | DLWRAP(cuDeviceCanAccessPeer, 3) |
76 | DLWRAP(cuCtxEnablePeerAccess, 2) |
77 | DLWRAP(cuMemcpyPeerAsync, 6) |
78 | |
79 | DLWRAP(cuCtxGetLimit, 2) |
80 | DLWRAP(cuCtxSetLimit, 2) |
81 | |
82 | DLWRAP(cuEventCreate, 2) |
83 | DLWRAP(cuEventRecord, 2) |
84 | DLWRAP(cuStreamWaitEvent, 3) |
85 | DLWRAP(cuEventSynchronize, 1) |
86 | DLWRAP(cuEventDestroy, 1) |
87 | |
88 | DLWRAP_FINALIZE() |
89 | |
90 | DLWRAP(cuMemUnmap, 2) |
91 | DLWRAP(cuMemRelease, 1) |
92 | DLWRAP(cuMemAddressFree, 2) |
93 | DLWRAP(cuMemGetInfo, 2) |
94 | DLWRAP(cuMemAddressReserve, 5) |
95 | DLWRAP(cuMemMap, 5) |
96 | DLWRAP(cuMemCreate, 4) |
97 | DLWRAP(cuMemSetAccess, 4) |
98 | DLWRAP(cuMemGetAllocationGranularity, 3) |
99 | |
// Fallback configuration macros. Each one is only defined here when no
// earlier definition (for example from the build system) exists.

// Library name passed to the dynamic loader in checkForCUDA().
#if !defined(DYNAMIC_CUDA_PATH)
#define DYNAMIC_CUDA_PATH "libcuda.so"
#endif

// Target name spliced into DEBUG_PREFIX below.
#if !defined(TARGET_NAME)
#define TARGET_NAME CUDA
#endif

// Presumably the prefix picked up by the DP() debug macro from
// Shared/Debug.h -- confirm against that header.
#if !defined(DEBUG_PREFIX)
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
#endif
110 | |
111 | static bool checkForCUDA() { |
112 | // return true if dlopen succeeded and all functions found |
113 | |
114 | // Prefer _v2 versions of functions if found in the library |
115 | std::unordered_map<std::string, const char *> TryFirst = { |
116 | {"cuMemAlloc" , "cuMemAlloc_v2" }, |
117 | {"cuMemFree" , "cuMemFree_v2" }, |
118 | {"cuMemcpyDtoH" , "cuMemcpyDtoH_v2" }, |
119 | {"cuMemcpyHtoD" , "cuMemcpyHtoD_v2" }, |
120 | {"cuStreamDestroy" , "cuStreamDestroy_v2" }, |
121 | {"cuModuleGetGlobal" , "cuModuleGetGlobal_v2" }, |
122 | {"cuMemcpyDtoHAsync" , "cuMemcpyDtoHAsync_v2" }, |
123 | {"cuMemcpyDtoDAsync" , "cuMemcpyDtoDAsync_v2" }, |
124 | {"cuMemcpyHtoDAsync" , "cuMemcpyHtoDAsync_v2" }, |
125 | {"cuDevicePrimaryCtxRelease" , "cuDevicePrimaryCtxRelease_v2" }, |
126 | {"cuDevicePrimaryCtxSetFlags" , "cuDevicePrimaryCtxSetFlags_v2" }, |
127 | }; |
128 | |
129 | const char *CudaLib = DYNAMIC_CUDA_PATH; |
130 | std::string ErrMsg; |
131 | auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>( |
132 | llvm::sys::DynamicLibrary::getPermanentLibrary(CudaLib, &ErrMsg)); |
133 | if (!DynlibHandle->isValid()) { |
134 | DP("Unable to load library '%s': %s!\n" , CudaLib, ErrMsg.c_str()); |
135 | return false; |
136 | } |
137 | |
138 | for (size_t I = 0; I < dlwrap::size(); I++) { |
139 | const char *Sym = dlwrap::symbol(I); |
140 | |
141 | auto It = TryFirst.find(Sym); |
142 | if (It != TryFirst.end()) { |
143 | const char *First = It->second; |
144 | void *P = DynlibHandle->getAddressOfSymbol(First); |
145 | if (P) { |
146 | DP("Implementing %s with dlsym(%s) -> %p\n" , Sym, First, P); |
147 | *dlwrap::pointer(I) = P; |
148 | continue; |
149 | } |
150 | } |
151 | |
152 | void *P = DynlibHandle->getAddressOfSymbol(Sym); |
153 | if (P == nullptr) { |
154 | DP("Unable to find '%s' in '%s'!\n" , Sym, CudaLib); |
155 | return false; |
156 | } |
157 | DP("Implementing %s with dlsym(%s) -> %p\n" , Sym, Sym, P); |
158 | |
159 | *dlwrap::pointer(I) = P; |
160 | } |
161 | |
162 | return true; |
163 | } |
164 | |
165 | CUresult cuInit(unsigned X) { |
166 | // Note: Called exactly once from cuda rtl.cpp in a global constructor so |
167 | // does not need to handle being called repeatedly or concurrently |
168 | if (!checkForCUDA()) { |
169 | return CUDA_ERROR_INVALID_HANDLE; |
170 | } |
171 | return dlwrap_cuInit(X); |
172 | } |
173 | |