1 | //===-- Loader Implementation for NVPTX devices --------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
// This file implements a simple loader to run images supporting the NVPTX
10 | // architecture. The file launches the '_start' kernel which should be provided |
// by the device application start code and will ultimately call the 'main'
12 | // function. |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #include "Loader.h" |
17 | |
18 | #include "cuda.h" |
19 | |
20 | #include "llvm/Object/ELF.h" |
21 | #include "llvm/Object/ELFObjectFile.h" |
22 | |
23 | #include <cstddef> |
24 | #include <cstdio> |
25 | #include <cstdlib> |
26 | #include <cstring> |
27 | #include <vector> |
28 | |
29 | using namespace llvm; |
30 | using namespace object; |
31 | |
32 | static void handle_error(CUresult err) { |
33 | if (err == CUDA_SUCCESS) |
34 | return; |
35 | |
36 | const char *err_str = nullptr; |
37 | CUresult result = cuGetErrorString(err, &err_str); |
38 | if (result != CUDA_SUCCESS) |
39 | fprintf(stderr, format: "Unknown Error\n" ); |
40 | else |
41 | fprintf(stderr, format: "%s\n" , err_str); |
42 | exit(status: 1); |
43 | } |
44 | |
45 | // Gets the names of all the globals that contain functions to initialize or |
46 | // deinitialize. We need to do this manually because the NVPTX toolchain does |
47 | // not contain the necessary binary manipulation tools. |
48 | template <typename Alloc> |
49 | Expected<void *> get_ctor_dtor_array(const void *image, const size_t size, |
50 | Alloc allocator, CUmodule binary) { |
51 | auto mem_buffer = MemoryBuffer::getMemBuffer( |
52 | InputData: StringRef(reinterpret_cast<const char *>(image), size), BufferName: "image" , |
53 | /*RequiresNullTerminator=*/false); |
54 | Expected<ELF64LEObjectFile> elf_or_err = |
55 | ELF64LEObjectFile::create(Object: *mem_buffer); |
56 | if (!elf_or_err) |
57 | handle_error(toString(E: elf_or_err.takeError()).c_str()); |
58 | |
59 | std::vector<std::pair<const char *, uint16_t>> ctors; |
60 | std::vector<std::pair<const char *, uint16_t>> dtors; |
61 | // CUDA has no way to iterate over all the symbols so we need to inspect the |
62 | // ELF directly using the LLVM libraries. |
63 | for (const auto &symbol : elf_or_err->symbols()) { |
64 | auto name_or_err = symbol.getName(); |
65 | if (!name_or_err) |
66 | handle_error(toString(E: name_or_err.takeError()).c_str()); |
67 | |
68 | // Search for all symbols that contain a constructor or destructor. |
69 | if (!name_or_err->starts_with(Prefix: "__init_array_object_" ) && |
70 | !name_or_err->starts_with(Prefix: "__fini_array_object_" )) |
71 | continue; |
72 | |
73 | uint16_t priority; |
74 | if (name_or_err->rsplit(Separator: '_').second.getAsInteger(Radix: 10, Result&: priority)) |
75 | handle_error("Invalid priority for constructor or destructor" ); |
76 | |
77 | if (name_or_err->starts_with(Prefix: "__init" )) |
78 | ctors.emplace_back(args: std::make_pair(x: name_or_err->data(), y&: priority)); |
79 | else |
80 | dtors.emplace_back(args: std::make_pair(x: name_or_err->data(), y&: priority)); |
81 | } |
82 | // Lower priority constructors are run before higher ones. The reverse is true |
83 | // for destructors. |
84 | llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; }); |
85 | llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; }); |
86 | |
87 | // Allocate host pinned memory to make these arrays visible to the GPU. |
88 | CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator( |
89 | ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr))); |
90 | uint64_t global_size = 0; |
91 | |
92 | // Get the address of the global and then store the address of the constructor |
93 | // function to call in the constructor array. |
94 | CUdeviceptr *dev_ctors_start = dev_memory; |
95 | CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size(); |
96 | for (uint64_t i = 0; i < ctors.size(); ++i) { |
97 | CUdeviceptr dev_ptr; |
98 | if (CUresult err = |
99 | cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first)) |
100 | handle_error(err); |
101 | if (CUresult err = |
102 | cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t))) |
103 | handle_error(err); |
104 | } |
105 | |
106 | // Get the address of the global and then store the address of the destructor |
107 | // function to call in the destructor array. |
108 | CUdeviceptr *dev_dtors_start = dev_ctors_end; |
109 | CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size(); |
110 | for (uint64_t i = 0; i < dtors.size(); ++i) { |
111 | CUdeviceptr dev_ptr; |
112 | if (CUresult err = |
113 | cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first)) |
114 | handle_error(err); |
115 | if (CUresult err = |
116 | cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t))) |
117 | handle_error(err); |
118 | } |
119 | |
120 | // Obtain the address of the pointers the startup implementation uses to |
121 | // iterate the constructors and destructors. |
122 | CUdeviceptr init_start; |
123 | if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary, |
124 | "__init_array_start" )) |
125 | handle_error(err); |
126 | CUdeviceptr init_end; |
127 | if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary, |
128 | "__init_array_end" )) |
129 | handle_error(err); |
130 | CUdeviceptr fini_start; |
131 | if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary, |
132 | "__fini_array_start" )) |
133 | handle_error(err); |
134 | CUdeviceptr fini_end; |
135 | if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary, |
136 | "__fini_array_end" )) |
137 | handle_error(err); |
138 | |
139 | // Copy the pointers to the newly written array to the symbols so the startup |
140 | // implementation can iterate them. |
141 | if (CUresult err = |
142 | cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t))) |
143 | handle_error(err); |
144 | if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t))) |
145 | handle_error(err); |
146 | if (CUresult err = |
147 | cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t))) |
148 | handle_error(err); |
149 | if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t))) |
150 | handle_error(err); |
151 | |
152 | return dev_memory; |
153 | } |
154 | |
155 | template <typename args_t> |
156 | CUresult launch_kernel(CUmodule binary, CUstream stream, |
157 | rpc_device_t rpc_device, const LaunchParameters ¶ms, |
158 | const char *kernel_name, args_t kernel_args) { |
159 | // look up the '_start' kernel in the loaded module. |
160 | CUfunction function; |
161 | if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) |
162 | handle_error(err); |
163 | |
164 | // Set up the arguments to the '_start' kernel on the GPU. |
165 | uint64_t args_size = sizeof(args_t); |
166 | void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args, |
167 | CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size, |
168 | CU_LAUNCH_PARAM_END}; |
169 | |
170 | // Initialize a non-blocking CUDA stream to allocate memory if needed. This |
171 | // needs to be done on a separate stream or else it will deadlock with the |
172 | // executing kernel. |
173 | CUstream memory_stream; |
174 | if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING)) |
175 | handle_error(err); |
176 | |
177 | // Register RPC callbacks for the malloc and free functions on HSA. |
178 | register_rpc_callbacks<32>(rpc_device); |
179 | |
180 | rpc_register_callback( |
181 | rpc_device, RPC_MALLOC, |
182 | [](rpc_port_t port, void *data) { |
183 | auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void { |
184 | CUstream memory_stream = *static_cast<CUstream *>(data); |
185 | uint64_t size = buffer->data[0]; |
186 | CUdeviceptr dev_ptr; |
187 | if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream)) |
188 | handle_error(err); |
189 | |
190 | // Wait until the memory allocation is complete. |
191 | while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY) |
192 | ; |
193 | buffer->data[0] = static_cast<uintptr_t>(dev_ptr); |
194 | }; |
195 | rpc_recv_and_send(port, malloc_handler, data); |
196 | }, |
197 | &memory_stream); |
198 | rpc_register_callback( |
199 | rpc_device, RPC_FREE, |
200 | [](rpc_port_t port, void *data) { |
201 | auto free_handler = [](rpc_buffer_t *buffer, void *data) { |
202 | CUstream memory_stream = *static_cast<CUstream *>(data); |
203 | if (CUresult err = cuMemFreeAsync( |
204 | static_cast<CUdeviceptr>(buffer->data[0]), memory_stream)) |
205 | handle_error(err); |
206 | }; |
207 | rpc_recv_and_send(port, free_handler, data); |
208 | }, |
209 | &memory_stream); |
210 | |
211 | // Call the kernel with the given arguments. |
212 | if (CUresult err = cuLaunchKernel( |
213 | function, params.num_blocks_x, params.num_blocks_y, |
214 | params.num_blocks_z, params.num_threads_x, params.num_threads_y, |
215 | params.num_threads_z, 0, stream, nullptr, args_config)) |
216 | handle_error(err); |
217 | |
218 | // Wait until the kernel has completed execution on the device. Periodically |
219 | // check the RPC client for work to be performed on the server. |
220 | while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY) |
221 | if (rpc_status_t err = rpc_handle_server(rpc_device)) |
222 | handle_error(err); |
223 | |
224 | // Handle the server one more time in case the kernel exited with a pending |
225 | // send still in flight. |
226 | if (rpc_status_t err = rpc_handle_server(rpc_device)) |
227 | handle_error(err); |
228 | |
229 | return CUDA_SUCCESS; |
230 | } |
231 | |
232 | int load(int argc, char **argv, char **envp, void *image, size_t size, |
233 | const LaunchParameters ¶ms) { |
234 | if (CUresult err = cuInit(0)) |
235 | handle_error(err); |
236 | // Obtain the first device found on the system. |
237 | uint32_t device_id = 0; |
238 | CUdevice device; |
239 | if (CUresult err = cuDeviceGet(&device, device_id)) |
240 | handle_error(err); |
241 | |
242 | // Initialize the CUDA context and claim it for this execution. |
243 | CUcontext context; |
244 | if (CUresult err = cuDevicePrimaryCtxRetain(&context, device)) |
245 | handle_error(err); |
246 | if (CUresult err = cuCtxSetCurrent(context)) |
247 | handle_error(err); |
248 | |
249 | // Increase the stack size per thread. |
250 | // TODO: We should allow this to be passed in so only the tests that require a |
251 | // larger stack can specify it to save on memory usage. |
252 | if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024)) |
253 | handle_error(err); |
254 | |
255 | // Initialize a non-blocking CUDA stream to execute the kernel. |
256 | CUstream stream; |
257 | if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)) |
258 | handle_error(err); |
259 | |
260 | // Load the image into a CUDA module. |
261 | CUmodule binary; |
262 | if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr)) |
263 | handle_error(err); |
264 | |
265 | // Allocate pinned memory on the host to hold the pointer array for the |
266 | // copied argv and allow the GPU device to access it. |
267 | auto allocator = [&](uint64_t size) -> void * { |
268 | void *dev_ptr; |
269 | if (CUresult err = cuMemAllocHost(&dev_ptr, size)) |
270 | handle_error(err); |
271 | return dev_ptr; |
272 | }; |
273 | |
274 | auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary); |
275 | if (!memory_or_err) |
276 | handle_error(toString(memory_or_err.takeError()).c_str()); |
277 | |
278 | void *dev_argv = copy_argument_vector(argc, argv, allocator); |
279 | if (!dev_argv) |
280 | handle_error("Failed to allocate device argv" ); |
281 | |
282 | // Allocate pinned memory on the host to hold the pointer array for the |
283 | // copied environment array and allow the GPU device to access it. |
284 | void *dev_envp = copy_environment(envp, allocator); |
285 | if (!dev_envp) |
286 | handle_error("Failed to allocate device environment" ); |
287 | |
288 | // Allocate space for the return pointer and initialize it to zero. |
289 | CUdeviceptr dev_ret; |
290 | if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int))) |
291 | handle_error(err); |
292 | if (CUresult err = cuMemsetD32(dev_ret, 0, 1)) |
293 | handle_error(err); |
294 | |
295 | uint32_t warp_size = 32; |
296 | auto rpc_alloc = [](uint64_t size, void *) -> void * { |
297 | void *dev_ptr; |
298 | if (CUresult err = cuMemAllocHost(&dev_ptr, size)) |
299 | handle_error(err); |
300 | return dev_ptr; |
301 | }; |
302 | rpc_device_t rpc_device; |
303 | if (rpc_status_t err = rpc_server_init(&rpc_device, RPC_MAXIMUM_PORT_COUNT, |
304 | warp_size, rpc_alloc, nullptr)) |
305 | handle_error(err); |
306 | |
307 | // Initialize the RPC client on the device by copying the local data to the |
308 | // device's internal pointer. |
309 | CUdeviceptr rpc_client_dev = 0; |
310 | uint64_t client_ptr_size = sizeof(void *); |
311 | if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size, |
312 | binary, rpc_client_symbol_name)) |
313 | handle_error(err); |
314 | |
315 | CUdeviceptr rpc_client_host = 0; |
316 | if (CUresult err = |
317 | cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *))) |
318 | handle_error(err); |
319 | if (CUresult err = |
320 | cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(rpc_device), |
321 | rpc_get_client_size())) |
322 | handle_error(err); |
323 | |
324 | LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; |
325 | begin_args_t init_args = {argc, dev_argv, dev_envp}; |
326 | if (CUresult err = launch_kernel(binary, stream, rpc_device, |
327 | single_threaded_params, "_begin" , init_args)) |
328 | handle_error(err); |
329 | |
330 | start_args_t args = {argc, dev_argv, dev_envp, |
331 | reinterpret_cast<void *>(dev_ret)}; |
332 | if (CUresult err = |
333 | launch_kernel(binary, stream, rpc_device, params, "_start" , args)) |
334 | handle_error(err); |
335 | |
336 | // Copy the return value back from the kernel and wait. |
337 | int host_ret = 0; |
338 | if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int))) |
339 | handle_error(err); |
340 | |
341 | if (CUresult err = cuStreamSynchronize(stream)) |
342 | handle_error(err); |
343 | |
344 | end_args_t fini_args = {host_ret}; |
345 | if (CUresult err = launch_kernel(binary, stream, rpc_device, |
346 | single_threaded_params, "_end" , fini_args)) |
347 | handle_error(err); |
348 | |
349 | // Free the memory allocated for the device. |
350 | if (CUresult err = cuMemFreeHost(*memory_or_err)) |
351 | handle_error(err); |
352 | if (CUresult err = cuMemFree(dev_ret)) |
353 | handle_error(err); |
354 | if (CUresult err = cuMemFreeHost(dev_argv)) |
355 | handle_error(err); |
356 | if (rpc_status_t err = rpc_server_shutdown( |
357 | rpc_device, [](void *ptr, void *) { cuMemFreeHost(ptr); }, nullptr)) |
358 | handle_error(err); |
359 | |
360 | // Destroy the context and the loaded binary. |
361 | if (CUresult err = cuModuleUnload(binary)) |
362 | handle_error(err); |
363 | if (CUresult err = cuDevicePrimaryCtxRelease(device)) |
364 | handle_error(err); |
365 | return host_ret; |
366 | } |
367 | |