//===-- Loader Implementation for NVPTX devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the NVPTX
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application start code and will ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#include "cuda.h"

#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFObjectFile.h"

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

using namespace llvm;
using namespace object;
static void handle_error(CUresult err) {
  if (err == CUDA_SUCCESS)
    return;

  const char *err_str = nullptr;
  CUresult result = cuGetErrorString(err, &err_str);
  if (result != CUDA_SUCCESS)
    fprintf(stderr, "Unknown Error\n");
  else
    fprintf(stderr, "%s\n", err_str);
  exit(1);
}
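
// Overloads of handle_error for string messages and RPC status codes are
// assumed to be provided by the shared "Loader.h" utilities included above.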

// Gets the names of all the globals that contain functions to initialize or
// deinitialize. We need to do this manually because the NVPTX toolchain does
// not contain the necessary binary manipulation tools.
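//
// For example, given the naming convention assumed below, a constructor
// registered at priority 101 is expected to appear as a global named
// something like '__init_array_object_foo_101' ('foo' is a placeholder); the
// integer after the final underscore is parsed as the priority.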
template <typename Alloc>
Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
                                     Alloc allocator, CUmodule binary) {
  auto mem_buffer = MemoryBuffer::getMemBuffer(
      StringRef(reinterpret_cast<const char *>(image), size), "image",
      /*RequiresNullTerminator=*/false);
  Expected<ELF64LEObjectFile> elf_or_err =
      ELF64LEObjectFile::create(*mem_buffer);
  if (!elf_or_err)
    handle_error(toString(elf_or_err.takeError()).c_str());
  std::vector<std::pair<const char *, uint16_t>> ctors;
  std::vector<std::pair<const char *, uint16_t>> dtors;
  // CUDA has no way to iterate over all the symbols so we need to inspect the
  // ELF directly using the LLVM libraries.
  for (const auto &symbol : elf_or_err->symbols()) {
    auto name_or_err = symbol.getName();
    if (!name_or_err)
      handle_error(toString(name_or_err.takeError()).c_str());

    // Search for all symbols that contain a constructor or destructor.
    if (!name_or_err->starts_with("__init_array_object_") &&
        !name_or_err->starts_with("__fini_array_object_"))
      continue;

    uint16_t priority;
    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
      handle_error("Invalid priority for constructor or destructor");

    if (name_or_err->starts_with("__init"))
      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
    else
      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
  }
  // Lower priority constructors are run before higher ones. The reverse is
  // true for destructors: the device startup code walks the destructor array
  // backwards, so both arrays are stored in ascending priority order here.
  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
  llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; });

  // Allocate host pinned memory to make these arrays visible to the GPU.
  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
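  // The pinned block is laid out as [ ctors... | dtors... ]; the device's
  // '__init_array_*' and '__fini_array_*' symbols are pointed at the two
  // halves further down.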
  uint64_t global_size = 0;

  // Get the address of the global and then store the address of the
  // constructor function to call in the constructor array.
  CUdeviceptr *dev_ctors_start = dev_memory;
  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
  for (uint64_t i = 0; i < ctors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Get the address of the global and then store the address of the
  // destructor function to call in the destructor array.
  CUdeviceptr *dev_dtors_start = dev_ctors_end;
  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
  for (uint64_t i = 0; i < dtors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Obtain the address of the pointers the startup implementation uses to
  // iterate the constructors and destructors.
  CUdeviceptr init_start;
  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
                                       "__init_array_start"))
    handle_error(err);
  CUdeviceptr init_end;
  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
                                       "__init_array_end"))
    handle_error(err);
  CUdeviceptr fini_start;
  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
                                       "__fini_array_start"))
    handle_error(err);
  CUdeviceptr fini_end;
  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
                                       "__fini_array_end"))
    handle_error(err);

  // Copy the pointers to the newly written array to the symbols so the startup
  // implementation can iterate them.
  if (CUresult err =
          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err =
          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
    handle_error(err);

  return dev_memory;
}

template <typename args_t>
CUresult launch_kernel(CUmodule binary, CUstream stream,
                       rpc_device_t rpc_device, const LaunchParameters &params,
                       const char *kernel_name, args_t kernel_args) {
  // Look up the requested kernel in the loaded module.
  CUfunction function;
  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
    handle_error(err);

  // Set up the arguments to the kernel on the GPU.
  uint64_t args_size = sizeof(args_t);
  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                         CU_LAUNCH_PARAM_END};
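
  // This array is later passed as the 'extra' argument to cuLaunchKernel,
  // which tells the driver to copy 'args_size' bytes from 'kernel_args' into
  // the kernel's parameter space rather than taking individual parameters.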

  // Initialize a non-blocking CUDA stream to allocate memory if needed. This
  // needs to be done on a separate stream or else it will deadlock with the
  // executing kernel.
  CUstream memory_stream;
  if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Register RPC callbacks for the malloc and free functions used by the
  // device.
  register_rpc_callbacks<32>(rpc_device);

  rpc_register_callback(
      rpc_device, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          CUstream memory_stream = *static_cast<CUstream *>(data);
          uint64_t size = buffer->data[0];
          CUdeviceptr dev_ptr;
          if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
            handle_error(err);

          // Wait until the memory allocation is complete.
          while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
            ;
          buffer->data[0] = static_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &memory_stream);
  rpc_register_callback(
      rpc_device, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *data) {
          CUstream memory_stream = *static_cast<CUstream *>(data);
          if (CUresult err = cuMemFreeAsync(
                  static_cast<CUdeviceptr>(buffer->data[0]), memory_stream))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      &memory_stream);
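
  // These handlers only run on the host while rpc_handle_server is being
  // called below, so device-side calls to malloc and free are serviced while
  // the kernel is in flight.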

  // Call the kernel with the given arguments.
  if (CUresult err = cuLaunchKernel(
          function, params.num_blocks_x, params.num_blocks_y,
          params.num_blocks_z, params.num_threads_x, params.num_threads_y,
          params.num_threads_z, 0, stream, nullptr, args_config))
    handle_error(err);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
    if (rpc_status_t err = rpc_handle_server(rpc_device))
      handle_error(err);

  // Handle the server one more time in case the kernel exited with a pending
  // send still in flight.
  if (rpc_status_t err = rpc_handle_server(rpc_device))
    handle_error(err);

  return CUDA_SUCCESS;
}

int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  if (CUresult err = cuInit(0))
    handle_error(err);
  // Obtain the first device found on the system.
  uint32_t device_id = 0;
  CUdevice device;
  if (CUresult err = cuDeviceGet(&device, device_id))
    handle_error(err);

  // Initialize the CUDA context and claim it for this execution.
  CUcontext context;
  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
    handle_error(err);
  if (CUresult err = cuCtxSetCurrent(context))
    handle_error(err);

  // Increase the stack size per thread.
  // TODO: We should allow this to be passed in so only the tests that require
  // a larger stack can specify it to save on memory usage.
  if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
    handle_error(err);

  // Initialize a non-blocking CUDA stream to execute the kernel.
  CUstream stream;
  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Load the image into a CUDA module.
  CUmodule binary;
  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
    handle_error(err);

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied argv and allow the GPU device to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };
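
  // The same pinned allocator is reused below for the constructor and
  // destructor array, the argument vector, and the environment.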

  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
  if (!memory_or_err)
    handle_error(toString(memory_or_err.takeError()).c_str());

  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU device to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  CUdeviceptr dev_ret;
  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
    handle_error(err);
  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
    handle_error(err);

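  // All current NVPTX targets use a fixed warp size of 32 lanes, which the
  // RPC server takes as the number of lanes per port.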
  uint32_t warp_size = 32;
  auto rpc_alloc = [](uint64_t size, void *) -> void * {
    void *dev_ptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };
  rpc_device_t rpc_device;
  if (rpc_status_t err = rpc_server_init(&rpc_device, RPC_MAXIMUM_PORT_COUNT,
                                         warp_size, rpc_alloc, nullptr))
    handle_error(err);

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
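  // Note the double indirection: the device global named by
  // 'rpc_client_symbol_name' holds a pointer to the client buffer, so we first
  // read that pointer back to the host and then write the initialized client
  // state through it.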
  CUdeviceptr rpc_client_dev = 0;
  uint64_t client_ptr_size = sizeof(void *);
  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
                                       binary, rpc_client_symbol_name))
    handle_error(err);

  CUdeviceptr rpc_client_host = 0;
  if (CUresult err =
          cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *)))
    handle_error(err);
  if (CUresult err =
          cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(rpc_device),
                       rpc_get_client_size()))
    handle_error(err);

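  // The startup protocol: '_begin' is expected to initialize the C library
  // state and run global constructors on a single thread, '_start' runs the
  // application's 'main' with the requested launch geometry, and '_end' tears
  // everything down using the return value of 'main'.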
  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (CUresult err = launch_kernel(binary, stream, rpc_device,
                                   single_threaded_params, "_begin", init_args))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp,
                       reinterpret_cast<void *>(dev_ret)};
  if (CUresult err =
          launch_kernel(binary, stream, rpc_device, params, "_start", args))
    handle_error(err);

  // Copy the return value back from the kernel and wait.
  int host_ret = 0;
  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
    handle_error(err);

  if (CUresult err = cuStreamSynchronize(stream))
    handle_error(err);

  end_args_t fini_args = {host_ret};
  if (CUresult err = launch_kernel(binary, stream, rpc_device,
                                   single_threaded_params, "_end", fini_args))
    handle_error(err);

  // Free the memory allocated for the device.
  if (CUresult err = cuMemFreeHost(*memory_or_err))
    handle_error(err);
  if (CUresult err = cuMemFree(dev_ret))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(dev_argv))
    handle_error(err);
  if (rpc_status_t err = rpc_server_shutdown(
          rpc_device, [](void *ptr, void *) { cuMemFreeHost(ptr); }, nullptr))
    handle_error(err);

  // Destroy the context and the loaded binary.
  if (CUresult err = cuModuleUnload(binary))
    handle_error(err);
  if (CUresult err = cuDevicePrimaryCtxRelease(device))
    handle_error(err);
  return host_ret;
}