1 | //===-- Loader Implementation for NVPTX devices --------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
// This file implements a simple loader to run images supporting the NVPTX
10 | // architecture. The file launches the '_start' kernel which should be provided |
// by the device application start code and will ultimately call the 'main'
12 | // function. |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #include "Loader.h" |
17 | |
18 | #include "cuda.h" |
19 | |
20 | #include "llvm/Object/ELF.h" |
21 | #include "llvm/Object/ELFObjectFile.h" |
22 | |
23 | #include <cstddef> |
24 | #include <cstdio> |
25 | #include <cstdlib> |
26 | #include <cstring> |
27 | #include <vector> |
28 | |
29 | using namespace llvm; |
30 | using namespace object; |
31 | |
32 | static void handle_error(CUresult err) { |
33 | if (err == CUDA_SUCCESS) |
34 | return; |
35 | |
36 | const char *err_str = nullptr; |
37 | CUresult result = cuGetErrorString(err, &err_str); |
38 | if (result != CUDA_SUCCESS) |
39 | fprintf(stderr, format: "Unknown Error\n" ); |
40 | else |
41 | fprintf(stderr, format: "%s\n" , err_str); |
42 | exit(status: 1); |
43 | } |
44 | |
45 | // Gets the names of all the globals that contain functions to initialize or |
46 | // deinitialize. We need to do this manually because the NVPTX toolchain does |
47 | // not contain the necessary binary manipulation tools. |
48 | template <typename Alloc> |
49 | Expected<void *> get_ctor_dtor_array(const void *image, const size_t size, |
50 | Alloc allocator, CUmodule binary) { |
51 | auto mem_buffer = MemoryBuffer::getMemBuffer( |
52 | InputData: StringRef(reinterpret_cast<const char *>(image), size), BufferName: "image" , |
53 | /*RequiresNullTerminator=*/false); |
54 | Expected<ELF64LEObjectFile> elf_or_err = |
55 | ELF64LEObjectFile::create(Object: *mem_buffer); |
56 | if (!elf_or_err) |
57 | handle_error(toString(E: elf_or_err.takeError()).c_str()); |
58 | |
59 | std::vector<std::pair<const char *, uint16_t>> ctors; |
60 | std::vector<std::pair<const char *, uint16_t>> dtors; |
61 | // CUDA has no way to iterate over all the symbols so we need to inspect the |
62 | // ELF directly using the LLVM libraries. |
63 | for (const auto &symbol : elf_or_err->symbols()) { |
64 | auto name_or_err = symbol.getName(); |
65 | if (!name_or_err) |
66 | handle_error(toString(E: name_or_err.takeError()).c_str()); |
67 | |
68 | // Search for all symbols that contain a constructor or destructor. |
69 | if (!name_or_err->starts_with(Prefix: "__init_array_object_" ) && |
70 | !name_or_err->starts_with(Prefix: "__fini_array_object_" )) |
71 | continue; |
72 | |
73 | uint16_t priority; |
74 | if (name_or_err->rsplit(Separator: '_').second.getAsInteger(Radix: 10, Result&: priority)) |
75 | handle_error("Invalid priority for constructor or destructor" ); |
76 | |
77 | if (name_or_err->starts_with(Prefix: "__init" )) |
78 | ctors.emplace_back(args: std::make_pair(x: name_or_err->data(), y&: priority)); |
79 | else |
80 | dtors.emplace_back(args: std::make_pair(x: name_or_err->data(), y&: priority)); |
81 | } |
82 | // Lower priority constructors are run before higher ones. The reverse is true |
83 | // for destructors. |
84 | llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; }); |
85 | llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; }); |
86 | |
87 | // Allocate host pinned memory to make these arrays visible to the GPU. |
88 | CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator( |
89 | ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr))); |
90 | uint64_t global_size = 0; |
91 | |
92 | // Get the address of the global and then store the address of the constructor |
93 | // function to call in the constructor array. |
94 | CUdeviceptr *dev_ctors_start = dev_memory; |
95 | CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size(); |
96 | for (uint64_t i = 0; i < ctors.size(); ++i) { |
97 | CUdeviceptr dev_ptr; |
98 | if (CUresult err = |
99 | cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first)) |
100 | handle_error(err); |
101 | if (CUresult err = |
102 | cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t))) |
103 | handle_error(err); |
104 | } |
105 | |
106 | // Get the address of the global and then store the address of the destructor |
107 | // function to call in the destructor array. |
108 | CUdeviceptr *dev_dtors_start = dev_ctors_end; |
109 | CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size(); |
110 | for (uint64_t i = 0; i < dtors.size(); ++i) { |
111 | CUdeviceptr dev_ptr; |
112 | if (CUresult err = |
113 | cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first)) |
114 | handle_error(err); |
115 | if (CUresult err = |
116 | cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t))) |
117 | handle_error(err); |
118 | } |
119 | |
120 | // Obtain the address of the pointers the startup implementation uses to |
121 | // iterate the constructors and destructors. |
122 | CUdeviceptr init_start; |
123 | if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary, |
124 | "__init_array_start" )) |
125 | handle_error(err); |
126 | CUdeviceptr init_end; |
127 | if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary, |
128 | "__init_array_end" )) |
129 | handle_error(err); |
130 | CUdeviceptr fini_start; |
131 | if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary, |
132 | "__fini_array_start" )) |
133 | handle_error(err); |
134 | CUdeviceptr fini_end; |
135 | if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary, |
136 | "__fini_array_end" )) |
137 | handle_error(err); |
138 | |
139 | // Copy the pointers to the newly written array to the symbols so the startup |
140 | // implementation can iterate them. |
141 | if (CUresult err = |
142 | cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t))) |
143 | handle_error(err); |
144 | if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t))) |
145 | handle_error(err); |
146 | if (CUresult err = |
147 | cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t))) |
148 | handle_error(err); |
149 | if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t))) |
150 | handle_error(err); |
151 | |
152 | return dev_memory; |
153 | } |
154 | |
155 | template <typename args_t> |
156 | CUresult launch_kernel(CUmodule binary, CUstream stream, |
157 | rpc_device_t rpc_device, const LaunchParameters ¶ms, |
158 | const char *kernel_name, args_t kernel_args) { |
159 | // look up the '_start' kernel in the loaded module. |
160 | CUfunction function; |
161 | if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) |
162 | handle_error(err); |
163 | |
164 | // Set up the arguments to the '_start' kernel on the GPU. |
165 | uint64_t args_size = sizeof(args_t); |
166 | void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args, |
167 | CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size, |
168 | CU_LAUNCH_PARAM_END}; |
169 | |
170 | // Initialize a non-blocking CUDA stream to allocate memory if needed. This |
171 | // needs to be done on a separate stream or else it will deadlock with the |
172 | // executing kernel. |
173 | CUstream memory_stream; |
174 | if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING)) |
175 | handle_error(err); |
176 | |
177 | // Register RPC callbacks for the malloc and free functions on HSA. |
178 | register_rpc_callbacks<32>(rpc_device); |
179 | |
180 | rpc_register_callback( |
181 | rpc_device, RPC_MALLOC, |
182 | [](rpc_port_t port, void *data) { |
183 | auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void { |
184 | CUstream memory_stream = *static_cast<CUstream *>(data); |
185 | uint64_t size = buffer->data[0]; |
186 | CUdeviceptr dev_ptr; |
187 | if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream)) |
188 | handle_error(err); |
189 | |
190 | // Wait until the memory allocation is complete. |
191 | while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY) |
192 | ; |
193 | buffer->data[0] = static_cast<uintptr_t>(dev_ptr); |
194 | }; |
195 | rpc_recv_and_send(port, malloc_handler, data); |
196 | }, |
197 | &memory_stream); |
198 | rpc_register_callback( |
199 | rpc_device, RPC_FREE, |
200 | [](rpc_port_t port, void *data) { |
201 | auto free_handler = [](rpc_buffer_t *buffer, void *data) { |
202 | CUstream memory_stream = *static_cast<CUstream *>(data); |
203 | if (CUresult err = cuMemFreeAsync( |
204 | static_cast<CUdeviceptr>(buffer->data[0]), memory_stream)) |
205 | handle_error(err); |
206 | }; |
207 | rpc_recv_and_send(port, free_handler, data); |
208 | }, |
209 | &memory_stream); |
210 | |
211 | // Call the kernel with the given arguments. |
212 | if (CUresult err = cuLaunchKernel( |
213 | function, params.num_blocks_x, params.num_blocks_y, |
214 | params.num_blocks_z, params.num_threads_x, params.num_threads_y, |
215 | params.num_threads_z, 0, stream, nullptr, args_config)) |
216 | handle_error(err); |
217 | |
218 | // Wait until the kernel has completed execution on the device. Periodically |
219 | // check the RPC client for work to be performed on the server. |
220 | while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY) |
221 | if (rpc_status_t err = rpc_handle_server(rpc_device)) |
222 | handle_error(err); |
223 | |
224 | // Handle the server one more time in case the kernel exited with a pending |
225 | // send still in flight. |
226 | if (rpc_status_t err = rpc_handle_server(rpc_device)) |
227 | handle_error(err); |
228 | |
229 | return CUDA_SUCCESS; |
230 | } |
231 | |
232 | int load(int argc, char **argv, char **envp, void *image, size_t size, |
233 | const LaunchParameters ¶ms) { |
234 | if (CUresult err = cuInit(0)) |
235 | handle_error(err); |
236 | // Obtain the first device found on the system. |
237 | uint32_t device_id = 0; |
238 | CUdevice device; |
239 | if (CUresult err = cuDeviceGet(&device, device_id)) |
240 | handle_error(err); |
241 | |
242 | // Initialize the CUDA context and claim it for this execution. |
243 | CUcontext context; |
244 | if (CUresult err = cuDevicePrimaryCtxRetain(&context, device)) |
245 | handle_error(err); |
246 | if (CUresult err = cuCtxSetCurrent(context)) |
247 | handle_error(err); |
248 | |
249 | // Increase the stack size per thread. |
250 | // TODO: We should allow this to be passed in so only the tests that require a |
251 | // larger stack can specify it to save on memory usage. |
252 | if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024)) |
253 | handle_error(err); |
254 | |
255 | // Initialize a non-blocking CUDA stream to execute the kernel. |
256 | CUstream stream; |
257 | if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)) |
258 | handle_error(err); |
259 | |
260 | // Load the image into a CUDA module. |
261 | CUmodule binary; |
262 | if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr)) |
263 | handle_error(err); |
264 | |
265 | // Allocate pinned memory on the host to hold the pointer array for the |
266 | // copied argv and allow the GPU device to access it. |
267 | auto allocator = [&](uint64_t size) -> void * { |
268 | void *dev_ptr; |
269 | if (CUresult err = cuMemAllocHost(&dev_ptr, size)) |
270 | handle_error(err); |
271 | return dev_ptr; |
272 | }; |
273 | |
274 | auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary); |
275 | if (!memory_or_err) |
276 | handle_error(toString(memory_or_err.takeError()).c_str()); |
277 | |
278 | void *dev_argv = copy_argument_vector(argc, argv, allocator); |
279 | if (!dev_argv) |
280 | handle_error("Failed to allocate device argv" ); |
281 | |
282 | // Allocate pinned memory on the host to hold the pointer array for the |
283 | // copied environment array and allow the GPU device to access it. |
284 | void *dev_envp = copy_environment(envp, allocator); |
285 | if (!dev_envp) |
286 | handle_error("Failed to allocate device environment" ); |
287 | |
288 | // Allocate space for the return pointer and initialize it to zero. |
289 | CUdeviceptr dev_ret; |
290 | if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int))) |
291 | handle_error(err); |
292 | if (CUresult err = cuMemsetD32(dev_ret, 0, 1)) |
293 | handle_error(err); |
294 | |
295 | uint32_t warp_size = 32; |
296 | auto rpc_alloc = [](uint64_t size, void *) -> void * { |
297 | void *dev_ptr; |
298 | if (CUresult err = cuMemAllocHost(&dev_ptr, size)) |
299 | handle_error(err); |
300 | return dev_ptr; |
301 | }; |
302 | rpc_device_t rpc_device; |
303 | if (rpc_status_t err = rpc_server_init(&rpc_device, RPC_MAXIMUM_PORT_COUNT, |
304 | warp_size, rpc_alloc, nullptr)) |
305 | handle_error(err); |
306 | |
307 | // Initialize the RPC client on the device by copying the local data to the |
308 | // device's internal pointer. |
309 | CUdeviceptr rpc_client_dev = 0; |
310 | uint64_t client_ptr_size = sizeof(void *); |
311 | if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size, |
312 | binary, rpc_client_symbol_name)) |
313 | handle_error(err); |
314 | |
315 | CUdeviceptr rpc_client_host = 0; |
316 | if (CUresult err = |
317 | cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *))) |
318 | handle_error(err); |
319 | if (CUresult err = |
320 | cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(rpc_device), |
321 | rpc_get_client_size())) |
322 | handle_error(err); |
323 | |
324 | LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; |
325 | begin_args_t init_args = {argc, dev_argv, dev_envp}; |
326 | if (CUresult err = launch_kernel(binary, stream, rpc_device, |
327 | single_threaded_params, "_begin" , init_args)) |
328 | handle_error(err); |
329 | |
330 | start_args_t args = {argc, dev_argv, dev_envp, |
331 | reinterpret_cast<void *>(dev_ret)}; |
332 | if (CUresult err = |
333 | launch_kernel(binary, stream, rpc_device, params, "_start" , args)) |
334 | handle_error(err); |
335 | |
336 | // Copy the return value back from the kernel and wait. |
337 | int host_ret = 0; |
338 | if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int))) |
339 | handle_error(err); |
340 | |
341 | if (CUresult err = cuStreamSynchronize(stream)) |
342 | handle_error(err); |
343 | |
344 | end_args_t fini_args = {host_ret}; |
345 | if (CUresult err = launch_kernel(binary, stream, rpc_device, |
346 | single_threaded_params, "_end" , fini_args)) |
347 | handle_error(err); |
348 | |
349 | // Free the memory allocated for the device. |
350 | if (CUresult err = cuMemFreeHost(*memory_or_err)) |
351 | handle_error(err); |
352 | if (CUresult err = cuMemFree(dev_ret)) |
353 | handle_error(err); |
354 | if (CUresult err = cuMemFreeHost(dev_argv)) |
355 | handle_error(err); |
356 | if (rpc_status_t err = rpc_server_shutdown( |
357 | rpc_device, [](void *ptr, void *) { cuMemFreeHost(ptr); }, nullptr)) |
358 | handle_error(err); |
359 | |
360 | // Destroy the context and the loaded binary. |
361 | if (CUresult err = cuModuleUnload(binary)) |
362 | handle_error(err); |
363 | if (CUresult err = cuDevicePrimaryCtxRelease(device)) |
364 | handle_error(err); |
365 | return host_ret; |
366 | } |
367 | |