//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application start code and will ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#if defined(__has_include)
#if __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#elif __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <tuple>
#include <utility>

// The implicit arguments of COV5 AMDGPU kernels.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};
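
// The code object v5 ABI reserves a 256-byte block for the implicit kernel
// arguments; the padding fields above place 'grid_dims' at its expected
// offset. A sanity check documenting that assumption:
static_assert(sizeof(implicit_args_t) == 256,
              "implicit argument block must match the COV5 layout");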

/// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s\n", desc);
  exit(EXIT_FAILURE);
}
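
// Overloads for plain C-string messages and RPC status codes are assumed to
// be provided by "Loader.h"; if they are not, a minimal version of the string
// overload used below would look like:
//
//   [[noreturn]] static void handle_error(const char *msg) {
//     fprintf(stderr, "%s\n", msg);
//     exit(EXIT_FAILURE);
//   }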

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
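  // A captureless lambda converts to the plain C function pointer the HSA
  // entry point expects; the actual callback object is passed through the
  // opaque 'void *' user-data parameter and unwrapped inside.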
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
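      // HSA_STATUS_INFO_BREAK stops the iteration early once a match is
      // found; handle_error() above treats it as a success code.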
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}
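
// Typical usage, as seen in load() below: request a fine-grained global pool
// from the host agent for memory that both the CPU and the GPU must touch.
//
//   hsa_amd_memory_pool_t pool;
//   handle_error(
//       get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
//           host_agent, &pool));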

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc_device_t device,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args) {
  // Look up the requested kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Register RPC callbacks for the malloc and free functions on HSA.
  auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
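  // NOTE: The callback lambdas below cannot capture, so 'tuple' is smuggled
  // through the opaque user-data pointer instead. It must stay alive until
  // the final rpc_handle_server() call at the end of this function.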
  rpc_register_callback(
      device, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
          uint64_t size = buffer->data[0];
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(pool, size,
                                               /*flags=*/0, &dev_ptr))
            handle_error(err);
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &tuple);
  rpc_register_callback(
      device, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *) {
          if (hsa_status_t err = hsa_amd_memory_pool_free(
                  reinterpret_cast<void *>(buffer->data[0])))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      nullptr);

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set
  // the explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
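  // 'dims' is the number of grid dimensions actually in use: 1 if only X is
  // non-trivial, up to 3 when the Y and Z dimensions are used as well.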
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

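  // The queue is a power-of-two ring buffer, so masking the monotonically
  // increasing packet id with (size - 1) selects the packet's slot.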
  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device using the provided launch
  // parameters.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
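  // The header is written last with release semantics so the packet processor
  // never observes a partially initialized packet once the doorbell rings.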
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (hsa_signal_wait_scacquire(
             packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
             /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
    if (rpc_status_t err = rpc_handle_server(device))
      handle_error(err);

  // Handle the server one more time in case the kernel exited with a pending
  // send still in flight.
  if (rpc_status_t err = rpc_handle_server(device))
    handle_error(err);

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

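  // The copy itself is asynchronous; spinning until the completion signal
  // reaches zero makes this helper behave like a synchronous memcpy.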
  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}

int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory
  // model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_t object;
  if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  if (hsa_status_t err =
          hsa_executable_load_code_object(executable, dev_agent, object, ""))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object, it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // Allocate fine-grained memory on the host to hold the pointer array for
  // the copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for
  // the copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // Allocate fine-grained memory for the RPC server and client to share.
  uint32_t wavefront_size = 0;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
    handle_error(err);

  // Set up the RPC server.
  auto tuple = std::make_tuple(dev_agent, finegrained_pool);
  auto rpc_alloc = [](uint64_t size, void *data) {
    auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  rpc_device_t device;
  if (rpc_status_t err = rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
                                         wavefront_size, rpc_alloc, &tuple))
    handle_error(err);

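  // The RPC protocol operates on entire wavefronts at once, so the server
  // must be specialized for the device's lane width (32 on RDNA GPUs, 64 on
  // CDNA and older GCN GPUs).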
  // Register callbacks for the RPC unit tests.
  if (wavefront_size == 32)
    register_rpc_callbacks<32>(device);
  else if (wavefront_size == 64)
    register_rpc_callbacks<64>(device);
  else
    handle_error("Invalid wavefront size");

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_client_host);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);
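
  // The device-side symbol holds a pointer to the client object rather than
  // the object itself, so the host-initialized client state is copied to the
  // address just read back. The host copy must first be locked (pinned) so
  // the DMA engine can access it.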
  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(const_cast<void *>(rpc_get_client_buffer(device)),
                              rpc_get_client_size(),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, rpc_get_client_size()))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(
          const_cast<void *>(rpc_get_client_buffer(device))))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If
  // the '__llvm_libc_clock_freq' symbol is missing, there is nothing to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {

    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the minimum (power of two) size, used to send
  // commands to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);
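
  // The image is run in three phases that mirror a normal C runtime: the
  // '_begin' kernel performs startup initialization, '_start' ultimately
  // calls the user's main(), and '_end' performs cleanup with main's return
  // value. The '.kd' suffix names each kernel's kernel descriptor symbol.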

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          device, single_threaded_params, "_begin.kd", init_args))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
                                       coarsegrained_pool, queue, device,
                                       params, "_start.kd", args))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          device, single_threaded_params, "_end.kd", fini_args))
    handle_error(err);

  if (rpc_status_t err = rpc_server_shutdown(
          device, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
          nullptr))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_code_object_destroy(object))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}