//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application start code and will ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"

#if defined(__has_include)
#if __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#elif __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <tuple>
#include <utility>

// The implicit arguments of COV5 AMDGPU kernels.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};
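
// The code object v5 ABI reserves a 256-byte block for the implicit kernel
// arguments; the padding fields above place 'grid_dims' at its expected
// offset. A sanity check documenting that assumption:
static_assert(sizeof(implicit_args_t) == 256,
              "implicit argument block must match the COV5 layout");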

/// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s\n", desc);
  exit(EXIT_FAILURE);
}
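
// Overloads for plain C-string messages and RPC status codes are assumed to
// be provided by "Loader.h"; if they are not, a minimal version of the string
// overload used below would look like:
//
//   [[noreturn]] static void handle_error(const char *msg) {
//     fprintf(stderr, "%s\n", msg);
//     exit(EXIT_FAILURE);
//   }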

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
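  // A captureless lambda converts to the plain C function pointer the HSA
  // entry point expects; the actual callback object is passed through the
  // opaque 'void *' user-data parameter and unwrapped inside.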
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
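      // HSA_STATUS_INFO_BREAK stops the iteration early once a match is
      // found; handle_error() above treats it as a success code.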
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}
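
// Typical usage, as seen in load() below: request a fine-grained global pool
// from the host agent for memory that both the CPU and the GPU must touch.
//
//   hsa_amd_memory_pool_t pool;
//   handle_error(
//       get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
//           host_agent, &pool));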

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc_device_t device,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args) {
  // Look up the requested kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Register RPC callbacks for the malloc and free functions on HSA.
  auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
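  // NOTE: The callback lambdas below cannot capture, so 'tuple' is smuggled
  // through the opaque user-data pointer instead. It must stay alive until
  // the final rpc_handle_server() call at the end of this function.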
  rpc_register_callback(
      device, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
          uint64_t size = buffer->data[0];
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(pool, size,
                                               /*flags=*/0, &dev_ptr))
            handle_error(err);
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &tuple);
  rpc_register_callback(
      device, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *) {
          if (hsa_status_t err = hsa_amd_memory_pool_free(
                  reinterpret_cast<void *>(buffer->data[0])))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      nullptr);

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set
  // the explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
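  // 'dims' is the number of grid dimensions actually in use: 1 if only X is
  // non-trivial, up to 3 when the Y and Z dimensions are used as well.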
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

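  // The queue is a power-of-two ring buffer, so masking the monotonically
  // increasing packet id with (size - 1) selects the packet's slot.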
  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device using the provided launch
  // parameters.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
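  // The header is written last with release semantics so the packet processor
  // never observes a partially initialized packet once the doorbell rings.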
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (hsa_signal_wait_scacquire(
             packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
             /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
    if (rpc_status_t err = rpc_handle_server(device))
      handle_error(err);

  // Handle the server one more time in case the kernel exited with a pending
  // send still in flight.
  if (rpc_status_t err = rpc_handle_server(device))
    handle_error(err);

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

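  // The copy itself is asynchronous; spinning until the completion signal
  // reaches zero makes this helper behave like a synchronous memcpy.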
  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}

int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory
  // model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_t object;
  if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  if (hsa_status_t err =
          hsa_executable_load_code_object(executable, dev_agent, object, ""))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object, it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // Allocate fine-grained memory on the host to hold the pointer array for
  // the copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for
  // the copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // Allocate fine-grained memory for the RPC server and client to share.
  uint32_t wavefront_size = 0;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
    handle_error(err);

  // Set up the RPC server.
  auto tuple = std::make_tuple(dev_agent, finegrained_pool);
  auto rpc_alloc = [](uint64_t size, void *data) {
    auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  rpc_device_t device;
  if (rpc_status_t err = rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
                                         wavefront_size, rpc_alloc, &tuple))
    handle_error(err);

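  // The RPC protocol operates on entire wavefronts at once, so the server
  // must be specialized for the device's lane width (32 on RDNA GPUs, 64 on
  // CDNA and older GCN GPUs).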
  // Register callbacks for the RPC unit tests.
  if (wavefront_size == 32)
    register_rpc_callbacks<32>(device);
  else if (wavefront_size == 64)
    register_rpc_callbacks<64>(device);
  else
    handle_error("Invalid wavefront size");

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_client_host);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);
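
  // The device-side symbol holds a pointer to the client object rather than
  // the object itself, so the host-initialized client state is copied to the
  // address just read back. The host copy must first be locked (pinned) so
  // the DMA engine can access it.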
  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(const_cast<void *>(rpc_get_client_buffer(device)),
                              rpc_get_client_size(),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, rpc_get_client_size()))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(
          const_cast<void *>(rpc_get_client_buffer(device))))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If
  // the '__llvm_libc_clock_freq' symbol is missing, there is nothing to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {

    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the minimum (power of two) size, used to send
  // commands to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);
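
  // The image is run in three phases that mirror a normal C runtime: the
  // '_begin' kernel performs startup initialization, '_start' ultimately
  // calls the user's main(), and '_end' performs cleanup with main's return
  // value. The '.kd' suffix names each kernel's kernel descriptor symbol.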

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          device, single_threaded_params, "_begin.kd", init_args))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
                                       coarsegrained_pool, queue, device,
                                       params, "_start.kd", args))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          device, single_threaded_params, "_end.kd", fini_args))
    handle_error(err);

  if (rpc_status_t err = rpc_server_shutdown(
          device, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
          nullptr))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_code_object_destroy(object))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}