1 | //===-- Loader Implementation for AMDHSA devices --------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel which should be provided
// by the device application start code and will ultimately call the 'main'
// function.
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #include "Loader.h" |
17 | |
18 | #if defined(__has_include) |
19 | #if __has_include("hsa/hsa.h") |
20 | #include "hsa/hsa.h" |
21 | #include "hsa/hsa_ext_amd.h" |
22 | #elif __has_include("hsa.h") |
23 | #include "hsa.h" |
24 | #include "hsa_ext_amd.h" |
25 | #endif |
26 | #else |
27 | #include "hsa/hsa.h" |
28 | #include "hsa/hsa_ext_amd.h" |
29 | #endif |
30 | |
31 | #include <cstdio> |
32 | #include <cstdlib> |
33 | #include <cstring> |
34 | #include <tuple> |
35 | #include <utility> |
36 | |
37 | // The implicit arguments of COV5 AMDGPU kernels. |
38 | struct implicit_args_t { |
39 | uint32_t grid_size_x; |
40 | uint32_t grid_size_y; |
41 | uint32_t grid_size_z; |
42 | uint16_t workgroup_size_x; |
43 | uint16_t workgroup_size_y; |
44 | uint16_t workgroup_size_z; |
45 | uint8_t Unused0[46]; |
46 | uint16_t grid_dims; |
47 | uint8_t Unused1[190]; |
48 | }; |
49 | |
50 | /// Print the error code and exit if \p code indicates an error. |
51 | static void handle_error(hsa_status_t code) { |
52 | if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK) |
53 | return; |
54 | |
55 | const char *desc; |
56 | if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS) |
    desc = "Unknown error";
  fprintf(stderr, "%s\n", desc);
59 | exit(EXIT_FAILURE); |
60 | } |
61 | |
62 | /// Generic interface for iterating using the HSA callbacks. |
63 | template <typename elem_ty, typename func_ty, typename callback_ty> |
64 | hsa_status_t iterate(func_ty func, callback_ty cb) { |
65 | auto l = [](elem_ty elem, void *data) -> hsa_status_t { |
66 | callback_ty *unwrapped = static_cast<callback_ty *>(data); |
67 | return (*unwrapped)(elem); |
68 | }; |
69 | return func(l, static_cast<void *>(&cb)); |
70 | } |
71 | |
72 | /// Generic interface for iterating using the HSA callbacks. |
73 | template <typename elem_ty, typename func_ty, typename func_arg_ty, |
74 | typename callback_ty> |
75 | hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) { |
76 | auto l = [](elem_ty elem, void *data) -> hsa_status_t { |
77 | callback_ty *unwrapped = static_cast<callback_ty *>(data); |
78 | return (*unwrapped)(elem); |
79 | }; |
80 | return func(func_arg, l, static_cast<void *>(&cb)); |
81 | } |
82 | |
/// Iterate through all available agents.
84 | template <typename callback_ty> |
85 | hsa_status_t iterate_agents(callback_ty callback) { |
86 | return iterate<hsa_agent_t>(hsa_iterate_agents, callback); |
87 | } |
88 | |
/// Iterate through all available memory pools.
90 | template <typename callback_ty> |
91 | hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) { |
92 | return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools, |
93 | agent, cb); |
94 | } |
95 | |
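/// Returns the first agent matching the device type \p flag in
/// \p output_agent. A GPU agent must also support kernel dispatch packets.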
96 | template <hsa_device_type_t flag> |
97 | hsa_status_t get_agent(hsa_agent_t *output_agent) { |
98 | // Find the first agent with a matching device type. |
99 | auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t { |
100 | hsa_device_type_t type; |
101 | hsa_status_t status = |
102 | hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type); |
103 | if (status != HSA_STATUS_SUCCESS) |
104 | return status; |
105 | |
106 | if (type == flag) { |
107 | // Ensure that a GPU agent supports kernel dispatch packets. |
108 | if (type == HSA_DEVICE_TYPE_GPU) { |
109 | hsa_agent_feature_t features; |
110 | status = |
111 | hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features); |
112 | if (status != HSA_STATUS_SUCCESS) |
113 | return status; |
114 | if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) |
115 | *output_agent = hsa_agent; |
116 | } else { |
117 | *output_agent = hsa_agent; |
118 | } |
119 | return HSA_STATUS_INFO_BREAK; |
120 | } |
121 | return HSA_STATUS_SUCCESS; |
122 | }; |
123 | |
124 | return iterate_agents(cb); |
125 | } |
126 | |
127 | /// Retrieve a global memory pool with a \p flag from the agent. |
128 | template <hsa_amd_memory_pool_global_flag_t flag> |
129 | hsa_status_t get_agent_memory_pool(hsa_agent_t agent, |
130 | hsa_amd_memory_pool_t *output_pool) { |
131 | auto cb = [&](hsa_amd_memory_pool_t memory_pool) { |
132 | uint32_t flags; |
133 | hsa_amd_segment_t segment; |
134 | if (auto err = hsa_amd_memory_pool_get_info( |
135 | memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment)) |
136 | return err; |
137 | if (auto err = hsa_amd_memory_pool_get_info( |
138 | memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags)) |
139 | return err; |
140 | |
141 | if (segment != HSA_AMD_SEGMENT_GLOBAL) |
142 | return HSA_STATUS_SUCCESS; |
143 | |
144 | if (flags & flag) |
145 | *output_pool = memory_pool; |
146 | |
147 | return HSA_STATUS_SUCCESS; |
148 | }; |
149 | return iterate_agent_memory_pools(agent, cb); |
150 | } |
151 | |
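/// Launches the kernel \p kernel_name from \p executable on the device agent,
/// filling in its explicit and implicit arguments, and waits for completion
/// while servicing RPC requests made by the device.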
152 | template <typename args_t> |
153 | hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, |
154 | hsa_amd_memory_pool_t kernargs_pool, |
155 | hsa_amd_memory_pool_t coarsegrained_pool, |
156 | hsa_queue_t *queue, rpc_device_t device, |
157 | const LaunchParameters ¶ms, |
158 | const char *kernel_name, args_t kernel_args) { |
  // Look up the kernel in the loaded executable.
160 | hsa_executable_symbol_t symbol; |
161 | if (hsa_status_t err = hsa_executable_get_symbol_by_name( |
162 | executable, kernel_name, &dev_agent, &symbol)) |
163 | return err; |
164 | |
165 | // Register RPC callbacks for the malloc and free functions on HSA. |
166 | auto tuple = std::make_tuple(dev_agent, coarsegrained_pool); |
167 | rpc_register_callback( |
168 | device, RPC_MALLOC, |
169 | [](rpc_port_t port, void *data) { |
170 | auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void { |
171 | auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data); |
172 | uint64_t size = buffer->data[0]; |
173 | void *dev_ptr = nullptr; |
174 | if (hsa_status_t err = |
175 | hsa_amd_memory_pool_allocate(pool, size, |
176 | /*flags=*/0, &dev_ptr)) |
177 | handle_error(err); |
178 | hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr); |
179 | buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr); |
180 | }; |
181 | rpc_recv_and_send(port, malloc_handler, data); |
182 | }, |
183 | &tuple); |
184 | rpc_register_callback( |
185 | device, RPC_FREE, |
186 | [](rpc_port_t port, void *data) { |
187 | auto free_handler = [](rpc_buffer_t *buffer, void *) { |
188 | if (hsa_status_t err = hsa_amd_memory_pool_free( |
189 | reinterpret_cast<void *>(buffer->data[0]))) |
190 | handle_error(err); |
191 | }; |
192 | rpc_recv_and_send(port, free_handler, data); |
193 | }, |
194 | nullptr); |
195 | |
196 | // Retrieve different properties of the kernel symbol used for launch. |
197 | uint64_t kernel; |
198 | uint32_t args_size; |
199 | uint32_t group_size; |
200 | uint32_t private_size; |
201 | bool dynamic_stack; |
202 | |
203 | std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = { |
204 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel}, |
205 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size}, |
206 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size}, |
207 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack}, |
208 | {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}}; |
209 | |
210 | for (auto &[info, value] : symbol_infos) |
211 | if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value)) |
212 | return err; |
213 | |
214 | // Allocate space for the kernel arguments on the host and allow the GPU agent |
215 | // to access it. |
216 | void *args; |
217 | if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size, |
218 | /*flags=*/0, &args)) |
219 | handle_error(err); |
220 | hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args); |
221 | |
  // Initialize all the arguments (explicit and implicit) to zero, then copy
  // the explicit arguments from the values provided by the caller.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));
226 | |
227 | // Initialize the necessary implicit arguments to the proper values. |
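  // One grid dimension is always used; the y and z dimensions only count if
  // their total extent is greater than one.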
  uint32_t dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
                  (params.num_blocks_z * params.num_threads_z != 1);
230 | implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>( |
231 | reinterpret_cast<uint8_t *>(args) + sizeof(args_t)); |
232 | implicit_args->grid_dims = dims; |
233 | implicit_args->grid_size_x = params.num_blocks_x; |
234 | implicit_args->grid_size_y = params.num_blocks_y; |
235 | implicit_args->grid_size_z = params.num_blocks_z; |
236 | implicit_args->workgroup_size_x = params.num_threads_x; |
237 | implicit_args->workgroup_size_y = params.num_threads_y; |
238 | implicit_args->workgroup_size_z = params.num_threads_z; |
239 | |
240 | // Obtain a packet from the queue. |
241 | uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); |
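  // Spin until the queue has room for the new packet.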
242 | while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size) |
243 | ; |
244 | |
245 | const uint32_t mask = queue->size - 1; |
246 | hsa_kernel_dispatch_packet_t *packet = |
247 | static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) + |
248 | (packet_id & mask); |
249 | |
  // Set up the packet for execution on the device using the grid and
  // workgroup sizes from the provided launch parameters.
253 | uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; |
254 | packet->workgroup_size_x = params.num_threads_x; |
255 | packet->workgroup_size_y = params.num_threads_y; |
256 | packet->workgroup_size_z = params.num_threads_z; |
257 | packet->reserved0 = 0; |
258 | packet->grid_size_x = params.num_blocks_x * params.num_threads_x; |
259 | packet->grid_size_y = params.num_blocks_y * params.num_threads_y; |
260 | packet->grid_size_z = params.num_blocks_z * params.num_threads_z; |
261 | packet->private_segment_size = |
262 | dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size; |
263 | packet->group_segment_size = group_size; |
264 | packet->kernel_object = kernel; |
265 | packet->kernarg_address = args; |
266 | packet->reserved2 = 0; |
267 | // Create a signal to indicate when this packet has been completed. |
268 | if (hsa_status_t err = |
269 | hsa_signal_create(1, 0, nullptr, &packet->completion_signal)) |
270 | handle_error(err); |
271 | |
272 | // Initialize the packet header and set the doorbell signal to begin execution |
273 | // by the HSA runtime. |
  uint16_t header =
275 | (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | |
276 | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | |
277 | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); |
  uint32_t header_word = header | (setup << 16u);
279 | __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE); |
280 | hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); |
281 | |
282 | // Wait until the kernel has completed execution on the device. Periodically |
283 | // check the RPC client for work to be performed on the server. |
284 | while (hsa_signal_wait_scacquire( |
285 | packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0, |
286 | /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0) |
287 | if (rpc_status_t err = rpc_handle_server(device)) |
288 | handle_error(err); |
289 | |
290 | // Handle the server one more time in case the kernel exited with a pending |
291 | // send still in flight. |
292 | if (rpc_status_t err = rpc_handle_server(device)) |
293 | handle_error(err); |
294 | |
295 | // Destroy the resources acquired to launch the kernel and return. |
296 | if (hsa_status_t err = hsa_amd_memory_pool_free(args)) |
297 | handle_error(err); |
298 | if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal)) |
299 | handle_error(err); |
300 | |
301 | return HSA_STATUS_SUCCESS; |
302 | } |
303 | |
304 | /// Copies data from the source agent to the destination agent. The source |
305 | /// memory must first be pinned explicitly or allocated via HSA. |
306 | static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent, |
307 | const void *src, hsa_agent_t src_agent, |
308 | uint64_t size) { |
309 | // Create a memory signal to copy information between the host and device. |
310 | hsa_signal_t memory_signal; |
311 | if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal)) |
312 | return err; |
313 | |
314 | if (hsa_status_t err = hsa_amd_memory_async_copy( |
315 | dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal)) |
316 | return err; |
317 | |
318 | while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0, |
319 | UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) |
320 | ; |
321 | |
322 | if (hsa_status_t err = hsa_signal_destroy(memory_signal)) |
323 | return err; |
324 | |
325 | return HSA_STATUS_SUCCESS; |
326 | } |
327 | |
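/// Loads the provided image into an HSA executable, sets up the RPC interface,
/// and launches the '_begin', '_start', and '_end' kernels on the GPU agent.
/// Returns the value written by the device's 'main' function.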
328 | int load(int argc, char **argv, char **envp, void *image, size_t size, |
329 | const LaunchParameters ¶ms) { |
330 | // Initialize the HSA runtime used to communicate with the device. |
331 | if (hsa_status_t err = hsa_init()) |
332 | handle_error(err); |
333 | |
334 | // Register a callback when the device encounters a memory fault. |
335 | if (hsa_status_t err = hsa_amd_register_system_event_handler( |
336 | [](const hsa_amd_event_t *event, void *) -> hsa_status_t { |
337 | if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) |
338 | return HSA_STATUS_ERROR; |
339 | return HSA_STATUS_SUCCESS; |
340 | }, |
341 | nullptr)) |
342 | handle_error(err); |
343 | |
344 | // Obtain a single agent for the device and host to use the HSA memory model. |
345 | hsa_agent_t dev_agent; |
346 | hsa_agent_t host_agent; |
347 | if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent)) |
348 | handle_error(err); |
349 | if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent)) |
350 | handle_error(err); |
351 | |
352 | // Load the code object's ISA information and executable data segments. |
353 | hsa_code_object_t object; |
  if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
355 | handle_error(err); |
356 | |
357 | hsa_executable_t executable; |
358 | if (hsa_status_t err = hsa_executable_create_alt( |
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
360 | &executable)) |
361 | handle_error(err); |
362 | |
363 | if (hsa_status_t err = |
          hsa_executable_load_code_object(executable, dev_agent, object, ""))
365 | handle_error(err); |
366 | |
367 | // No modifications to the executable are allowed after this point. |
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
369 | handle_error(err); |
370 | |
  // Check the validity of the loaded executable. If the agent's ISA does not
  // match the executable's code object, validation will fail here.
373 | uint32_t result; |
374 | if (hsa_status_t err = hsa_executable_validate(executable, &result)) |
375 | handle_error(err); |
376 | if (result) |
377 | handle_error(HSA_STATUS_ERROR); |
378 | |
379 | // Obtain memory pools to exchange data between the host and the device. The |
380 | // fine-grained pool acts as pinned memory on the host for DMA transfers to |
381 | // the device, the coarse-grained pool is for allocations directly on the |
  // device, and the kernel-argument pool is for executing the kernel.
383 | hsa_amd_memory_pool_t kernargs_pool; |
384 | hsa_amd_memory_pool_t finegrained_pool; |
385 | hsa_amd_memory_pool_t coarsegrained_pool; |
386 | if (hsa_status_t err = |
387 | get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>( |
388 | host_agent, &kernargs_pool)) |
389 | handle_error(err); |
390 | if (hsa_status_t err = |
391 | get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>( |
392 | host_agent, &finegrained_pool)) |
393 | handle_error(err); |
394 | if (hsa_status_t err = |
395 | get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>( |
396 | dev_agent, &coarsegrained_pool)) |
397 | handle_error(err); |
398 | |
399 | // Allocate fine-grained memory on the host to hold the pointer array for the |
400 | // copied argv and allow the GPU agent to access it. |
401 | auto allocator = [&](uint64_t size) -> void * { |
402 | void *dev_ptr = nullptr; |
403 | if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size, |
404 | /*flags=*/0, &dev_ptr)) |
405 | handle_error(err); |
406 | hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr); |
407 | return dev_ptr; |
408 | }; |
409 | void *dev_argv = copy_argument_vector(argc, argv, allocator); |
410 | if (!dev_argv) |
    handle_error("Failed to allocate device argv");
412 | |
413 | // Allocate fine-grained memory on the host to hold the pointer array for the |
414 | // copied environment array and allow the GPU agent to access it. |
415 | void *dev_envp = copy_environment(envp, allocator); |
416 | if (!dev_envp) |
    handle_error("Failed to allocate device environment");
418 | |
419 | // Allocate space for the return pointer and initialize it to zero. |
420 | void *dev_ret; |
421 | if (hsa_status_t err = |
422 | hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int), |
423 | /*flags=*/0, &dev_ret)) |
424 | handle_error(err); |
425 | hsa_amd_memory_fill(dev_ret, 0, /*count=*/1); |
426 | |
  // Obtain the wavefront size needed to configure the fine-grained memory that
  // the RPC server and client will share.
428 | uint32_t wavefront_size = 0; |
429 | if (hsa_status_t err = hsa_agent_get_info( |
430 | dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) |
431 | handle_error(err); |
432 | |
433 | // Set up the RPC server. |
434 | auto tuple = std::make_tuple(dev_agent, finegrained_pool); |
435 | auto rpc_alloc = [](uint64_t size, void *data) { |
436 | auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data); |
437 | void *dev_ptr = nullptr; |
438 | if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size, |
439 | /*flags=*/0, &dev_ptr)) |
440 | handle_error(err); |
441 | hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr); |
442 | return dev_ptr; |
443 | }; |
444 | rpc_device_t device; |
445 | if (rpc_status_t err = rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT, |
446 | wavefront_size, rpc_alloc, &tuple)) |
447 | handle_error(err); |
448 | |
449 | // Register callbacks for the RPC unit tests. |
450 | if (wavefront_size == 32) |
451 | register_rpc_callbacks<32>(device); |
452 | else if (wavefront_size == 64) |
453 | register_rpc_callbacks<64>(device); |
454 | else |
    handle_error("Invalid wavefront size");
456 | |
457 | // Initialize the RPC client on the device by copying the local data to the |
458 | // device's internal pointer. |
459 | hsa_executable_symbol_t rpc_client_sym; |
460 | if (hsa_status_t err = hsa_executable_get_symbol_by_name( |
461 | executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym)) |
462 | handle_error(err); |
463 | |
464 | void *rpc_client_host; |
465 | if (hsa_status_t err = |
466 | hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(void *), |
467 | /*flags=*/0, &rpc_client_host)) |
468 | handle_error(err); |
469 | hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_client_host); |
470 | |
471 | void *rpc_client_dev; |
472 | if (hsa_status_t err = hsa_executable_symbol_get_info( |
473 | rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, |
474 | &rpc_client_dev)) |
475 | handle_error(err); |
476 | |
477 | // Copy the address of the client buffer from the device to the host. |
478 | if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev, |
479 | dev_agent, sizeof(void *))) |
480 | handle_error(err); |
481 | |
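  // Pin the local RPC client state so it can be used as the source of a copy
  // to the device.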
482 | void *rpc_client_buffer; |
483 | if (hsa_status_t err = |
484 | hsa_amd_memory_lock(const_cast<void *>(rpc_get_client_buffer(device)), |
485 | rpc_get_client_size(), |
486 | /*agents=*/nullptr, 0, &rpc_client_buffer)) |
487 | handle_error(err); |
488 | |
489 | // Copy the RPC client buffer to the address pointed to by the symbol. |
490 | if (hsa_status_t err = |
491 | hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent, |
492 | rpc_client_buffer, host_agent, rpc_get_client_size())) |
493 | handle_error(err); |
494 | |
495 | if (hsa_status_t err = hsa_amd_memory_unlock( |
496 | const_cast<void *>(rpc_get_client_buffer(device)))) |
497 | handle_error(err); |
498 | if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host)) |
499 | handle_error(err); |
500 | |
  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If the
  // clock frequency symbol is missing, there is nothing to do.
503 | hsa_executable_symbol_t freq_sym; |
504 | if (HSA_STATUS_SUCCESS == |
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
506 | &dev_agent, &freq_sym)) { |
507 | |
508 | void *host_clock_freq; |
509 | if (hsa_status_t err = |
510 | hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t), |
511 | /*flags=*/0, &host_clock_freq)) |
512 | handle_error(err); |
513 | hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq); |
514 | |
515 | if (HSA_STATUS_SUCCESS == |
516 | hsa_agent_get_info(dev_agent, |
517 | static_cast<hsa_agent_info_t>( |
518 | HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY), |
519 | host_clock_freq)) { |
520 | |
521 | void *freq_addr; |
522 | if (hsa_status_t err = hsa_executable_symbol_get_info( |
523 | freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, |
524 | &freq_addr)) |
525 | handle_error(err); |
526 | |
527 | if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq, |
528 | host_agent, sizeof(uint64_t))) |
529 | handle_error(err); |
530 | } |
531 | } |
532 | |
533 | // Obtain a queue with the minimum (power of two) size, used to send commands |
534 | // to the HSA runtime and launch execution on the device. |
535 | uint64_t queue_size; |
536 | if (hsa_status_t err = hsa_agent_get_info( |
537 | dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size)) |
538 | handle_error(err); |
539 | hsa_queue_t *queue = nullptr; |
540 | if (hsa_status_t err = |
541 | hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, |
542 | nullptr, UINT32_MAX, UINT32_MAX, &queue)) |
543 | handle_error(err); |
544 | |
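  // Run the '_begin' kernel with a single thread to initialize the device
  // environment before 'main' is called.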
545 | LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; |
546 | begin_args_t init_args = {argc, dev_argv, dev_envp}; |
547 | if (hsa_status_t err = launch_kernel( |
548 | dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, |
          device, single_threaded_params, "_begin.kd", init_args))
550 | handle_error(err); |
551 | |
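  // Run the '_start' kernel, which calls the device's 'main' function, using
  // the user-provided launch parameters.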
552 | start_args_t args = {argc, dev_argv, dev_envp, dev_ret}; |
553 | if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool, |
554 | coarsegrained_pool, queue, device, |
                                       params, "_start.kd", args))
556 | handle_error(err); |
557 | |
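  // Allocate host memory to read back the return value written by the device.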
558 | void *host_ret; |
559 | if (hsa_status_t err = |
560 | hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int), |
561 | /*flags=*/0, &host_ret)) |
562 | handle_error(err); |
563 | hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret); |
564 | |
565 | if (hsa_status_t err = |
566 | hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int))) |
567 | handle_error(err); |
568 | |
569 | // Save the return value and perform basic clean-up. |
570 | int ret = *static_cast<int *>(host_ret); |
571 | |
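  // Run the '_end' kernel with the return value so the device can perform any
  // finalization before shutdown.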
572 | end_args_t fini_args = {ret}; |
573 | if (hsa_status_t err = launch_kernel( |
574 | dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, |
          device, single_threaded_params, "_end.kd", fini_args))
576 | handle_error(err); |
577 | |
578 | if (rpc_status_t err = rpc_server_shutdown( |
579 | device, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); }, |
580 | nullptr)) |
581 | handle_error(err); |
582 | |
583 | // Free the memory allocated for the device. |
584 | if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv)) |
585 | handle_error(err); |
586 | if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret)) |
587 | handle_error(err); |
588 | if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret)) |
589 | handle_error(err); |
590 | |
591 | if (hsa_status_t err = hsa_queue_destroy(queue)) |
592 | handle_error(err); |
593 | |
594 | if (hsa_status_t err = hsa_executable_destroy(executable)) |
595 | handle_error(err); |
596 | |
597 | if (hsa_status_t err = hsa_code_object_destroy(object)) |
598 | handle_error(err); |
599 | |
600 | if (hsa_status_t err = hsa_shut_down()) |
601 | handle_error(err); |
602 | |
603 | return ret; |
604 | } |
605 | |