1 | //===-------- interface.cpp - Target independent OpenMP target RTL --------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Implementation of the interface to be used by Clang during the codegen of a |
10 | // target region. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "OpenMP/OMPT/Interface.h" |
15 | #include "OffloadPolicy.h" |
16 | #include "OpenMP/OMPT/Callback.h" |
17 | #include "OpenMP/omp.h" |
18 | #include "PluginManager.h" |
19 | #include "omptarget.h" |
20 | #include "private.h" |
21 | |
22 | #include "Shared/EnvironmentVar.h" |
23 | #include "Shared/Profile.h" |
24 | |
25 | #include "Utils/ExponentialBackoff.h" |
26 | |
27 | #include "llvm/Frontend/OpenMP/OMPConstants.h" |
28 | |
29 | #include <cassert> |
30 | #include <cstdint> |
31 | #include <cstdio> |
32 | #include <cstdlib> |
33 | |
34 | #ifdef OMPT_SUPPORT |
35 | using namespace llvm::omp::target::ompt; |
36 | #endif |
37 | |
38 | // If offload is enabled, ensure that device DeviceID has been initialized. |
39 | // |
40 | // The return bool indicates if the offload is to the host device |
41 | // There are three possible results: |
42 | // - Return false if the target device is ready for offload |
43 | // - Return true without reporting a runtime error if offload is |
44 | // disabled, perhaps because the initial device was specified. |
45 | // - Report a runtime error and return true. |
46 | // |
47 | // If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device. |
48 | // This step might be skipped if offload is disabled. |
49 | bool checkDevice(int64_t &DeviceID, ident_t *Loc) { |
50 | if (OffloadPolicy::get(*PM).Kind == OffloadPolicy::DISABLED) { |
51 | DP("Offload is disabled\n" ); |
52 | return true; |
53 | } |
54 | |
55 | if (DeviceID == OFFLOAD_DEVICE_DEFAULT) { |
56 | DeviceID = omp_get_default_device(); |
57 | DP("Use default device id %" PRId64 "\n" , DeviceID); |
58 | } |
59 | |
60 | // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669. |
61 | if (omp_get_num_devices() == 0) { |
62 | DP("omp_get_num_devices() == 0 but offload is manadatory\n" ); |
63 | handleTargetOutcome(false, Loc); |
64 | return true; |
65 | } |
66 | |
67 | if (DeviceID == omp_get_initial_device()) { |
68 | DP("Device is host (%" PRId64 "), returning as if offload is disabled\n" , |
69 | DeviceID); |
70 | return true; |
71 | } |
72 | return false; |
73 | } |
74 | |
75 | //////////////////////////////////////////////////////////////////////////////// |
76 | /// adds requires flags |
77 | EXTERN void __tgt_register_requires(int64_t Flags) { |
78 | MESSAGE("The %s function has been removed. Old OpenMP requirements will not " |
79 | "be handled" , |
80 | __PRETTY_FUNCTION__); |
81 | } |
82 | |
83 | EXTERN void __tgt_rtl_init() { initRuntime(); } |
84 | EXTERN void __tgt_rtl_deinit() { deinitRuntime(); } |
85 | |
86 | //////////////////////////////////////////////////////////////////////////////// |
87 | /// adds a target shared library to the target execution image |
88 | EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) { |
89 | initRuntime(); |
90 | if (PM->delayRegisterLib(Desc)) |
91 | return; |
92 | |
93 | PM->registerLib(Desc); |
94 | } |
95 | |
96 | //////////////////////////////////////////////////////////////////////////////// |
97 | /// Initialize all available devices without registering any image |
98 | EXTERN void __tgt_init_all_rtls() { |
99 | assert(PM && "Runtime not initialized" ); |
100 | PM->initializeAllDevices(); |
101 | } |
102 | |
103 | //////////////////////////////////////////////////////////////////////////////// |
104 | /// unloads a target shared library |
105 | EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) { |
106 | PM->unregisterLib(Desc); |
107 | |
108 | deinitRuntime(); |
109 | } |
110 | |
111 | template <typename TargetAsyncInfoTy> |
112 | static inline void |
113 | targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
114 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes, |
115 | map_var_info_t *ArgNames, void **ArgMappers, |
116 | TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg, |
117 | const char *RegionName) { |
118 | assert(PM && "Runtime not initialized" ); |
119 | static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, |
120 | "TargetAsyncInfoTy must be convertible to AsyncInfoTy." ); |
121 | |
122 | TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy" , |
123 | "NumArgs=" + std::to_string(val: ArgNum), Loc); |
124 | |
125 | DP("Entering data %s region for device %" PRId64 " with %d mappings\n" , |
126 | RegionName, DeviceId, ArgNum); |
127 | |
128 | if (checkDevice(DeviceId, Loc)) { |
129 | DP("Not offloading to device %" PRId64 "\n" , DeviceId); |
130 | return; |
131 | } |
132 | |
133 | if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) |
134 | printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames, |
135 | RegionTypeMsg); |
136 | #ifdef OMPTARGET_DEBUG |
137 | for (int I = 0; I < ArgNum; ++I) { |
138 | DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 |
139 | ", Type=0x%" PRIx64 ", Name=%s\n" , |
140 | I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I], |
141 | (ArgNames) ? getNameFromMapping(ArgNames[I]).c_str() : "unknown" ); |
142 | } |
143 | #endif |
144 | |
145 | auto DeviceOrErr = PM->getDevice(DeviceId); |
146 | if (!DeviceOrErr) |
147 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
148 | |
149 | TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr); |
150 | AsyncInfoTy &AsyncInfo = TargetAsyncInfo; |
151 | |
152 | /// RAII to establish tool anchors before and after data begin / end / update |
153 | OMPT_IF_BUILT(assert((TargetDataFunction == targetDataBegin || |
154 | TargetDataFunction == targetDataEnd || |
155 | TargetDataFunction == targetDataUpdate) && |
156 | "Encountered unexpected TargetDataFunction during " |
157 | "execution of targetData" ); |
158 | auto CallbackFunctions = |
159 | (TargetDataFunction == targetDataBegin) |
160 | ? RegionInterface.getCallbacks<ompt_target_enter_data>() |
161 | : (TargetDataFunction == targetDataEnd) |
162 | ? RegionInterface.getCallbacks<ompt_target_exit_data>() |
163 | : RegionInterface.getCallbacks<ompt_target_update>(); |
164 | InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId, |
165 | OMPT_GET_RETURN_ADDRESS);) |
166 | |
167 | int Rc = OFFLOAD_SUCCESS; |
168 | Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes, |
169 | ArgTypes, ArgNames, ArgMappers, AsyncInfo, |
170 | false /*FromMapper=*/); |
171 | |
172 | if (Rc == OFFLOAD_SUCCESS) |
173 | Rc = AsyncInfo.synchronize(); |
174 | |
175 | handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); |
176 | } |
177 | |
178 | /// creates host-to-target data mapping, stores it in the |
179 | /// libomptarget.so internal structure (an entry in a stack of data maps) |
180 | /// and passes the data to the device. |
181 | EXTERN void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId, |
182 | int32_t ArgNum, void **ArgsBase, |
183 | void **Args, int64_t *ArgSizes, |
184 | int64_t *ArgTypes, |
185 | map_var_info_t *ArgNames, |
186 | void **ArgMappers) { |
187 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
188 | targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, |
189 | ArgTypes, ArgNames, ArgMappers, targetDataBegin, |
190 | "Entering OpenMP data region with being_mapper" , |
191 | "begin" ); |
192 | } |
193 | |
194 | EXTERN void __tgt_target_data_begin_nowait_mapper( |
195 | ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
196 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, |
197 | void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, |
198 | void *NoAliasDepList) { |
199 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
200 | targetData<TaskAsyncInfoWrapperTy>( |
201 | Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, |
202 | ArgMappers, targetDataBegin, |
203 | "Entering OpenMP data region with being_nowait_mapper" , "begin" ); |
204 | } |
205 | |
206 | /// passes data from the target, releases target memory and destroys |
207 | /// the host-target mapping (top entry from the stack of data maps) |
208 | /// created by the last __tgt_target_data_begin. |
209 | EXTERN void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId, |
210 | int32_t ArgNum, void **ArgsBase, |
211 | void **Args, int64_t *ArgSizes, |
212 | int64_t *ArgTypes, |
213 | map_var_info_t *ArgNames, |
214 | void **ArgMappers) { |
215 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
216 | targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, |
217 | ArgTypes, ArgNames, ArgMappers, targetDataEnd, |
218 | "Exiting OpenMP data region with end_mapper" , "end" ); |
219 | } |
220 | |
221 | EXTERN void __tgt_target_data_end_nowait_mapper( |
222 | ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
223 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, |
224 | void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, |
225 | void *NoAliasDepList) { |
226 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
227 | targetData<TaskAsyncInfoWrapperTy>( |
228 | Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, |
229 | ArgMappers, targetDataEnd, |
230 | "Exiting OpenMP data region with end_nowait_mapper" , "end" ); |
231 | } |
232 | |
233 | EXTERN void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId, |
234 | int32_t ArgNum, void **ArgsBase, |
235 | void **Args, int64_t *ArgSizes, |
236 | int64_t *ArgTypes, |
237 | map_var_info_t *ArgNames, |
238 | void **ArgMappers) { |
239 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
240 | targetData<AsyncInfoTy>( |
241 | Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, |
242 | ArgMappers, targetDataUpdate, |
243 | "Updating data within the OpenMP data region with update_mapper" , |
244 | "update" ); |
245 | } |
246 | |
247 | EXTERN void __tgt_target_data_update_nowait_mapper( |
248 | ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
249 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, |
250 | void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, |
251 | void *NoAliasDepList) { |
252 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
253 | targetData<TaskAsyncInfoWrapperTy>( |
254 | Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, |
255 | ArgMappers, targetDataUpdate, |
256 | "Updating data within the OpenMP data region with update_nowait_mapper" , |
257 | "update" ); |
258 | } |
259 | |
260 | static KernelArgsTy *upgradeKernelArgs(KernelArgsTy *KernelArgs, |
261 | KernelArgsTy &LocalKernelArgs, |
262 | int32_t NumTeams, int32_t ThreadLimit) { |
263 | if (KernelArgs->Version > OMP_KERNEL_ARG_VERSION) |
264 | DP("Unexpected ABI version: %u\n" , KernelArgs->Version); |
265 | |
266 | uint32_t UpgradedVersion = KernelArgs->Version; |
267 | if (KernelArgs->Version < OMP_KERNEL_ARG_VERSION) { |
268 | // The upgraded version will be based on the kernel launch environment. |
269 | if (KernelArgs->Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) |
270 | UpgradedVersion = OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR - 1; |
271 | else |
272 | UpgradedVersion = OMP_KERNEL_ARG_VERSION; |
273 | } |
274 | if (UpgradedVersion != KernelArgs->Version) { |
275 | LocalKernelArgs.Version = UpgradedVersion; |
276 | LocalKernelArgs.NumArgs = KernelArgs->NumArgs; |
277 | LocalKernelArgs.ArgBasePtrs = KernelArgs->ArgBasePtrs; |
278 | LocalKernelArgs.ArgPtrs = KernelArgs->ArgPtrs; |
279 | LocalKernelArgs.ArgSizes = KernelArgs->ArgSizes; |
280 | LocalKernelArgs.ArgTypes = KernelArgs->ArgTypes; |
281 | LocalKernelArgs.ArgNames = KernelArgs->ArgNames; |
282 | LocalKernelArgs.ArgMappers = KernelArgs->ArgMappers; |
283 | LocalKernelArgs.Tripcount = KernelArgs->Tripcount; |
284 | LocalKernelArgs.Flags = KernelArgs->Flags; |
285 | LocalKernelArgs.DynCGroupMem = 0; |
286 | LocalKernelArgs.NumTeams[0] = NumTeams; |
287 | LocalKernelArgs.NumTeams[1] = 1; |
288 | LocalKernelArgs.NumTeams[2] = 1; |
289 | LocalKernelArgs.ThreadLimit[0] = ThreadLimit; |
290 | LocalKernelArgs.ThreadLimit[1] = 1; |
291 | LocalKernelArgs.ThreadLimit[2] = 1; |
292 | return &LocalKernelArgs; |
293 | } |
294 | |
295 | // FIXME: This is a WA to "calibrate" the bad work done in the front end. |
296 | // Delete this ugly code after the front end emits proper values. |
297 | auto CorrectMultiDim = [](uint32_t (&Val)[3]) { |
298 | if (Val[1] == 0) |
299 | Val[1] = 1; |
300 | if (Val[2] == 0) |
301 | Val[2] = 1; |
302 | }; |
303 | CorrectMultiDim(KernelArgs->ThreadLimit); |
304 | CorrectMultiDim(KernelArgs->NumTeams); |
305 | |
306 | return KernelArgs; |
307 | } |
308 | |
309 | template <typename TargetAsyncInfoTy> |
310 | static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, |
311 | int32_t ThreadLimit, void *HostPtr, |
312 | KernelArgsTy *KernelArgs) { |
313 | assert(PM && "Runtime not initialized" ); |
314 | static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, |
315 | "Target AsyncInfoTy must be convertible to AsyncInfoTy." ); |
316 | DP("Entering target region for device %" PRId64 " with entry point " DPxMOD |
317 | "\n" , |
318 | DeviceId, DPxPTR(HostPtr)); |
319 | |
320 | if (checkDevice(DeviceId, Loc)) { |
321 | DP("Not offloading to device %" PRId64 "\n" , DeviceId); |
322 | return OMP_TGT_FAIL; |
323 | } |
324 | |
325 | bool IsTeams = NumTeams != -1; |
326 | if (!IsTeams) |
327 | KernelArgs->NumTeams[0] = NumTeams = 1; |
328 | |
329 | // Auto-upgrade kernel args version 1 to 2. |
330 | KernelArgsTy LocalKernelArgs; |
331 | KernelArgs = |
332 | upgradeKernelArgs(KernelArgs, LocalKernelArgs, NumTeams, ThreadLimit); |
333 | |
334 | TIMESCOPE_WITH_DETAILS_AND_IDENT( |
335 | "Runtime: target exe" , |
336 | "NumTeams=" + std::to_string(val: NumTeams) + |
337 | ";NumArgs=" + std::to_string(KernelArgs->NumArgs), |
338 | Loc); |
339 | |
340 | if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) |
341 | printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs, |
342 | KernelArgs->ArgSizes, KernelArgs->ArgTypes, |
343 | KernelArgs->ArgNames, "Entering OpenMP kernel" ); |
344 | #ifdef OMPTARGET_DEBUG |
345 | for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) { |
346 | DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 |
347 | ", Type=0x%" PRIx64 ", Name=%s\n" , |
348 | I, DPxPTR(KernelArgs->ArgBasePtrs[I]), DPxPTR(KernelArgs->ArgPtrs[I]), |
349 | KernelArgs->ArgSizes[I], KernelArgs->ArgTypes[I], |
350 | (KernelArgs->ArgNames) |
351 | ? getNameFromMapping(KernelArgs->ArgNames[I]).c_str() |
352 | : "unknown" ); |
353 | } |
354 | #endif |
355 | |
356 | auto DeviceOrErr = PM->getDevice(DeviceId); |
357 | if (!DeviceOrErr) |
358 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
359 | |
360 | TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr); |
361 | AsyncInfoTy &AsyncInfo = TargetAsyncInfo; |
362 | /// RAII to establish tool anchors before and after target region |
363 | OMPT_IF_BUILT(InterfaceRAII TargetRAII( |
364 | RegionInterface.getCallbacks<ompt_target>(), DeviceId, |
365 | /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) |
366 | |
367 | int Rc = OFFLOAD_SUCCESS; |
368 | Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo); |
369 | { // required to show synchronization |
370 | TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: synchronize" , "" , Loc); |
371 | if (Rc == OFFLOAD_SUCCESS) |
372 | Rc = AsyncInfo.synchronize(); |
373 | |
374 | handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); |
375 | assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!" ); |
376 | } |
377 | return OMP_TGT_SUCCESS; |
378 | } |
379 | |
380 | /// Implements a kernel entry that executes the target region on the specified |
381 | /// device. |
382 | /// |
383 | /// \param Loc Source location associated with this target region. |
384 | /// \param DeviceId The device to execute this region, -1 indicated the default. |
385 | /// \param NumTeams Number of teams to launch the region with, -1 indicates a |
386 | /// non-teams region and 0 indicates it was unspecified. |
387 | /// \param ThreadLimit Limit to the number of threads to use in the kernel |
388 | /// launch, 0 indicates it was unspecified. |
389 | /// \param HostPtr The pointer to the host function registered with the kernel. |
390 | /// \param Args All arguments to this kernel launch (see struct definition). |
391 | EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, |
392 | int32_t ThreadLimit, void *HostPtr, |
393 | KernelArgsTy *KernelArgs) { |
394 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
395 | if (KernelArgs->Flags.NoWait) |
396 | return targetKernel<TaskAsyncInfoWrapperTy>( |
397 | Loc, DeviceId, NumTeams, ThreadLimit, HostPtr, KernelArgs); |
398 | return targetKernel<AsyncInfoTy>(Loc, DeviceId, NumTeams, ThreadLimit, |
399 | HostPtr, KernelArgs); |
400 | } |
401 | |
402 | /// Activates the record replay mechanism. |
403 | /// \param DeviceId The device identifier to execute the target region. |
404 | /// \param MemorySize The number of bytes to be (pre-)allocated |
405 | /// by the bump allocator |
406 | /// /param IsRecord Activates the record replay mechanism in |
407 | /// 'record' mode or 'replay' mode. |
408 | /// /param SaveOutput Store the device memory after kernel |
409 | /// execution on persistent storage |
410 | EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, |
411 | void *VAddr, bool IsRecord, |
412 | bool SaveOutput, |
413 | uint64_t &ReqPtrArgOffset) { |
414 | assert(PM && "Runtime not initialized" ); |
415 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
416 | auto DeviceOrErr = PM->getDevice(DeviceId); |
417 | if (!DeviceOrErr) |
418 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
419 | |
420 | [[maybe_unused]] int Rc = target_activate_rr( |
421 | *DeviceOrErr, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset); |
422 | assert(Rc == OFFLOAD_SUCCESS && |
423 | "__tgt_activate_record_replay unexpected failure!" ); |
424 | return OMP_TGT_SUCCESS; |
425 | } |
426 | |
427 | /// Implements a target kernel entry that replays a pre-recorded kernel. |
428 | /// \param Loc Source location associated with this target region (unused). |
429 | /// \param DeviceId The device identifier to execute the target region. |
430 | /// \param HostPtr A pointer to an address that uniquely identifies the kernel. |
431 | /// \param DeviceMemory A pointer to an array storing device memory data to move |
432 | /// prior to kernel execution. |
433 | /// \param DeviceMemorySize The size of the above device memory data in bytes. |
434 | /// \param TgtArgs An array of pointers of the pre-recorded target kernel |
435 | /// arguments. |
436 | /// \param TgtOffsets An array of pointers of the pre-recorded target kernel |
437 | /// argument offsets. |
438 | /// \param NumArgs The number of kernel arguments. |
439 | /// \param NumTeams Number of teams to launch the target region with. |
440 | /// \param ThreadLimit Limit to the number of threads to use in kernel |
441 | /// execution. |
442 | /// \param LoopTripCount The pre-recorded value of the loop tripcount, if any. |
443 | /// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure. |
444 | EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, |
445 | void *HostPtr, void *DeviceMemory, |
446 | int64_t DeviceMemorySize, void **TgtArgs, |
447 | ptrdiff_t *TgtOffsets, int32_t NumArgs, |
448 | int32_t NumTeams, int32_t ThreadLimit, |
449 | uint64_t LoopTripCount) { |
450 | assert(PM && "Runtime not initialized" ); |
451 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
452 | if (checkDevice(DeviceId, Loc)) { |
453 | DP("Not offloading to device %" PRId64 "\n" , DeviceId); |
454 | return OMP_TGT_FAIL; |
455 | } |
456 | auto DeviceOrErr = PM->getDevice(DeviceId); |
457 | if (!DeviceOrErr) |
458 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
459 | |
460 | /// RAII to establish tool anchors before and after target region |
461 | OMPT_IF_BUILT(InterfaceRAII TargetRAII( |
462 | RegionInterface.getCallbacks<ompt_target>(), DeviceId, |
463 | /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) |
464 | |
465 | AsyncInfoTy AsyncInfo(*DeviceOrErr); |
466 | int Rc = target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory, |
467 | DeviceMemorySize, TgtArgs, TgtOffsets, NumArgs, |
468 | NumTeams, ThreadLimit, LoopTripCount, AsyncInfo); |
469 | if (Rc == OFFLOAD_SUCCESS) |
470 | Rc = AsyncInfo.synchronize(); |
471 | handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); |
472 | assert(Rc == OFFLOAD_SUCCESS && |
473 | "__tgt_target_kernel_replay unexpected failure!" ); |
474 | return OMP_TGT_SUCCESS; |
475 | } |
476 | |
477 | // Get the current number of components for a user-defined mapper. |
478 | EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) { |
479 | auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle; |
480 | int64_t Size = MapperComponentsPtr->Components.size(); |
481 | DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n" , |
482 | DPxPTR(RtMapperHandle), Size); |
483 | return Size; |
484 | } |
485 | |
486 | // Push back one component for a user-defined mapper. |
487 | EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base, |
488 | void *Begin, int64_t Size, int64_t Type, |
489 | void *Name) { |
490 | DP("__tgt_push_mapper_component(Handle=" DPxMOD |
491 | ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 |
492 | ", Type=0x%" PRIx64 ", Name=%s).\n" , |
493 | DPxPTR(RtMapperHandle), DPxPTR(Base), DPxPTR(Begin), Size, Type, |
494 | (Name) ? getNameFromMapping(Name).c_str() : "unknown" ); |
495 | auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle; |
496 | MapperComponentsPtr->Components.push_back( |
497 | MapComponentInfoTy(Base, Begin, Size, Type, Name)); |
498 | } |
499 | |
500 | EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) { |
501 | assert(PM && "Runtime not initialized" ); |
502 | std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); |
503 | InfoLevel.store(NewInfoLevel); |
504 | } |
505 | |
506 | EXTERN int __tgt_print_device_info(int64_t DeviceId) { |
507 | assert(PM && "Runtime not initialized" ); |
508 | auto DeviceOrErr = PM->getDevice(DeviceId); |
509 | if (!DeviceOrErr) |
510 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
511 | |
512 | return DeviceOrErr->printDeviceInfo(); |
513 | } |
514 | |
515 | EXTERN void __tgt_target_nowait_query(void **AsyncHandle) { |
516 | assert(PM && "Runtime not initialized" ); |
517 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
518 | |
519 | if (!AsyncHandle || !*AsyncHandle) { |
520 | FATAL_MESSAGE0( |
521 | 1, "Receive an invalid async handle from the current OpenMP task. Is " |
522 | "this a target nowait region?\n" ); |
523 | } |
524 | |
525 | // Exponential backoff tries to optimally decide if a thread should just query |
526 | // for the device operations (work/spin wait on them) or block until they are |
527 | // completed (use device side blocking mechanism). This allows the runtime to |
528 | // adapt itself when there are a lot of long-running target regions in-flight. |
529 | static thread_local utils::ExponentialBackoff QueryCounter( |
530 | Int64Envar("OMPTARGET_QUERY_COUNT_MAX" , 10), |
531 | Int64Envar("OMPTARGET_QUERY_COUNT_THRESHOLD" , 5), |
532 | Envar<float>("OMPTARGET_QUERY_COUNT_BACKOFF_FACTOR" , 0.5f)); |
533 | |
534 | auto *AsyncInfo = (AsyncInfoTy *)*AsyncHandle; |
535 | |
536 | // If the thread is actively waiting on too many target nowait regions, we |
537 | // should use the blocking sync type. |
538 | if (QueryCounter.isAboveThreshold()) |
539 | AsyncInfo->SyncType = AsyncInfoTy::SyncTy::BLOCKING; |
540 | |
541 | if (AsyncInfo->synchronize()) |
542 | FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n" ); |
543 | // If there are device operations still pending, return immediately without |
544 | // deallocating the handle and increase the current thread query count. |
545 | if (!AsyncInfo->isDone()) { |
546 | QueryCounter.increment(); |
547 | return; |
548 | } |
549 | |
550 | // When a thread successfully completes a target nowait region, we |
551 | // exponentially backoff its query counter by the query factor. |
552 | QueryCounter.decrement(); |
553 | |
554 | // Delete the handle and unset it from the OpenMP task data. |
555 | delete AsyncInfo; |
556 | *AsyncHandle = nullptr; |
557 | } |
558 | |