1 | //===-------- interface.cpp - Target independent OpenMP target RTL --------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Implementation of the interface to be used by Clang during the codegen of a |
10 | // target region. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "OpenMP/OMPT/Interface.h" |
15 | #include "OpenMP/OMPT/Callback.h" |
16 | #include "PluginManager.h" |
17 | #include "private.h" |
18 | |
19 | #include "Shared/EnvironmentVar.h" |
20 | #include "Shared/Profile.h" |
21 | |
22 | #include "Utils/ExponentialBackoff.h" |
23 | |
24 | #include "llvm/Frontend/OpenMP/OMPConstants.h" |
25 | |
26 | #include <cassert> |
27 | #include <cstdint> |
28 | #include <cstdio> |
29 | #include <cstdlib> |
30 | |
31 | #ifdef OMPT_SUPPORT |
32 | using namespace llvm::omp::target::ompt; |
33 | #endif |
34 | |
////////////////////////////////////////////////////////////////////////////////
/// Stub kept for ABI compatibility: the 'requires' registration entry point
/// has been removed, so this only emits a diagnostic and ignores \p Flags.
EXTERN void __tgt_register_requires(int64_t Flags) {
  // Requirements registered through this legacy entry point are no longer
  // honored by the runtime; warn the user instead of silently dropping them.
  MESSAGE("The %s function has been removed. Old OpenMP requirements will not "
          "be handled",
          __PRETTY_FUNCTION__);
}
42 | |
/// Explicitly initialize the offload runtime.
EXTERN void __tgt_rtl_init() { initRuntime(); }
/// Explicitly tear down the offload runtime.
EXTERN void __tgt_rtl_deinit() { deinitRuntime(); }
45 | |
46 | //////////////////////////////////////////////////////////////////////////////// |
47 | /// adds a target shared library to the target execution image |
48 | EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) { |
49 | initRuntime(); |
50 | if (PM->delayRegisterLib(Desc)) |
51 | return; |
52 | |
53 | PM->registerLib(Desc); |
54 | } |
55 | |
////////////////////////////////////////////////////////////////////////////////
/// Initialize all available devices without registering any image
EXTERN void __tgt_init_all_rtls() {
  // Only valid after the runtime has been initialized (PM must exist).
  assert(PM && "Runtime not initialized");
  PM->initAllPlugins();
}
62 | |
////////////////////////////////////////////////////////////////////////////////
/// unloads a target shared library
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
  // Drop everything associated with this binary descriptor first; the
  // deinitRuntime() below pairs with the initRuntime() done at registration.
  PM->unregisterLib(Desc);

  deinitRuntime();
}
70 | |
71 | template <typename TargetAsyncInfoTy> |
72 | static inline void |
73 | targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
74 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes, |
75 | map_var_info_t *ArgNames, void **ArgMappers, |
76 | TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg, |
77 | const char *RegionName) { |
78 | assert(PM && "Runtime not initialized" ); |
79 | static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, |
80 | "TargetAsyncInfoTy must be convertible to AsyncInfoTy." ); |
81 | |
82 | TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy" , |
83 | "NumArgs=" + std::to_string(val: ArgNum), Loc); |
84 | |
85 | DP("Entering data %s region for device %" PRId64 " with %d mappings\n" , |
86 | RegionName, DeviceId, ArgNum); |
87 | |
88 | if (checkDeviceAndCtors(DeviceId, Loc)) { |
89 | DP("Not offloading to device %" PRId64 "\n" , DeviceId); |
90 | return; |
91 | } |
92 | |
93 | if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) |
94 | printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames, |
95 | RegionTypeMsg); |
96 | #ifdef OMPTARGET_DEBUG |
97 | for (int I = 0; I < ArgNum; ++I) { |
98 | DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 |
99 | ", Type=0x%" PRIx64 ", Name=%s\n" , |
100 | I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I], |
101 | (ArgNames) ? getNameFromMapping(ArgNames[I]).c_str() : "unknown" ); |
102 | } |
103 | #endif |
104 | |
105 | auto DeviceOrErr = PM->getDevice(DeviceId); |
106 | if (!DeviceOrErr) |
107 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
108 | |
109 | TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr); |
110 | AsyncInfoTy &AsyncInfo = TargetAsyncInfo; |
111 | |
112 | /// RAII to establish tool anchors before and after data begin / end / update |
113 | OMPT_IF_BUILT(assert((TargetDataFunction == targetDataBegin || |
114 | TargetDataFunction == targetDataEnd || |
115 | TargetDataFunction == targetDataUpdate) && |
116 | "Encountered unexpected TargetDataFunction during " |
117 | "execution of targetData" ); |
118 | auto CallbackFunctions = |
119 | (TargetDataFunction == targetDataBegin) |
120 | ? RegionInterface.getCallbacks<ompt_target_enter_data>() |
121 | : (TargetDataFunction == targetDataEnd) |
122 | ? RegionInterface.getCallbacks<ompt_target_exit_data>() |
123 | : RegionInterface.getCallbacks<ompt_target_update>(); |
124 | InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId, |
125 | OMPT_GET_RETURN_ADDRESS);) |
126 | |
127 | int Rc = OFFLOAD_SUCCESS; |
128 | Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes, |
129 | ArgTypes, ArgNames, ArgMappers, AsyncInfo, |
130 | false /*FromMapper=*/); |
131 | |
132 | if (Rc == OFFLOAD_SUCCESS) |
133 | Rc = AsyncInfo.synchronize(); |
134 | |
135 | handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); |
136 | } |
137 | |
138 | /// creates host-to-target data mapping, stores it in the |
139 | /// libomptarget.so internal structure (an entry in a stack of data maps) |
140 | /// and passes the data to the device. |
141 | EXTERN void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId, |
142 | int32_t ArgNum, void **ArgsBase, |
143 | void **Args, int64_t *ArgSizes, |
144 | int64_t *ArgTypes, |
145 | map_var_info_t *ArgNames, |
146 | void **ArgMappers) { |
147 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
148 | targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, |
149 | ArgTypes, ArgNames, ArgMappers, targetDataBegin, |
150 | "Entering OpenMP data region with being_mapper" , |
151 | "begin" ); |
152 | } |
153 | |
154 | EXTERN void __tgt_target_data_begin_nowait_mapper( |
155 | ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
156 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, |
157 | void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, |
158 | void *NoAliasDepList) { |
159 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
160 | targetData<TaskAsyncInfoWrapperTy>( |
161 | Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, |
162 | ArgMappers, targetDataBegin, |
163 | "Entering OpenMP data region with being_nowait_mapper" , "begin" ); |
164 | } |
165 | |
166 | /// passes data from the target, releases target memory and destroys |
167 | /// the host-target mapping (top entry from the stack of data maps) |
168 | /// created by the last __tgt_target_data_begin. |
169 | EXTERN void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId, |
170 | int32_t ArgNum, void **ArgsBase, |
171 | void **Args, int64_t *ArgSizes, |
172 | int64_t *ArgTypes, |
173 | map_var_info_t *ArgNames, |
174 | void **ArgMappers) { |
175 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
176 | targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, |
177 | ArgTypes, ArgNames, ArgMappers, targetDataEnd, |
178 | "Exiting OpenMP data region with end_mapper" , "end" ); |
179 | } |
180 | |
181 | EXTERN void __tgt_target_data_end_nowait_mapper( |
182 | ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
183 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, |
184 | void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, |
185 | void *NoAliasDepList) { |
186 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
187 | targetData<TaskAsyncInfoWrapperTy>( |
188 | Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, |
189 | ArgMappers, targetDataEnd, |
190 | "Exiting OpenMP data region with end_nowait_mapper" , "end" ); |
191 | } |
192 | |
193 | EXTERN void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId, |
194 | int32_t ArgNum, void **ArgsBase, |
195 | void **Args, int64_t *ArgSizes, |
196 | int64_t *ArgTypes, |
197 | map_var_info_t *ArgNames, |
198 | void **ArgMappers) { |
199 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
200 | targetData<AsyncInfoTy>( |
201 | Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, |
202 | ArgMappers, targetDataUpdate, |
203 | "Updating data within the OpenMP data region with update_mapper" , |
204 | "update" ); |
205 | } |
206 | |
207 | EXTERN void __tgt_target_data_update_nowait_mapper( |
208 | ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
209 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, |
210 | void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, |
211 | void *NoAliasDepList) { |
212 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
213 | targetData<TaskAsyncInfoWrapperTy>( |
214 | Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, |
215 | ArgMappers, targetDataUpdate, |
216 | "Updating data within the OpenMP data region with update_nowait_mapper" , |
217 | "update" ); |
218 | } |
219 | |
220 | static KernelArgsTy *upgradeKernelArgs(KernelArgsTy *KernelArgs, |
221 | KernelArgsTy &LocalKernelArgs, |
222 | int32_t NumTeams, int32_t ThreadLimit) { |
223 | if (KernelArgs->Version > OMP_KERNEL_ARG_VERSION) |
224 | DP("Unexpected ABI version: %u\n" , KernelArgs->Version); |
225 | |
226 | uint32_t UpgradedVersion = KernelArgs->Version; |
227 | if (KernelArgs->Version < OMP_KERNEL_ARG_VERSION) { |
228 | // The upgraded version will be based on the kernel launch environment. |
229 | if (KernelArgs->Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) |
230 | UpgradedVersion = OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR - 1; |
231 | else |
232 | UpgradedVersion = OMP_KERNEL_ARG_VERSION; |
233 | } |
234 | if (UpgradedVersion != KernelArgs->Version) { |
235 | LocalKernelArgs.Version = UpgradedVersion; |
236 | LocalKernelArgs.NumArgs = KernelArgs->NumArgs; |
237 | LocalKernelArgs.ArgBasePtrs = KernelArgs->ArgBasePtrs; |
238 | LocalKernelArgs.ArgPtrs = KernelArgs->ArgPtrs; |
239 | LocalKernelArgs.ArgSizes = KernelArgs->ArgSizes; |
240 | LocalKernelArgs.ArgTypes = KernelArgs->ArgTypes; |
241 | LocalKernelArgs.ArgNames = KernelArgs->ArgNames; |
242 | LocalKernelArgs.ArgMappers = KernelArgs->ArgMappers; |
243 | LocalKernelArgs.Tripcount = KernelArgs->Tripcount; |
244 | LocalKernelArgs.Flags = KernelArgs->Flags; |
245 | LocalKernelArgs.DynCGroupMem = 0; |
246 | LocalKernelArgs.NumTeams[0] = NumTeams; |
247 | LocalKernelArgs.NumTeams[1] = 0; |
248 | LocalKernelArgs.NumTeams[2] = 0; |
249 | LocalKernelArgs.ThreadLimit[0] = ThreadLimit; |
250 | LocalKernelArgs.ThreadLimit[1] = 0; |
251 | LocalKernelArgs.ThreadLimit[2] = 0; |
252 | return &LocalKernelArgs; |
253 | } |
254 | |
255 | return KernelArgs; |
256 | } |
257 | |
258 | template <typename TargetAsyncInfoTy> |
259 | static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, |
260 | int32_t ThreadLimit, void *HostPtr, |
261 | KernelArgsTy *KernelArgs) { |
262 | assert(PM && "Runtime not initialized" ); |
263 | static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, |
264 | "Target AsyncInfoTy must be convertible to AsyncInfoTy." ); |
265 | DP("Entering target region for device %" PRId64 " with entry point " DPxMOD |
266 | "\n" , |
267 | DeviceId, DPxPTR(HostPtr)); |
268 | |
269 | if (checkDeviceAndCtors(DeviceId, Loc)) { |
270 | DP("Not offloading to device %" PRId64 "\n" , DeviceId); |
271 | return OMP_TGT_FAIL; |
272 | } |
273 | |
274 | bool IsTeams = NumTeams != -1; |
275 | if (!IsTeams) |
276 | KernelArgs->NumTeams[0] = NumTeams = 1; |
277 | |
278 | // Auto-upgrade kernel args version 1 to 2. |
279 | KernelArgsTy LocalKernelArgs; |
280 | KernelArgs = |
281 | upgradeKernelArgs(KernelArgs, LocalKernelArgs, NumTeams, ThreadLimit); |
282 | |
283 | assert(KernelArgs->NumTeams[0] == static_cast<uint32_t>(NumTeams) && |
284 | !KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && |
285 | "OpenMP interface should not use multiple dimensions" ); |
286 | assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) && |
287 | !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && |
288 | "OpenMP interface should not use multiple dimensions" ); |
289 | TIMESCOPE_WITH_DETAILS_AND_IDENT( |
290 | "Runtime: target exe" , |
291 | "NumTeams=" + std::to_string(val: NumTeams) + |
292 | ";NumArgs=" + std::to_string(KernelArgs->NumArgs), |
293 | Loc); |
294 | |
295 | if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) |
296 | printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs, |
297 | KernelArgs->ArgSizes, KernelArgs->ArgTypes, |
298 | KernelArgs->ArgNames, "Entering OpenMP kernel" ); |
299 | #ifdef OMPTARGET_DEBUG |
300 | for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) { |
301 | DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 |
302 | ", Type=0x%" PRIx64 ", Name=%s\n" , |
303 | I, DPxPTR(KernelArgs->ArgBasePtrs[I]), DPxPTR(KernelArgs->ArgPtrs[I]), |
304 | KernelArgs->ArgSizes[I], KernelArgs->ArgTypes[I], |
305 | (KernelArgs->ArgNames) |
306 | ? getNameFromMapping(KernelArgs->ArgNames[I]).c_str() |
307 | : "unknown" ); |
308 | } |
309 | #endif |
310 | |
311 | auto DeviceOrErr = PM->getDevice(DeviceId); |
312 | if (!DeviceOrErr) |
313 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
314 | |
315 | TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr); |
316 | AsyncInfoTy &AsyncInfo = TargetAsyncInfo; |
317 | /// RAII to establish tool anchors before and after target region |
318 | OMPT_IF_BUILT(InterfaceRAII TargetRAII( |
319 | RegionInterface.getCallbacks<ompt_target>(), DeviceId, |
320 | /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) |
321 | |
322 | int Rc = OFFLOAD_SUCCESS; |
323 | Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo); |
324 | { // required to show syncronization |
325 | TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: syncronize" , "" , Loc); |
326 | if (Rc == OFFLOAD_SUCCESS) |
327 | Rc = AsyncInfo.synchronize(); |
328 | |
329 | handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); |
330 | assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!" ); |
331 | } |
332 | return OMP_TGT_SUCCESS; |
333 | } |
334 | |
335 | /// Implements a kernel entry that executes the target region on the specified |
336 | /// device. |
337 | /// |
338 | /// \param Loc Source location associated with this target region. |
339 | /// \param DeviceId The device to execute this region, -1 indicated the default. |
340 | /// \param NumTeams Number of teams to launch the region with, -1 indicates a |
341 | /// non-teams region and 0 indicates it was unspecified. |
342 | /// \param ThreadLimit Limit to the number of threads to use in the kernel |
343 | /// launch, 0 indicates it was unspecified. |
344 | /// \param HostPtr The pointer to the host function registered with the kernel. |
345 | /// \param Args All arguments to this kernel launch (see struct definition). |
346 | EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, |
347 | int32_t ThreadLimit, void *HostPtr, |
348 | KernelArgsTy *KernelArgs) { |
349 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
350 | if (KernelArgs->Flags.NoWait) |
351 | return targetKernel<TaskAsyncInfoWrapperTy>( |
352 | Loc, DeviceId, NumTeams, ThreadLimit, HostPtr, KernelArgs); |
353 | return targetKernel<AsyncInfoTy>(Loc, DeviceId, NumTeams, ThreadLimit, |
354 | HostPtr, KernelArgs); |
355 | } |
356 | |
/// Activates the record replay mechanism.
/// \param DeviceId The device identifier to execute the target region.
/// \param MemorySize The number of bytes to be (pre-)allocated
/// by the bump allocator
/// \param VAddr Base address for the allocation — presumably the requested
/// virtual address for replay reproducibility; TODO confirm with
/// target_activate_rr.
/// \param IsRecord Activates the record replay mechanism in
/// 'record' mode or 'replay' mode.
/// \param SaveOutput Store the device memory after kernel
/// execution on persistent storage
/// \param ReqPtrArgOffset Output parameter filled in by target_activate_rr;
/// exact semantics defined by that callee.
EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
                                        void *VAddr, bool IsRecord,
                                        bool SaveOutput,
                                        uint64_t &ReqPtrArgOffset) {
  assert(PM && "Runtime not initialized");
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  // Rc is only consulted by the assert below; release builds ignore it.
  [[maybe_unused]] int Rc = target_activate_rr(
      *DeviceOrErr, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset);
  assert(Rc == OFFLOAD_SUCCESS &&
         "__tgt_activate_record_replay unexpected failure!");
  return OMP_TGT_SUCCESS;
}
381 | |
382 | /// Implements a target kernel entry that replays a pre-recorded kernel. |
383 | /// \param Loc Source location associated with this target region (unused). |
384 | /// \param DeviceId The device identifier to execute the target region. |
385 | /// \param HostPtr A pointer to an address that uniquely identifies the kernel. |
386 | /// \param DeviceMemory A pointer to an array storing device memory data to move |
387 | /// prior to kernel execution. |
388 | /// \param DeviceMemorySize The size of the above device memory data in bytes. |
389 | /// \param TgtArgs An array of pointers of the pre-recorded target kernel |
390 | /// arguments. |
391 | /// \param TgtOffsets An array of pointers of the pre-recorded target kernel |
392 | /// argument offsets. |
393 | /// \param NumArgs The number of kernel arguments. |
394 | /// \param NumTeams Number of teams to launch the target region with. |
395 | /// \param ThreadLimit Limit to the number of threads to use in kernel |
396 | /// execution. |
397 | /// \param LoopTripCount The pre-recorded value of the loop tripcount, if any. |
398 | /// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure. |
399 | EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, |
400 | void *HostPtr, void *DeviceMemory, |
401 | int64_t DeviceMemorySize, void **TgtArgs, |
402 | ptrdiff_t *TgtOffsets, int32_t NumArgs, |
403 | int32_t NumTeams, int32_t ThreadLimit, |
404 | uint64_t LoopTripCount) { |
405 | assert(PM && "Runtime not initialized" ); |
406 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
407 | if (checkDeviceAndCtors(DeviceId, Loc)) { |
408 | DP("Not offloading to device %" PRId64 "\n" , DeviceId); |
409 | return OMP_TGT_FAIL; |
410 | } |
411 | auto DeviceOrErr = PM->getDevice(DeviceId); |
412 | if (!DeviceOrErr) |
413 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
414 | |
415 | /// RAII to establish tool anchors before and after target region |
416 | OMPT_IF_BUILT(InterfaceRAII TargetRAII( |
417 | RegionInterface.getCallbacks<ompt_target>(), DeviceId, |
418 | /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) |
419 | |
420 | AsyncInfoTy AsyncInfo(*DeviceOrErr); |
421 | int Rc = target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory, |
422 | DeviceMemorySize, TgtArgs, TgtOffsets, NumArgs, |
423 | NumTeams, ThreadLimit, LoopTripCount, AsyncInfo); |
424 | if (Rc == OFFLOAD_SUCCESS) |
425 | Rc = AsyncInfo.synchronize(); |
426 | handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); |
427 | assert(Rc == OFFLOAD_SUCCESS && |
428 | "__tgt_target_kernel_replay unexpected failure!" ); |
429 | return OMP_TGT_SUCCESS; |
430 | } |
431 | |
432 | // Get the current number of components for a user-defined mapper. |
433 | EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) { |
434 | auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle; |
435 | int64_t Size = MapperComponentsPtr->Components.size(); |
436 | DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n" , |
437 | DPxPTR(RtMapperHandle), Size); |
438 | return Size; |
439 | } |
440 | |
441 | // Push back one component for a user-defined mapper. |
442 | EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base, |
443 | void *Begin, int64_t Size, int64_t Type, |
444 | void *Name) { |
445 | DP("__tgt_push_mapper_component(Handle=" DPxMOD |
446 | ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 |
447 | ", Type=0x%" PRIx64 ", Name=%s).\n" , |
448 | DPxPTR(RtMapperHandle), DPxPTR(Base), DPxPTR(Begin), Size, Type, |
449 | (Name) ? getNameFromMapping(Name).c_str() : "unknown" ); |
450 | auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle; |
451 | MapperComponentsPtr->Components.push_back( |
452 | MapComponentInfoTy(Base, Begin, Size, Type, Name)); |
453 | } |
454 | |
455 | EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) { |
456 | assert(PM && "Runtime not initialized" ); |
457 | std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); |
458 | InfoLevel.store(NewInfoLevel); |
459 | for (auto &R : PM->pluginAdaptors()) |
460 | R.set_info_flag(NewInfoLevel); |
461 | } |
462 | |
463 | EXTERN int __tgt_print_device_info(int64_t DeviceId) { |
464 | assert(PM && "Runtime not initialized" ); |
465 | auto DeviceOrErr = PM->getDevice(DeviceId); |
466 | if (!DeviceOrErr) |
467 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
468 | |
469 | return DeviceOrErr->printDeviceInfo(); |
470 | } |
471 | |
472 | EXTERN void __tgt_target_nowait_query(void **AsyncHandle) { |
473 | assert(PM && "Runtime not initialized" ); |
474 | OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); |
475 | |
476 | if (!AsyncHandle || !*AsyncHandle) { |
477 | FATAL_MESSAGE0( |
478 | 1, "Receive an invalid async handle from the current OpenMP task. Is " |
479 | "this a target nowait region?\n" ); |
480 | } |
481 | |
482 | // Exponential backoff tries to optimally decide if a thread should just query |
483 | // for the device operations (work/spin wait on them) or block until they are |
484 | // completed (use device side blocking mechanism). This allows the runtime to |
485 | // adapt itself when there are a lot of long-running target regions in-flight. |
486 | static thread_local utils::ExponentialBackoff QueryCounter( |
487 | Int64Envar("OMPTARGET_QUERY_COUNT_MAX" , 10), |
488 | Int64Envar("OMPTARGET_QUERY_COUNT_THRESHOLD" , 5), |
489 | Envar<float>("OMPTARGET_QUERY_COUNT_BACKOFF_FACTOR" , 0.5f)); |
490 | |
491 | auto *AsyncInfo = (AsyncInfoTy *)*AsyncHandle; |
492 | |
493 | // If the thread is actively waiting on too many target nowait regions, we |
494 | // should use the blocking sync type. |
495 | if (QueryCounter.isAboveThreshold()) |
496 | AsyncInfo->SyncType = AsyncInfoTy::SyncTy::BLOCKING; |
497 | |
498 | if (AsyncInfo->synchronize()) |
499 | FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n" ); |
500 | // If there are device operations still pending, return immediately without |
501 | // deallocating the handle and increase the current thread query count. |
502 | if (!AsyncInfo->isDone()) { |
503 | QueryCounter.increment(); |
504 | return; |
505 | } |
506 | |
507 | // When a thread successfully completes a target nowait region, we |
508 | // exponentially backoff its query counter by the query factor. |
509 | QueryCounter.decrement(); |
510 | |
511 | // Delete the handle and unset it from the OpenMP task data. |
512 | delete AsyncInfo; |
513 | *AsyncHandle = nullptr; |
514 | } |
515 | |