//===-------- interface.cpp - Target independent OpenMP target RTL --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "OpenMP/OMPT/Interface.h"
#include "OffloadPolicy.h"
#include "OpenMP/OMPT/Callback.h"
#include "OpenMP/omp.h"
#include "PluginManager.h"
#include "omptarget.h"
#include "private.h"

#include "Shared/EnvironmentVar.h"
#include "Shared/Profile.h"

#include "Utils/ExponentialBackoff.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
#endif

// If offload is enabled, ensure that device DeviceID has been initialized.
//
// The returned bool indicates whether the region should execute on the host
// device instead of being offloaded. There are three possible outcomes:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDevice(int64_t &DeviceID, ident_t *Loc) {
  if (OffloadPolicy::get(*PM).Kind == OffloadPolicy::DISABLED) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0, but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }
  return false;
}
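
// A minimal sketch of the intended call pattern (mirroring its use in
// targetData and targetKernel below); note that a 'true' result means "do not
// offload here", which is not necessarily an error:
//
//   if (checkDevice(DeviceId, Loc)) {
//     DP("Not offloading to device %" PRId64 "\n", DeviceId);
//     return; // Offload disabled, no devices, or the host device requested.
//   }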

////////////////////////////////////////////////////////////////////////////////
/// Reports that the deprecated 'requires' registration has been removed; the
/// flags are no longer processed.
EXTERN void __tgt_register_requires(int64_t Flags) {
  MESSAGE("The %s function has been removed. Old OpenMP requirements will not "
          "be handled",
          __PRETTY_FUNCTION__);
}

EXTERN void __tgt_rtl_init() { initRuntime(); }
EXTERN void __tgt_rtl_deinit() { deinitRuntime(); }

////////////////////////////////////////////////////////////////////////////////
/// adds a target shared library to the target execution image
EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
  initRuntime();
  if (PM->delayRegisterLib(Desc))
    return;

  PM->registerLib(Desc);
}
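
// Illustrative only: registration is normally driven by compiler-generated
// global constructors/destructors, roughly equivalent to the following sketch
// (the descriptor contents and helper names are hypothetical):
//
//   static __tgt_bin_desc Desc = {/* device images and offload entries */};
//   __attribute__((constructor)) static void InitOffloadLib() {
//     __tgt_register_lib(&Desc);
//   }
//   __attribute__((destructor)) static void DeinitOffloadLib() {
//     __tgt_unregister_lib(&Desc);
//   }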

////////////////////////////////////////////////////////////////////////////////
/// Initialize all available devices without registering any image
EXTERN void __tgt_init_all_rtls() {
  assert(PM && "Runtime not initialized");
  PM->initializeAllDevices();
}

////////////////////////////////////////////////////////////////////////////////
/// unloads a target shared library
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
  PM->unregisterLib(Desc);

  deinitRuntime();
}

template <typename TargetAsyncInfoTy>
static inline void
targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
           void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
           map_var_info_t *ArgNames, void **ArgMappers,
           TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg,
           const char *RegionName) {
  assert(PM && "Runtime not initialized");
  static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");

  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
                                   "NumArgs=" + std::to_string(ArgNum), Loc);

  DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
     RegionName, DeviceId, ArgNum);

  if (checkDevice(DeviceId, Loc)) {
    DP("Not offloading to device %" PRId64 "\n", DeviceId);
    return;
  }

  if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
    printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames,
                         RegionTypeMsg);
#ifdef OMPTARGET_DEBUG
  for (int I = 0; I < ArgNum; ++I) {
    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
       ", Type=0x%" PRIx64 ", Name=%s\n",
       I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I],
       (ArgNames) ? getNameFromMapping(ArgNames[I]).c_str() : "unknown");
  }
#endif

  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
  AsyncInfoTy &AsyncInfo = TargetAsyncInfo;

  /// RAII to establish tool anchors before and after data begin / end / update
  OMPT_IF_BUILT(assert((TargetDataFunction == targetDataBegin ||
                        TargetDataFunction == targetDataEnd ||
                        TargetDataFunction == targetDataUpdate) &&
                       "Encountered unexpected TargetDataFunction during "
                       "execution of targetData");
                auto CallbackFunctions =
                    (TargetDataFunction == targetDataBegin)
                        ? RegionInterface.getCallbacks<ompt_target_enter_data>()
                    : (TargetDataFunction == targetDataEnd)
                        ? RegionInterface.getCallbacks<ompt_target_exit_data>()
                        : RegionInterface.getCallbacks<ompt_target_update>();
                InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId,
                                             OMPT_GET_RETURN_ADDRESS);)

  int Rc = OFFLOAD_SUCCESS;
  Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo,
                          false /*FromMapper=*/);

  if (Rc == OFFLOAD_SUCCESS)
    Rc = AsyncInfo.synchronize();

  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
}

/// creates host-to-target data mapping, stores it in the
/// libomptarget.so internal structure (an entry in a stack of data maps)
/// and passes the data to the device.
EXTERN void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
                                           int32_t ArgNum, void **ArgsBase,
                                           void **Args, int64_t *ArgSizes,
                                           int64_t *ArgTypes,
                                           map_var_info_t *ArgNames,
                                           void **ArgMappers) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, targetDataBegin,
                          "Entering OpenMP data region with begin_mapper",
                          "begin");
}
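
// Rough sketch of the call Clang might emit for a construct such as
// '#pragma omp target enter data map(to: A[0:N])'; the values below are
// illustrative, not the exact encoding the compiler produces:
//
//   void *Base = A, *Begin = A;
//   int64_t Size = N * sizeof(A[0]);
//   int64_t Type = OMP_TGT_MAPTYPE_TO;
//   __tgt_target_data_begin_mapper(Loc, /*DeviceId=*/-1, /*ArgNum=*/1, &Base,
//                                  &Begin, &Size, &Type, /*ArgNames=*/nullptr,
//                                  /*ArgMappers=*/nullptr);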

EXTERN void __tgt_target_data_begin_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<TaskAsyncInfoWrapperTy>(
      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
      ArgMappers, targetDataBegin,
      "Entering OpenMP data region with begin_nowait_mapper", "begin");
}

/// passes data from the target, releases target memory and destroys
/// the host-target mapping (top entry from the stack of data maps)
/// created by the last __tgt_target_data_begin.
EXTERN void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
                                         int32_t ArgNum, void **ArgsBase,
                                         void **Args, int64_t *ArgSizes,
                                         int64_t *ArgTypes,
                                         map_var_info_t *ArgNames,
                                         void **ArgMappers) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, targetDataEnd,
                          "Exiting OpenMP data region with end_mapper", "end");
}

EXTERN void __tgt_target_data_end_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<TaskAsyncInfoWrapperTy>(
      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
      ArgMappers, targetDataEnd,
      "Exiting OpenMP data region with end_nowait_mapper", "end");
}

EXTERN void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
                                            int32_t ArgNum, void **ArgsBase,
                                            void **Args, int64_t *ArgSizes,
                                            int64_t *ArgTypes,
                                            map_var_info_t *ArgNames,
                                            void **ArgMappers) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<AsyncInfoTy>(
      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
      ArgMappers, targetDataUpdate,
      "Updating data within the OpenMP data region with update_mapper",
      "update");
}

EXTERN void __tgt_target_data_update_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  targetData<TaskAsyncInfoWrapperTy>(
      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
      ArgMappers, targetDataUpdate,
      "Updating data within the OpenMP data region with update_nowait_mapper",
      "update");
}
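
// Similarly, a '#pragma omp target update from(B[0:M])' construct would be
// lowered to a call along these lines (argument values illustrative only):
//
//   __tgt_target_data_update_mapper(Loc, /*DeviceId=*/-1, /*ArgNum=*/1, &Base,
//                                   &Begin, &Size, &Type, /*ArgNames=*/nullptr,
//                                   /*ArgMappers=*/nullptr);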

static KernelArgsTy *upgradeKernelArgs(KernelArgsTy *KernelArgs,
                                       KernelArgsTy &LocalKernelArgs,
                                       int32_t NumTeams, int32_t ThreadLimit) {
  if (KernelArgs->Version > OMP_KERNEL_ARG_VERSION)
    DP("Unexpected ABI version: %u\n", KernelArgs->Version);

  uint32_t UpgradedVersion = KernelArgs->Version;
  if (KernelArgs->Version < OMP_KERNEL_ARG_VERSION) {
    // The upgraded version will be based on the kernel launch environment.
    if (KernelArgs->Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
      UpgradedVersion = OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR - 1;
    else
      UpgradedVersion = OMP_KERNEL_ARG_VERSION;
  }
  if (UpgradedVersion != KernelArgs->Version) {
    LocalKernelArgs.Version = UpgradedVersion;
    LocalKernelArgs.NumArgs = KernelArgs->NumArgs;
    LocalKernelArgs.ArgBasePtrs = KernelArgs->ArgBasePtrs;
    LocalKernelArgs.ArgPtrs = KernelArgs->ArgPtrs;
    LocalKernelArgs.ArgSizes = KernelArgs->ArgSizes;
    LocalKernelArgs.ArgTypes = KernelArgs->ArgTypes;
    LocalKernelArgs.ArgNames = KernelArgs->ArgNames;
    LocalKernelArgs.ArgMappers = KernelArgs->ArgMappers;
    LocalKernelArgs.Tripcount = KernelArgs->Tripcount;
    LocalKernelArgs.Flags = KernelArgs->Flags;
    LocalKernelArgs.DynCGroupMem = 0;
    LocalKernelArgs.NumTeams[0] = NumTeams;
    LocalKernelArgs.NumTeams[1] = 1;
    LocalKernelArgs.NumTeams[2] = 1;
    LocalKernelArgs.ThreadLimit[0] = ThreadLimit;
    LocalKernelArgs.ThreadLimit[1] = 1;
    LocalKernelArgs.ThreadLimit[2] = 1;
    return &LocalKernelArgs;
  }

  // FIXME: This is a workaround to "calibrate" the bad values emitted by the
  // front end. Delete this code once the front end emits proper values.
  auto CorrectMultiDim = [](uint32_t (&Val)[3]) {
    if (Val[1] == 0)
      Val[1] = 1;
    if (Val[2] == 0)
      Val[2] = 1;
  };
  CorrectMultiDim(KernelArgs->ThreadLimit);
  CorrectMultiDim(KernelArgs->NumTeams);

  return KernelArgs;
}
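
// A minimal usage sketch, assuming a caller that still passes an older ABI
// version (the concrete values are hypothetical):
//
//   KernelArgsTy LocalArgs;
//   KernelArgsTy *Effective =
//       upgradeKernelArgs(Args, LocalArgs, /*NumTeams=*/4, /*ThreadLimit=*/128);
//   // If Args->Version was older than OMP_KERNEL_ARG_VERSION, Effective now
//   // points at LocalArgs with NumTeams = {4, 1, 1} and ThreadLimit =
//   // {128, 1, 1}; otherwise it is Args with zeroed trailing dimensions
//   // corrected to 1.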

template <typename TargetAsyncInfoTy>
static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                               int32_t ThreadLimit, void *HostPtr,
                               KernelArgsTy *KernelArgs) {
  assert(PM && "Runtime not initialized");
  static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
  DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
     "\n",
     DeviceId, DPxPTR(HostPtr));

  if (checkDevice(DeviceId, Loc)) {
    DP("Not offloading to device %" PRId64 "\n", DeviceId);
    return OMP_TGT_FAIL;
  }

  bool IsTeams = NumTeams != -1;
  if (!IsTeams)
    KernelArgs->NumTeams[0] = NumTeams = 1;

  // Auto-upgrade kernel args to the current version if they use an older ABI.
  KernelArgsTy LocalKernelArgs;
  KernelArgs =
      upgradeKernelArgs(KernelArgs, LocalKernelArgs, NumTeams, ThreadLimit);

  TIMESCOPE_WITH_DETAILS_AND_IDENT(
      "Runtime: target exe",
      "NumTeams=" + std::to_string(NumTeams) +
          ";NumArgs=" + std::to_string(KernelArgs->NumArgs),
      Loc);

  if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
    printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
                         KernelArgs->ArgSizes, KernelArgs->ArgTypes,
                         KernelArgs->ArgNames, "Entering OpenMP kernel");
#ifdef OMPTARGET_DEBUG
  for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) {
    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
       ", Type=0x%" PRIx64 ", Name=%s\n",
       I, DPxPTR(KernelArgs->ArgBasePtrs[I]), DPxPTR(KernelArgs->ArgPtrs[I]),
       KernelArgs->ArgSizes[I], KernelArgs->ArgTypes[I],
       (KernelArgs->ArgNames)
           ? getNameFromMapping(KernelArgs->ArgNames[I]).c_str()
           : "unknown");
  }
#endif

  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
  AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
  /// RAII to establish tool anchors before and after target region
  OMPT_IF_BUILT(InterfaceRAII TargetRAII(
      RegionInterface.getCallbacks<ompt_target>(), DeviceId,
      /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  int Rc = OFFLOAD_SUCCESS;
  Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
  { // Scoped so the synchronization shows up as its own entry in the profile.
    TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: synchronize", "", Loc);
    if (Rc == OFFLOAD_SUCCESS)
      Rc = AsyncInfo.synchronize();

    handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
    assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
  }
  return OMP_TGT_SUCCESS;
}

/// Implements a kernel entry that executes the target region on the specified
/// device.
///
/// \param Loc Source location associated with this target region.
/// \param DeviceId The device to execute this region, -1 indicates the
/// default.
/// \param NumTeams Number of teams to launch the region with, -1 indicates a
/// non-teams region and 0 indicates it was unspecified.
/// \param ThreadLimit Limit to the number of threads to use in the kernel
/// launch, 0 indicates it was unspecified.
/// \param HostPtr The pointer to the host function registered with the kernel.
/// \param Args All arguments to this kernel launch (see struct definition).
EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                               int32_t ThreadLimit, void *HostPtr,
                               KernelArgsTy *KernelArgs) {
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  if (KernelArgs->Flags.NoWait)
    return targetKernel<TaskAsyncInfoWrapperTy>(
        Loc, DeviceId, NumTeams, ThreadLimit, HostPtr, KernelArgs);
  return targetKernel<AsyncInfoTy>(Loc, DeviceId, NumTeams, ThreadLimit,
                                   HostPtr, KernelArgs);
}
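
// Rough sketch of the call emitted for a construct such as
// '#pragma omp target teams num_teams(8) thread_limit(64)'; the kernel symbol
// and most KernelArgsTy fields are hypothetical/elided here:
//
//   KernelArgsTy Args{/*Version=*/OMP_KERNEL_ARG_VERSION};
//   if (__tgt_target_kernel(Loc, /*DeviceId=*/-1, /*NumTeams=*/8,
//                           /*ThreadLimit=*/64, /*HostPtr=*/KernelHostSymbol,
//                           &Args) == OMP_TGT_FAIL) {
//     // Execute the host fallback version of the region.
//   }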

/// Activates the record replay mechanism.
/// \param DeviceId The device identifier to execute the target region.
/// \param MemorySize The number of bytes to be (pre-)allocated
/// by the bump allocator.
/// \param IsRecord Activates the record replay mechanism in
/// 'record' mode or 'replay' mode.
/// \param SaveOutput Store the device memory after kernel
/// execution on persistent storage.
EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
                                        void *VAddr, bool IsRecord,
                                        bool SaveOutput,
                                        uint64_t &ReqPtrArgOffset) {
  assert(PM && "Runtime not initialized");
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  [[maybe_unused]] int Rc = target_activate_rr(
      *DeviceOrErr, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset);
  assert(Rc == OFFLOAD_SUCCESS &&
         "__tgt_activate_record_replay unexpected failure!");
  return OMP_TGT_SUCCESS;
}
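
// A minimal sketch of activating recording (all values are hypothetical):
//
//   uint64_t ReqPtrArgOffset = 0;
//   __tgt_activate_record_replay(/*DeviceId=*/0, /*MemorySize=*/1ULL << 30,
//                                /*VAddr=*/nullptr, /*IsRecord=*/true,
//                                /*SaveOutput=*/true, ReqPtrArgOffset);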

/// Implements a target kernel entry that replays a pre-recorded kernel.
/// \param Loc Source location associated with this target region (unused).
/// \param DeviceId The device identifier to execute the target region.
/// \param HostPtr A pointer to an address that uniquely identifies the kernel.
/// \param DeviceMemory A pointer to an array storing device memory data to move
/// prior to kernel execution.
/// \param DeviceMemorySize The size of the above device memory data in bytes.
/// \param TgtArgs An array of pointers to the pre-recorded target kernel
/// arguments.
/// \param TgtOffsets An array of the pre-recorded target kernel argument
/// offsets.
/// \param NumArgs The number of kernel arguments.
/// \param NumTeams Number of teams to launch the target region with.
/// \param ThreadLimit Limit to the number of threads to use in kernel
/// execution.
/// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
                                      void *HostPtr, void *DeviceMemory,
                                      int64_t DeviceMemorySize, void **TgtArgs,
                                      ptrdiff_t *TgtOffsets, int32_t NumArgs,
                                      int32_t NumTeams, int32_t ThreadLimit,
                                      uint64_t LoopTripCount) {
  assert(PM && "Runtime not initialized");
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  if (checkDevice(DeviceId, Loc)) {
    DP("Not offloading to device %" PRId64 "\n", DeviceId);
    return OMP_TGT_FAIL;
  }
  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  /// RAII to establish tool anchors before and after target region
  OMPT_IF_BUILT(InterfaceRAII TargetRAII(
      RegionInterface.getCallbacks<ompt_target>(), DeviceId,
      /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  AsyncInfoTy AsyncInfo(*DeviceOrErr);
  int Rc = target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory,
                         DeviceMemorySize, TgtArgs, TgtOffsets, NumArgs,
                         NumTeams, ThreadLimit, LoopTripCount, AsyncInfo);
  if (Rc == OFFLOAD_SUCCESS)
    Rc = AsyncInfo.synchronize();
  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
  assert(Rc == OFFLOAD_SUCCESS &&
         "__tgt_target_kernel_replay unexpected failure!");
  return OMP_TGT_SUCCESS;
}

// Get the current number of components for a user-defined mapper.
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
  auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
  int64_t Size = MapperComponentsPtr->Components.size();
  DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
     DPxPTR(RtMapperHandle), Size);
  return Size;
}

// Push back one component for a user-defined mapper.
EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
                                        void *Begin, int64_t Size, int64_t Type,
                                        void *Name) {
  DP("__tgt_push_mapper_component(Handle=" DPxMOD
     ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
     ", Type=0x%" PRIx64 ", Name=%s).\n",
     DPxPTR(RtMapperHandle), DPxPTR(Base), DPxPTR(Begin), Size, Type,
     (Name) ? getNameFromMapping(Name).c_str() : "unknown");
  auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
  MapperComponentsPtr->Components.push_back(
      MapComponentInfoTy(Base, Begin, Size, Type, Name));
}
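
// Illustrative sketch of a compiler-generated user-defined mapper function
// that drives these two entry points (the function name and exact signature
// are hypothetical; see Clang's mapper codegen for the real shape):
//
//   void omp_mapper_MyTy(void *Handle, void *Base, void *Begin, int64_t Size,
//                        int64_t Type, void *Name) {
//     // Query the components recorded so far, if needed.
//     int64_t N = __tgt_mapper_num_components(Handle);
//     (void)N;
//     // Append one component describing the mapped storage.
//     __tgt_push_mapper_component(Handle, Base, Begin, Size, Type, Name);
//   }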

EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) {
  assert(PM && "Runtime not initialized");
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

EXTERN int __tgt_print_device_info(int64_t DeviceId) {
  assert(PM && "Runtime not initialized");
  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  return DeviceOrErr->printDeviceInfo();
}

EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
  assert(PM && "Runtime not initialized");
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));

  if (!AsyncHandle || !*AsyncHandle) {
    FATAL_MESSAGE0(
        1, "Received an invalid async handle from the current OpenMP task. Is "
           "this a target nowait region?\n");
  }

  // Exponential backoff is used to decide whether a thread should just query
  // the device operations (i.e., spin-wait on them) or block until they are
  // completed (using the device-side blocking mechanism). This allows the
  // runtime to adapt when many long-running target regions are in flight.
  static thread_local utils::ExponentialBackoff QueryCounter(
      Int64Envar("OMPTARGET_QUERY_COUNT_MAX", 10),
      Int64Envar("OMPTARGET_QUERY_COUNT_THRESHOLD", 5),
      Envar<float>("OMPTARGET_QUERY_COUNT_BACKOFF_FACTOR", 0.5f));
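
  // Roughly, with the defaults above a thread switches to blocking
  // synchronization once its query count goes above the threshold of 5
  // unfinished queries (capped at 10), and the count decays by the 0.5 backoff
  // factor whenever a region completes; all three values can be tuned through
  // the OMPTARGET_QUERY_COUNT_* environment variables named above.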

  auto *AsyncInfo = (AsyncInfoTy *)*AsyncHandle;

  // If the thread is actively waiting on too many target nowait regions, we
  // should use the blocking sync type.
  if (QueryCounter.isAboveThreshold())
    AsyncInfo->SyncType = AsyncInfoTy::SyncTy::BLOCKING;

  if (AsyncInfo->synchronize())
    FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n");
  // If there are device operations still pending, return immediately without
  // deallocating the handle and increase the current thread query count.
  if (!AsyncInfo->isDone()) {
    QueryCounter.increment();
    return;
  }

  // When a thread successfully completes a target nowait region, we
  // exponentially backoff its query counter by the query factor.
  QueryCounter.decrement();

  // Delete the handle and unset it from the OpenMP task data.
  delete AsyncInfo;
  *AsyncHandle = nullptr;
}