Warning: This file is not a C or C++ file. It does not have highlighting.
| 1 | //===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===// |
|---|---|
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // Interface to be used by Clang during the codegen of a |
| 10 | // target region. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #ifndef _OMPTARGET_H_ |
| 15 | #define _OMPTARGET_H_ |
| 16 | |
| 17 | #include "Shared/APITypes.h" |
| 18 | #include "Shared/Environment.h" |
| 19 | #include "Shared/SourceInfo.h" |
| 20 | |
| 21 | #include "OpenMP/InternalTypes.h" |
| 22 | |
| 23 | #include <cstddef> |
| 24 | #include <cstdint> |
| 25 | #include <deque> |
| 26 | #include <functional> |
| 27 | #include <type_traits> |
| 28 | |
| 29 | #include "llvm/ADT/SmallVector.h" |
| 30 | |
| 31 | #define OFFLOAD_SUCCESS (0) |
| 32 | #define OFFLOAD_FAIL (~0) |
| 33 | |
| 34 | #define OFFLOAD_DEVICE_DEFAULT -1 |
| 35 | |
| 36 | // Don't format out enums and structs. |
| 37 | // clang-format off |
| 38 | |
| 39 | /// Return flags of the __tgt_target_XXX public APIs. |
| 40 | enum __tgt_target_return_t : int { |
| 41 | /// Successful offload executed on a target device. |
| 42 | OMP_TGT_SUCCESS = 0, |
| 43 | /// Offload may not execute on the requested target device. |
| 44 | /// This scenario can be caused by the device being unavailable or unsupported, |
| 45 | /// as described in the Execution Model in the specification. |
| 46 | /// This status may not be used for target device execution failure, |
| 47 | /// which should be handled internally in libomptarget. |
| 48 | OMP_TGT_FAIL = ~0 |
| 49 | }; |
| 50 | |
| 51 | /// Data attributes for each data reference used in an OpenMP target region. Enumerators are bit masks; MEMBER_OF covers the 16 most significant bits. |
| 52 | enum tgt_map_type { |
| 53 | // No flags |
| 54 | OMP_TGT_MAPTYPE_NONE = 0x000, |
| 55 | // copy data from host to device |
| 56 | OMP_TGT_MAPTYPE_TO = 0x001, |
| 57 | // copy data from device to host |
| 58 | OMP_TGT_MAPTYPE_FROM = 0x002, |
| 59 | // copy regardless of the reference count |
| 60 | OMP_TGT_MAPTYPE_ALWAYS = 0x004, |
| 61 | // force unmapping of data |
| 62 | OMP_TGT_MAPTYPE_DELETE = 0x008, |
| 63 | // map the pointer as well as the pointee |
| 64 | OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010, |
| 65 | // pass device base address to kernel |
| 66 | OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020, |
| 67 | // return base device address of mapped data |
| 68 | OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040, |
| 69 | // private variable - not mapped |
| 70 | OMP_TGT_MAPTYPE_PRIVATE = 0x080, |
| 71 | // copy by value - not mapped |
| 72 | OMP_TGT_MAPTYPE_LITERAL = 0x100, |
| 73 | // mapping is implicit |
| 74 | OMP_TGT_MAPTYPE_IMPLICIT = 0x200, |
| 75 | // 'close' map-type modifier: hint to allocate memory close to the target device (per the OpenMP specification) |
| 76 | OMP_TGT_MAPTYPE_CLOSE = 0x400, |
| 77 | // runtime error if not already allocated |
| 78 | OMP_TGT_MAPTYPE_PRESENT = 0x1000, |
| 79 | // use a separate reference counter so that the data cannot be unmapped within |
| 80 | // the structured region |
| 81 | // This is an OpenMP extension for the sake of OpenACC support. |
| 82 | OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000, |
| 83 | // descriptor for non-contiguous target-update |
| 84 | OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000, |
| 85 | // member of struct, member given by [16 MSBs] - 1 |
| 86 | OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 |
| 87 | }; |
| 88 | |
| 89 | /// Flags for offload entries. |
| 90 | enum OpenMPOffloadingDeclareTargetFlags { |
| 91 | /// Mark the entry global as having a 'link' attribute. |
| 92 | OMP_DECLARE_TARGET_LINK = 0x01, |
| 93 | /// Mark the entry global as being an indirectly callable function. |
| 94 | OMP_DECLARE_TARGET_INDIRECT = 0x08, |
| 95 | /// This is an entry corresponding to a requirement to be registered. |
| 96 | OMP_REGISTER_REQUIRES = 0x10, |
| 97 | }; |
| 98 | |
| 99 | enum TargetAllocTy : int32_t { |
| 100 | TARGET_ALLOC_DEVICE = 0, |
| 101 | TARGET_ALLOC_HOST, |
| 102 | TARGET_ALLOC_SHARED, |
| 103 | TARGET_ALLOC_DEFAULT, |
| 104 | /// The allocation will not block on other streams. |
| 105 | TARGET_ALLOC_DEVICE_NON_BLOCKING, |
| 106 | }; |
| 107 | |
| 108 | inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr, |
| 109 | nullptr, nullptr, nullptr, nullptr, |
| 110 | 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0}; |
| 111 | |
| 112 | struct DeviceTy; |
| 113 | |
| 114 | /// The libomptarget wrapper around a __tgt_async_info object directly |
| 115 | /// associated with a libomptarget layer device. RAII semantics to avoid |
| 116 | /// mistakes. |
| 117 | class AsyncInfoTy { |
| 118 | public: |
| 119 | enum class SyncTy { BLOCKING, NON_BLOCKING }; |
| 120 | |
| 121 | private: |
| 122 | /// Locations we used in (potentially) asynchronous calls which should live |
| 123 | /// as long as this AsyncInfoTy object. |
| 124 | std::deque<void *> BufferLocations; |
| 125 | |
| 126 | /// Post-processing operations executed after a successful synchronization. |
| 127 | /// \note the post-processing function should return OFFLOAD_SUCCESS or |
| 128 | /// OFFLOAD_FAIL appropriately. |
| 129 | using PostProcFuncTy = std::function<int()>; |
| 130 | llvm::SmallVector<PostProcFuncTy> PostProcessingFunctions; |
| 131 | |
| 132 | __tgt_async_info AsyncInfo; |
| 133 | DeviceTy &Device; |
| 134 | |
| 135 | public: |
| 136 | /// Synchronization method to be used. |
| 137 | SyncTy SyncType; |
| 138 | |
| 139 | AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING) |
| 140 | : Device(Device), SyncType(SyncType) {} |
| 141 | ~AsyncInfoTy() { synchronize(); } |
| 142 | |
| 143 | /// Implicit conversion to the __tgt_async_info which is used in the |
| 144 | /// plugin interface. |
| 145 | operator __tgt_async_info *() { return &AsyncInfo; } |
| 146 | |
| 147 | /// Synchronize all pending actions. |
| 148 | /// |
| 149 | /// \note synchronization will be performed in a blocking or non-blocking |
| 150 | /// manner, depending on the SyncType. |
| 151 | /// |
| 152 | /// \note if the operations are completed, the registered post-processing |
| 153 | /// functions will be executed once and unregistered afterwards. |
| 154 | /// |
| 155 | /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately. |
| 156 | int synchronize(); |
| 157 | |
| 158 | /// Return a void* reference with a lifetime that is at least as long as this |
| 159 | /// AsyncInfoTy object. The location can be used as intermediate buffer. |
| 160 | void *&getVoidPtrLocation(); |
| 161 | |
| 162 | /// Check if all asynchronous operations are completed. |
| 163 | /// |
| 164 | /// \note only a lightweight check. If needed, use synchronize() to query the |
| 165 | /// status of AsyncInfo before checking. |
| 166 | /// |
| 167 | /// \returns true if there are no pending asynchronous operations, false |
| 168 | /// otherwise. |
| 169 | bool isDone() const; |
| 170 | |
| 171 | /// Add a new post-processing function to be executed after synchronization. |
| 172 | /// |
| 173 | /// \param[in] Function is a templated function (e.g., function pointers, |
| 174 | /// lambdas, std::function) that is convertible to a PostProcFuncTy (i.e., |
| 175 | /// it must have int() as its function signature). Forwarded as-is, so rvalue callables are moved, not copied. |
| 176 | template <typename FuncTy> void addPostProcessingFunction(FuncTy &&Function) { |
| 177 | static_assert(std::is_convertible_v<FuncTy, PostProcFuncTy>, |
| 178 | "Invalid post-processing function type. Please check " |
| 179 | "function signature!"); |
| 180 | PostProcessingFunctions.emplace_back(std::forward<FuncTy>(Function)); |
| 181 | } |
| 182 | |
| 183 | private: |
| 184 | /// Run all the post-processing functions sequentially. |
| 185 | /// |
| 186 | /// \note after a successful execution, all previously registered functions |
| 187 | /// are unregistered. |
| 188 | /// |
| 189 | /// \returns OFFLOAD_FAIL if any post-processing function failed, |
| 190 | /// OFFLOAD_SUCCESS otherwise. |
| 191 | int32_t runPostProcessing(); |
| 192 | |
| 193 | /// Check if the internal asynchronous info queue is empty or not. |
| 194 | /// |
| 195 | /// \returns true if empty, false otherwise. |
| 196 | bool isQueueEmpty() const; |
| 197 | }; |
| 198 | |
| 199 | // Wrapper for task stored async info objects. |
| 200 | class TaskAsyncInfoWrapperTy { |
| 201 | // Invalid GTID as defined by libomp; keep in sync |
| 202 | static constexpr int KMP_GTID_DNE = -2; |
| 203 | |
| 204 | const int ExecThreadID = KMP_GTID_DNE; |
| 205 | AsyncInfoTy LocalAsyncInfo; |
| 206 | AsyncInfoTy *AsyncInfo = &LocalAsyncInfo; |
| 207 | void **TaskAsyncInfoPtr = nullptr; |
| 208 | |
| 209 | public: |
| 210 | TaskAsyncInfoWrapperTy(DeviceTy &Device) |
| 211 | : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) { |
| 212 | // If we failed to acquire the current global thread id, we cannot |
| 213 | // re-enqueue the current task. Thus we should use the local blocking async |
| 214 | // info. |
| 215 | if (ExecThreadID == KMP_GTID_DNE) |
| 216 | return; |
| 217 | |
| 218 | // Only tasks with an assigned task team can be re-enqueued and thus can |
| 219 | // use the non-blocking synchronization scheme. Thus we should use the local |
| 220 | // blocking async info, if we don't have one. |
| 221 | if (!__kmpc_omp_has_task_team(ExecThreadID)) |
| 222 | return; |
| 223 | |
| 224 | // Acquire a pointer to the AsyncInfo stored inside the current task being |
| 225 | // executed. |
| 226 | TaskAsyncInfoPtr = __kmpc_omp_get_target_async_handle_ptr(ExecThreadID); |
| 227 | |
| 228 | // If we cannot acquire such pointer, fallback to using the local blocking |
| 229 | // async info. |
| 230 | if (!TaskAsyncInfoPtr) |
| 231 | return; |
| 232 | |
| 233 | // When creating a new task async info, the task handle must always be |
| 234 | // invalid. We must never overwrite any task async handle and there should |
| 235 | // never be any valid handle stored inside the task at this point. |
| 236 | assert((*TaskAsyncInfoPtr) == nullptr && |
| 237 | "Task async handle is not empty when dispatching new device " |
| 238 | "operations. The handle was not cleared properly or " |
| 239 | "__tgt_target_nowait_query should have been called!"); |
| 240 | |
| 241 | // If no valid async handle is present, a new AsyncInfo will be allocated |
| 242 | // and stored in the current task. |
| 243 | AsyncInfo = new AsyncInfoTy(Device, AsyncInfoTy::SyncTy::NON_BLOCKING); |
| 244 | *TaskAsyncInfoPtr = (void *)AsyncInfo; |
| 245 | } |
| 246 | |
| 247 | ~TaskAsyncInfoWrapperTy() { |
| 248 | // Local async info destruction is automatically handled by ~AsyncInfoTy. |
| 249 | if (AsyncInfo == &LocalAsyncInfo) |
| 250 | return; |
| 251 | |
| 252 | // If there are device operations still pending, return immediately without |
| 253 | // deallocating the handle. |
| 254 | if (!AsyncInfo->isDone()) |
| 255 | return; |
| 256 | |
| 257 | // Delete the handle and unset it from the OpenMP task data. |
| 258 | delete AsyncInfo; |
| 259 | *TaskAsyncInfoPtr = nullptr; |
| 260 | } |
| 261 | |
| 262 | operator AsyncInfoTy &() { return *AsyncInfo; } |
| 263 | }; |
| 264 | |
| 265 | /// This struct is a record of non-contiguous information |
| 266 | struct __tgt_target_non_contig { |
| 267 | uint64_t Offset; |
| 268 | uint64_t Count; |
| 269 | uint64_t Stride; |
| 270 | }; |
| 271 | |
| 272 | #ifdef __cplusplus |
| 273 | extern "C" { |
| 274 | #endif |
| 275 | |
| 276 | void ompx_dump_mapping_tables(void); |
| 277 | int omp_get_num_devices(void); |
| 278 | int omp_get_device_num(void); |
| 279 | int omp_get_initial_device(void); |
| 280 | void *omp_target_alloc(size_t Size, int DeviceNum); |
| 281 | void omp_target_free(void *DevicePtr, int DeviceNum); |
| 282 | int omp_target_is_present(const void *Ptr, int DeviceNum); |
| 283 | int omp_target_memcpy(void *Dst, const void *Src, size_t Length, |
| 284 | size_t DstOffset, size_t SrcOffset, int DstDevice, |
| 285 | int SrcDevice); |
| 286 | int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize, |
| 287 | int NumDims, const size_t *Volume, |
| 288 | const size_t *DstOffsets, const size_t *SrcOffsets, |
| 289 | const size_t *DstDimensions, |
| 290 | const size_t *SrcDimensions, int DstDevice, |
| 291 | int SrcDevice); |
| 292 | void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum); |
| 293 | int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr, |
| 294 | size_t Size, size_t DeviceOffset, int DeviceNum); |
| 295 | int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum); |
| 296 | |
| 297 | /// Explicit target memory allocators |
| 298 | /// Using the llvm_ prefix until they become part of the OpenMP standard. |
| 299 | void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum); |
| 300 | void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum); |
| 301 | void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); |
| 302 | |
| 303 | /// Explicit target memory deallocators |
| 304 | /// Using the llvm_ prefix until they become part of the OpenMP standard. |
| 305 | void llvm_omp_target_free_device(void *DevicePtr, int DeviceNum); |
| 306 | void llvm_omp_target_free_host(void *DevicePtr, int DeviceNum); |
| 307 | void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); |
| 308 | |
| 309 | /// Dummy target so we have a symbol for generating host fallback. |
| 310 | void *llvm_omp_target_dynamic_shared_alloc(); |
| 311 | |
| 312 | /// Adds the clauses of the requires directives in a given file. |
| 313 | void __tgt_register_requires(int64_t Flags); |
| 314 | |
| 315 | /// Initializes the runtime library. |
| 316 | void __tgt_rtl_init(); |
| 317 | |
| 318 | /// Deinitializes the runtime library. |
| 319 | void __tgt_rtl_deinit(); |
| 320 | |
| 321 | /// Adds a target shared library to the target execution image. |
| 322 | void __tgt_register_lib(__tgt_bin_desc *Desc); |
| 323 | |
| 324 | /// Initialize all RTLs at once |
| 325 | void __tgt_init_all_rtls(); |
| 326 | |
| 327 | /// Removes a target shared library from the target execution image. |
| 328 | void __tgt_unregister_lib(__tgt_bin_desc *Desc); |
| 329 | |
| 330 | // Creates the host to target data mapping, stores it in the |
| 331 | // libomptarget.so internal structure (an entry in a stack of data maps) and |
| 332 | // passes the data to the device. |
| 333 | void __tgt_target_data_begin(int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
| 334 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes); |
| 335 | void __tgt_target_data_begin_nowait(int64_t DeviceId, int32_t ArgNum, |
| 336 | void **ArgsBase, void **Args, |
| 337 | int64_t *ArgSizes, int64_t *ArgTypes, |
| 338 | int32_t DepNum, void *DepList, |
| 339 | int32_t NoAliasDepNum, |
| 340 | void *NoAliasDepList); |
| 341 | void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId, |
| 342 | int32_t ArgNum, void **ArgsBase, |
| 343 | void **Args, int64_t *ArgSizes, |
| 344 | int64_t *ArgTypes, map_var_info_t *ArgNames, |
| 345 | void **ArgMappers); |
| 346 | void __tgt_target_data_begin_nowait_mapper( |
| 347 | ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
| 348 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, |
| 349 | void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, |
| 350 | void *NoAliasDepList); |
| 351 | |
| 352 | // Passes data from the target, releases target memory and destroys the |
| 353 | // host-target mapping (top entry from the stack of data maps) created by |
| 354 | // the last __tgt_target_data_begin. |
| 355 | void __tgt_target_data_end(int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
| 356 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes); |
| 357 | void __tgt_target_data_end_nowait(int64_t DeviceId, int32_t ArgNum, |
| 358 | void **ArgsBase, void **Args, |
| 359 | int64_t *ArgSizes, int64_t *ArgTypes, |
| 360 | int32_t DepNum, void *DepList, |
| 361 | int32_t NoAliasDepNum, void *NoAliasDepList); |
| 362 | void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId, |
| 363 | int32_t ArgNum, void **ArgsBase, void **Args, |
| 364 | int64_t *ArgSizes, int64_t *ArgTypes, |
| 365 | map_var_info_t *ArgNames, void **ArgMappers); |
| 366 | void __tgt_target_data_end_nowait_mapper( |
| 367 | ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
| 368 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, |
| 369 | void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, |
| 370 | void *NoAliasDepList); |
| 371 | |
| 372 | /// Passes data to/from the target. |
| 373 | void __tgt_target_data_update(int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
| 374 | void **Args, int64_t *ArgSizes, |
| 375 | int64_t *ArgTypes); |
| 376 | void __tgt_target_data_update_nowait(int64_t DeviceId, int32_t ArgNum, |
| 377 | void **ArgsBase, void **Args, |
| 378 | int64_t *ArgSizes, int64_t *ArgTypes, |
| 379 | int32_t DepNum, void *DepList, |
| 380 | int32_t NoAliasDepNum, |
| 381 | void *NoAliasDepList); |
| 382 | void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId, |
| 383 | int32_t ArgNum, void **ArgsBase, |
| 384 | void **Args, int64_t *ArgSizes, |
| 385 | int64_t *ArgTypes, |
| 386 | map_var_info_t *ArgNames, |
| 387 | void **ArgMappers); |
| 388 | void __tgt_target_data_update_nowait_mapper( |
| 389 | ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, |
| 390 | void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, |
| 391 | void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, |
| 392 | void *NoAliasDepList); |
| 393 | |
| 394 | // Performs the same actions as data_begin in case ArgNum is non-zero and |
| 395 | // initiates the run of the offloaded region on the target platform; if ArgNum |
| 396 | // is non-zero, after the region execution is done it also performs the same |
| 397 | // actions as data_end above. Returns 0 if it was able to transfer the |
| 398 | // execution to a target and an int different from zero otherwise. |
| 399 | // NOTE(review): ArgNum is not a parameter here; presumably it is carried in Args - confirm. |
| 400 | int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, |
| 401 | int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args); |
| 402 | |
| 403 | // Non-blocking synchronization for target nowait regions. This function |
| 404 | // acquires the asynchronous context from task data of the current task being |
| 405 | // executed and tries to query for the completion of its operations. If the |
| 406 | // operations are still pending, the function returns immediately. If the |
| 407 | // operations are completed, all the post-processing procedures stored in the |
| 408 | // asynchronous context are executed and the context is removed from the task |
| 409 | // data. |
| 410 | void __tgt_target_nowait_query(void **AsyncHandle); |
| 411 | |
| 412 | /// Executes a target kernel by replaying recorded kernel arguments and |
| 413 | /// device memory. |
| 414 | int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr, |
| 415 | void *DeviceMemory, int64_t DeviceMemorySize, |
| 416 | void **TgtArgs, ptrdiff_t *TgtOffsets, |
| 417 | int32_t NumArgs, int32_t NumTeams, |
| 418 | int32_t ThreadLimit, uint64_t LoopTripCount); |
| 419 | |
| 420 | void __tgt_set_info_flag(uint32_t); |
| 421 | |
| 422 | int __tgt_print_device_info(int64_t DeviceId); |
| 423 | |
| 424 | int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, |
| 425 | void *VAddr, bool IsRecord, bool SaveOutput, |
| 426 | uint64_t &ReqPtrArgOffset); |
| 427 | |
| 428 | #ifdef __cplusplus |
| 429 | } |
| 430 | #endif |
| 431 | |
| 432 | #ifdef __cplusplus |
| 433 | #define EXTERN extern "C" |
| 434 | #else |
| 435 | #define EXTERN extern |
| 436 | #endif |
| 437 | |
| 438 | #endif // _OMPTARGET_H_ |
| 439 |
Warning: This file is not a C or C++ file. It does not have highlighting.
