//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "OffloadPolicy.h"
#include "OpenMP/OMPT/Callback.h"
#include "OpenMP/OMPT/Interface.h"
#include "PluginManager.h"
#include "Shared/Debug.h"
#include "Shared/EnvironmentVar.h"
#include "Shared/Utils.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include "Shared/Profile.h"

#include "OpenMP/Mapping.h"
#include "OpenMP/omp.h"

#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/bit.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Object/ObjectFile.h"

#include <cassert>
#include <cstdint>
#include <vector>

using llvm::SmallVector;
#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
#endif

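// Synchronize all pending operations on this async object. BLOCKING waits for
// completion (after which the plugin is expected to null the queue);
// NON_BLOCKING only queries for progress. Post-processing functions run once
// the queue has drained successfully.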
int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (!isQueueEmpty()) {
    switch (SyncType) {
    case SyncTy::BLOCKING:
      // If we have a queue we need to synchronize it now.
      Result = Device.synchronize(*this);
      assert(AsyncInfo.Queue == nullptr &&
             "The device plugin should have nulled the queue to indicate there "
             "are no outstanding actions!");
      break;
    case SyncTy::NON_BLOCKING:
      Result = Device.queryAsync(*this);
      break;
    }
  }

  // Run any pending post-processing function registered on this async object.
  if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
    Result = runPostProcessing();

  return Result;
}

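// Return a reference to a fresh, stable void * slot owned by this async
// object. Asynchronous submissions may read from the returned location after
// this call returns, so it is kept alive in BufferLocations until
// synchronization.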
void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }

int32_t AsyncInfoTy::runPostProcessing() {
  size_t Size = PostProcessingFunctions.size();
  for (size_t I = 0; I < Size; ++I) {
    const int Result = PostProcessingFunctions[I]();
    if (Result != OFFLOAD_SUCCESS)
      return Result;
  }

  // Clear the vector up until the last known function, since post-processing
  // procedures might add new procedures themselves.
  const auto *PrevBegin = PostProcessingFunctions.begin();
  PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);

  return OFFLOAD_SUCCESS;
}

bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }

/* All begin addresses for partially mapped structs must be aligned, up to 16,
 * in order to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;  // 4-aligned
 *   int b;  // 4-aligned
 *   int *p; // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200
 * and the address of p will be a misaligned 0x204 (on the host there was no
 * need to add padding between b and p, so p comes exactly 4 bytes after b). If
 * the device kernel tries to access s1.p, a misaligned address error occurs
 * (as reported by the CUDA plugin). By padding the begin address down to a
 * multiple of 8 and extending the size of the allocated chunk accordingly, the
 * chunk on the device will start at 0x200 with the padding (4 bytes), then
 * &s1.b=0x204 and &s1.p=0x208, as they should be to satisfy the alignment
 * requirements.
 */
static const int64_t MaxAlignment = 16;

/// Return the alignment requirement of partially mapped structs, see
/// MaxAlignment above.
static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
  int LowestOneBit = __builtin_ffsl(reinterpret_cast<uintptr_t>(HstPtrBase));
  uint64_t BaseAlignment = 1 << (LowestOneBit - 1);
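  // Example (hypothetical value): for HstPtrBase == (void *)0x104, ffsl
  // returns 3, so BaseAlignment == 4 and the function returns
  // min(MaxAlignment, 4) == 4.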
  return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
}

void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (OffloadPolicy::get(*PM).Kind) {
  case OffloadPolicy::DISABLED:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case OffloadPolicy::MANDATORY:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
          dumpTargetPointerMappings(Loc, Device);
      } else
        FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
                        "for debugging options.\n");

      if (!PM->getNumActivePlugins()) {
        FAILURE_MESSAGE(
            "No images found compatible with the installed hardware. ");

        llvm::SmallVector<llvm::StringRef> Archs;
        for (auto &Image : PM->deviceImages()) {
          const char *Start = reinterpret_cast<const char *>(
              Image.getExecutableImage().ImageStart);
          uint64_t Length =
              utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd);
          llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
                                       /*Identifier=*/"");

          auto ObjectOrErr = llvm::object::ObjectFile::createObjectFile(Buffer);
          if (auto Err = ObjectOrErr.takeError()) {
            llvm::consumeError(std::move(Err));
            continue;
          }

          if (auto CPU = (*ObjectOrErr)->tryGetCPUName())
            Archs.push_back(*CPU);
        }
        fprintf(stderr, "Found %zu image(s): (%s)\n", Archs.size(),
                llvm::join(Archs, ",").c_str());
      }

      SourceInfo Info(Loc);
      if (Info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
                Info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
          dumpTargetPointerMappings(Loc, Device);
      }
    }
    break;
  }
}

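/// Return the index of the parent map entry encoded in the MEMBER_OF bits
/// (bits 48-63) of \p Type, or -1 if the entry is not a member of another
/// entry. E.g., a MEMBER_OF value of 2 refers to the argument at index 1.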
static int32_t getParentIndex(int64_t Type) {
  return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}

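/// Allocate \p Size bytes of memory of kind \p Kind on device \p DeviceNum on
/// behalf of the API entry point \p Name (e.g., omp_target_alloc).
/// Allocations on the initial device are served directly by malloc.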
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                          const char *Name) {
  DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *Rc = NULL;

  if (DeviceNum == omp_get_initial_device()) {
    Rc = malloc(Size);
    DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
    return Rc;
  }

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  Rc = DeviceOrErr->allocData(Size, nullptr, Kind);
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
  return Rc;
}

void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                        const char *Name) {
  DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
     DPxPTR(DevicePtr));

  if (!DevicePtr) {
    DP("Call to %s with NULL ptr\n", Name);
    return;
  }

  if (DeviceNum == omp_get_initial_device()) {
    free(DevicePtr);
    DP("%s deallocated host ptr\n", Name);
    return;
  }

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL)
    FATAL_MESSAGE(DeviceNum, "%s",
                  "Failed to deallocate device ptr. Set "
                  "OFFLOAD_TRACK_ALLOCATION_TRACES=1 to track allocations.");

  DP("omp_target_free deallocated device ptr\n");
}

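/// Lock (pin) \p Size bytes of host memory starting at \p HostPtr on behalf
/// of the API entry point \p Name, so the plugin may use faster,
/// pre-registered transfers for it; returns the locked pointer, or NULL on
/// failure.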
void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                         const char *Name) {
  DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *RC = NULL;

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  int32_t Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC);
  if (Err) {
    DP("Could not lock ptr %p\n", HostPtr);
    return nullptr;
  }
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(RC));
  return RC;
}

void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
  DP("Call to %s for device %d unlocking\n", Name, DeviceNum);

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr);
  DP("%s returns\n", Name);
}

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
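/// For example (hypothetical user code), given
///
///   struct S { int Len; double *Data; };
///   #pragma omp declare mapper(struct S V) map(V, V.Data[0 : V.Len])
///
/// a single mapped `struct S` argument is expanded into one component per map
/// clause of the mapper, and each component is then passed to the targetData*
/// function as if it had been written explicitly on the construct.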
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                     int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
                     void *ArgMapper, AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy TargetDataFunction) {
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
  (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
                   ArgNames);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  SmallVector<void *> MapperArgsBase(MapperComponents.Components.size());
  SmallVector<void *> MapperArgs(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  SmallVector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
                              MapperArgsBase.data(), MapperArgs.data(),
                              MapperArgSizes.data(), MapperArgTypes.data(),
                              MapperArgNames.data(), /*arg_mappers*/ nullptr,
                              AsyncInfo, /*FromMapper=*/true);

  return Rc;
}

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                    void **ArgsBase, void **Args, int64_t *ArgSizes,
                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                    void **ArgMappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // Process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;
    TIMESCOPE_WITH_DETAILS_AND_IDENT(
        "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                targetDataBegin);

      if (Rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgsBase[I];
    int64_t DataSize = ArgSizes[I];
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t TgtPadding = 0;
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase);
      TgtPadding = (int64_t)HstPtrBegin % Alignment;
      if (TgtPadding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           TgtPadding, DPxPTR(HstPtrBegin));
      }
    }

    // Address of pointer on the host and device, respectively.
    void *PointerHstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy PointerTpr;
    bool IsHostPtr = false;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters).
    // This may be considered a hack; we could revise the scheme in the future.
    bool UpdateRef =
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);

    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
        Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      // #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      PointerTpr = Device.getMappingInfo().getTargetPointer(
          HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0, sizeof(void *),
          /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
          /*OwnedTPR=*/nullptr, /*ReleaseHDTTMap=*/false);
      PointerTgtPtrBegin = PointerTpr.TargetPointer;
      IsHostPtr = PointerTpr.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (PointerTpr.Flags.IsNewEntry ? "" : " not"));
      PointerHstPtrBegin = HstPtrBase;
      // Modify the current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update the pointee ref count for the first element of a
      // subelement that comes from a mapper.
      UpdateRef =
          (!FromMapper || I != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    // Note that HDTTMap will be released in getTargetPointer.
    auto TPR = Device.getMappingInfo().getTargetPointer(
        HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
        HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      ArgsBase[I] = TgtPtrBase;
    }

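    // Illustration (hypothetical mapping): for map(to: s, s.p[0:N]) the entry
    // for the pointer s.p was created in the PTR_AND_OBJ block above. Below we
    // compute the device address at which the pointee begins, write it into
    // the device copy of s.p, and record a shadow entry so targetDataEnd can
    // restore the original host value of s.p later.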
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
              (void **)PointerHstPtrBegin, HstPtrBase,
              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret =
            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
                              AsyncInfo, PointerTpr.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Check if variable can be used on the device:
    bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF;
    if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 &&
        !IsStructMember && !IsImplicit && !TPR.isPresent() &&
        !TPR.isContained() && !TPR.isHostPointer())
      INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID,
           "variable %s does not have a valid device counterpart\n",
           (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
  }

  return OFFLOAD_SUCCESS;
}

namespace {
/// This structure contains the information needed to deallocate a target
/// pointer, i.e., to fix up the shadow map and potentially delete the entry
/// from the mapping table via \p DeviceTy::deallocTgtPtr.
struct PostProcessingInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;

  /// Size of the data
  int64_t DataSize;

  /// The mapping type (bitfield).
  int64_t ArgType;

  /// The target pointer information.
  TargetPointerResultTy TPR;

  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
                     TargetPointerResultTy &&TPR)
      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
        TPR(std::move(TPR)) {}
};

} // namespace

/// Applies the necessary post-processing procedures to entries listed in \p
/// EntriesInfo after the execution of all device side operations from a target
/// data end. This includes the update of pointers at the host and removal of
/// device buffers when needed. Returns OFFLOAD_FAIL or OFFLOAD_SUCCESS
/// depending on the success of the operations.
[[nodiscard]] static int
postProcessingTargetDataEnd(DeviceTy *Device,
                            SmallVector<PostProcessingInfo> &EntriesInfo) {
  int Ret = OFFLOAD_SUCCESS;

  for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
    bool DelEntry = !TPR.isHostPointer();

    // If this is the last element from the mapper (arguments for an end
    // transfer come in reverse order), do not remove the partial entry; the
    // parent struct still exists.
    if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    // If we marked the entry to be deleted we need to verify no other
    // thread reused it by now. If deletion is still supposed to happen by
    // this thread LR will be set and exclusive access to the HDTT map
    // will avoid another thread reusing the entry now. Note that we do
    // not request (exclusive) access to the HDTT map if DelEntry is
    // not set.
    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
        Device->getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();

    // We cannot use a lock guard because we may end up deleting the mutex.
    // We also explicitly unlocked the entry after it was put into EntriesInfo
    // so that it could be reused.
    TPR.getEntry()->lock();
    auto *Entry = TPR.getEntry();

    const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
    if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
      // The thread is not in charge of deletion anymore. Give up access
      // to the HDTT map and unset the deletion flag.
      HDTTMap.destroy();
      DelEntry = false;
    }

    // If we copied back to the host a struct/array containing pointers,
    // we need to restore the original host pointer values from their
    // shadow copies. If the struct is going to be deallocated, remove any
    // remaining shadow pointer entries for this struct.
    const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom) {
      Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
        DP("Restoring original host pointer value " DPxMOD " for host "
           "pointer " DPxMOD "\n",
           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
        return OFFLOAD_SUCCESS;
      });
    }

    // Give up the lock as we either don't need it anymore (e.g., done with
    // TPR), or erase TPR.
    TPR.setEntry(nullptr);

    if (!DelEntry)
      continue;

    Ret = Device->getMappingInfo().eraseMapEntry(HDTTMap, Entry, DataSize);
    // The entry is already removed from the map; we can unlock it now.
    HDTTMap.destroy();
    Ret |= Device->getMappingInfo().deallocTgtPtrAndEntry(Entry, DataSize);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      break;
    }
  }

  delete &EntriesInfo;
  return Ret;
}

/// Internal function to undo the mapping and retrieve the data from the
/// device.
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret = OFFLOAD_SUCCESS;
  auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
  // Process each input in reverse order: member entries follow their parent
  // entry and must be released before it.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
        HstPtrBegin, DataSize, UpdateRef, HasHoldModifier, !IsImplicit,
        ForceDelete, /*FromDataEnd=*/true);
    void *TgtPtrBegin = TPR.TargetPointer;
    if (!TPR.isPresent() && !TPR.isHostPointer() &&
        (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not
        // appear in the device data environment then an error occurs and the
        // program terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is
        // also guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TPR.isPresent())
      continue;

    // Move data back to the host
    const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom && (HasAlways || TPR.Flags.IsLast) &&
        !TPR.Flags.IsHostPointer && DataSize != 0) {
      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
         DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
      TIMESCOPE_WITH_DETAILS_AND_IDENT(
          "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
      // Wait for any previous transfer if an event is present.
      if (void *Event = TPR.getEntry()->getEvent()) {
        if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
          REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
          return OFFLOAD_FAIL;
        }
      }

      Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo,
                                TPR.getEntry());
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data from device failed.\n");
        return OFFLOAD_FAIL;
      }

      // As we are expecting to delete the entry the d2h copy might race
      // with another one that also tries to delete the entry. This happens
      // as the entry can be reused and the reuse might happen after the
      // copy-back was issued but before it completed. Since the reuse might
      // also copy-back a value we would race.
      if (TPR.Flags.IsLast) {
        if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Add pointer to the buffer for post-synchronize processing.
    PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
                                     std::move(TPR));
    PostProcessingPtrs->back().TPR.getEntry()->unlock();
  }

  // Add post-processing functions
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int {
    return postProcessingTargetDataEnd(Device, *PostProcessingPtrs);
  });

  return Ret;
}

static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
      HstPtrBegin, ArgSize, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TPR.isPresent()) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (TPR.Flags.IsHostPointer) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo,
                                TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }
    if (TPR.getEntry()) {
      int Ret = TPR.getEntry()->foreachShadowPointerInfo(
          [&](ShadowPtrInfoTy &ShadowPtr) {
            DP("Restoring original target pointer value " DPxMOD " for target "
               "pointer " DPxMOD "\n",
               DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
            Ret = Device.submitData(ShadowPtr.TgtPtrAddr,
                                    (void *)&ShadowPtr.TgtPtrVal,
                                    sizeof(void *), AsyncInfo);
            if (Ret != OFFLOAD_SUCCESS) {
              REPORT("Copying data to device failed.\n");
              return OFFLOAD_FAIL;
            }
            return OFFLOAD_SUCCESS;
          });
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Updating shadow map failed\n");
        return Ret;
      }
    }
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo,
                                  TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    // Wait for device-to-host memcopies for the whole struct to complete
    // before restoring the correct host pointer.
    if (auto *Entry = TPR.getEntry()) {
      AsyncInfo.addPostProcessingFunction([=]() -> int {
        int Ret = Entry->foreachShadowPointerInfo(
            [&](const ShadowPtrInfoTy &ShadowPtr) {
              *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
              DP("Restoring original host pointer value " DPxMOD
                 " for host pointer " DPxMOD "\n",
                 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
              return OFFLOAD_SUCCESS;
            });
        Entry->unlock();
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Updating shadow map failed\n");
          return Ret;
        }
        return OFFLOAD_SUCCESS;
      });
    }
  }

  return OFFLOAD_SUCCESS;
}

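/// Recursively walk the dimension descriptors in \p NonContig and issue one
/// contiguous transfer per innermost contiguous piece. For example
/// (hypothetical descriptor): updating a[0:2][1:3] of `int a[4][4]` uses
/// dim 0 = {Offset=0, Count=2, Stride=16} and dim 1 = {Offset=1, Count=3,
/// Stride=4}, yielding two contiguous transfers of Size = 3 * 4 = 12 bytes at
/// offsets 4 and 20 from ArgsBase.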
static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // We only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous: host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

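/// Count how many trailing dimensions can be merged with their predecessor
/// because together they are contiguous, i.e. Count * Stride of dimension I
/// equals the Stride of dimension I - 1. For example (hypothetical
/// descriptor), {Count=10, Stride=40} followed by {Count=10, Stride=4} merges
/// one dimension since 10 * 4 == 40.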
static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}

/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // Process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

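/// Combination of map-type bits that identifies an implicitly mapped lambda
/// capture: a pointer into the lambda object (PTR_AND_OBJ) that is passed by
/// value (LITERAL) and generated implicitly (IMPLICIT).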
static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}

namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // Get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // Iterate over all the host table entries to see if we can locate the
    // host_ptr.
    llvm::offloading::EntryTy *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->Address != HostPtr)
        continue;
      // We got a match; now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transferring them all at once.
  struct FirstPrivateArgInfoTy {
    /// Host pointer begin
    char *HstPtrBegin;
    /// Host pointer end
    char *HstPtrEnd;
    /// The index of the element in \p TgtArgs corresponding to the argument
    int Index;
    /// Alignment of the entry (base of the entry, not after the entry).
    uint32_t Alignment;
    /// Size (without alignment, see padding)
    uint32_t Size;
    /// Padding used to align this argument entry, if necessary.
    uint32_t Padding;
    /// Host pointer name
    map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
                          uint32_t Alignment, uint32_t Padding,
                          map_var_info_t HstPtrName = nullptr)
        : HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
          Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  SmallVector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  SmallVector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A reference to the \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer
  // it immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet both of the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (i.e., it needs to be mapped to the target
      //    device).
      // We pack all such arguments and transfer them all at once to reduce
      // the number of data transfers. We do not take non-first-private
      // arguments, i.e., private arguments that don't need to be mapped to the
      // target device, into account because data allocation can be very
      // efficient with the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      auto *LastFPArgInfo =
          FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back();

      // Compute the start alignment of this entry, add padding if necessary.
      // TODO: Consider sorting instead.
      uint32_t Padding = 0;
      uint32_t StartAlignment =
          LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment;
      if (LastFPArgInfo) {
        // Check if we keep the start alignment or if it is shrunk due to the
        // size of the last element.
        uint32_t Offset = LastFPArgInfo->Size % StartAlignment;
        if (Offset)
          StartAlignment = Offset;
        // We only need as much alignment as the host pointer had (since we
        // don't know the alignment information from the source we might end up
        // overaligning accesses but not too much).
        uint32_t RequiredAlignment =
            llvm::bit_floor(getPartialStructRequiredAlignment(HstPtr));
        if (RequiredAlignment > StartAlignment) {
          Padding = RequiredAlignment - StartAlignment;
          StartAlignment = RequiredAlignment;
        }
      }

      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       StartAlignment, Padding, HstPtrName);
      FirstPrivateArgSize += Padding + ArgSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace placeholder pointers in \p TgtArgs,
  /// and start the transfer.
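  /// Layout example (hypothetical sizes): after packing a 2-byte argument, an
  /// argument whose host pointer is 8-byte aligned gets Padding = 6, so the
  /// buffer is laid out as [arg0 (2B)][pad (6B)][arg1 ...] and the second
  /// placeholder pointer becomes TgtPtr + 8.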
| 1135 | int packAndTransfer(SmallVector<void *> &TgtArgs) { |
| 1136 | if (!FirstPrivateArgInfo.empty()) { |
| 1137 | assert(FirstPrivateArgSize != 0 && |
| 1138 | "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty" ); |
| 1139 | FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0); |
| 1140 | auto *Itr = FirstPrivateArgBuffer.begin(); |
| 1141 | // Copy all host data to this buffer |
| 1142 | for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { |
| 1143 | // First pad the pointer as we (have to) pad it on the device too. |
| 1144 | Itr = std::next(Itr, Info.Padding); |
| 1145 | std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); |
| 1146 | Itr = std::next(Itr, Info.Size); |
| 1147 | } |
| 1148 | // Allocate target memory |
| 1149 | void *TgtPtr = |
| 1150 | Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data()); |
| 1151 | if (TgtPtr == nullptr) { |
| 1152 | DP("Failed to allocate target memory for private arguments.\n" ); |
| 1153 | return OFFLOAD_FAIL; |
| 1154 | } |
| 1155 | TgtPtrs.push_back(TgtPtr); |
| 1156 | DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n" , |
| 1157 | FirstPrivateArgSize, DPxPTR(TgtPtr)); |
| 1158 | // Transfer data to target device |
| 1159 | int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(), |
| 1160 | FirstPrivateArgSize, AsyncInfo); |
| 1161 | if (Ret != OFFLOAD_SUCCESS) { |
| 1162 | DP("Failed to submit data of private arguments.\n" ); |
| 1163 | return OFFLOAD_FAIL; |
| 1164 | } |
| 1165 | // Fill in all placeholder pointers |
| 1166 | auto TP = reinterpret_cast<uintptr_t>(TgtPtr); |
| 1167 | for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { |
| 1168 | void *&Ptr = TgtArgs[Info.Index]; |
| 1169 | assert(Ptr == nullptr && "Target pointer is already set by mistaken" ); |
| 1170 | // Pad the device pointer to get the right alignment. |
| 1171 | TP += Info.Padding; |
| 1172 | Ptr = reinterpret_cast<void *>(TP); |
| 1173 | TP += Info.Size; |
| 1174 | DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD |
| 1175 | "\n" , |
| 1176 | DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin, |
| 1177 | DPxPTR(Ptr)); |
| 1178 | } |
| 1179 | } |
| 1180 | |
| 1181 | return OFFLOAD_SUCCESS; |
| 1182 | } |
| 1183 | |
| 1184 | /// Free all target memory allocated for private arguments |
| 1185 | int free() { |
| 1186 | for (void *P : TgtPtrs) { |
| 1187 | int Ret = Device.deleteData(P); |
| 1188 | if (Ret != OFFLOAD_SUCCESS) { |
| 1189 | DP("Deallocation of (first-)private arrays failed.\n" ); |
| 1190 | return OFFLOAD_FAIL; |
| 1191 | } |
| 1192 | } |
| 1193 | |
| 1194 | TgtPtrs.clear(); |
| 1195 | |
| 1196 | return OFFLOAD_SUCCESS; |
| 1197 | } |
| 1198 | }; |
| 1199 | |
| 1200 | /// Process data before launching the kernel, including calling targetDataBegin |
| 1201 | /// to map and transfer data to target device, transferring (first-)private |
| 1202 | /// variables. |
| 1203 | static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, |
| 1204 | int32_t ArgNum, void **ArgBases, void **Args, |
| 1205 | int64_t *ArgSizes, int64_t *ArgTypes, |
| 1206 | map_var_info_t *ArgNames, void **ArgMappers, |
| 1207 | SmallVector<void *> &TgtArgs, |
| 1208 | SmallVector<ptrdiff_t> &TgtOffsets, |
| 1209 | PrivateArgumentManagerTy &PrivateArgumentManager, |
| 1210 | AsyncInfoTy &AsyncInfo) { |
| 1211 | |
| 1212 | auto DeviceOrErr = PM->getDevice(DeviceId); |
| 1213 | if (!DeviceOrErr) |
| 1214 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
| 1215 | |
| 1216 | int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes, |
| 1217 | ArgTypes, ArgNames, ArgMappers, AsyncInfo); |
| 1218 | if (Ret != OFFLOAD_SUCCESS) { |
| 1219 | REPORT("Call to targetDataBegin failed, abort target.\n" ); |
| 1220 | return OFFLOAD_FAIL; |
| 1221 | } |
| 1222 | |
| 1223 | // List of (first-)private arrays allocated for this target region |
| 1224 | SmallVector<int> TgtArgsPositions(ArgNum, -1); |
| 1225 | |
| 1226 | for (int32_t I = 0; I < ArgNum; ++I) { |
| 1227 | if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { |
| 1228 | // This is not a target parameter, do not push it into TgtArgs. |
| 1229 | // Check for lambda mapping. |
| 1230 | if (isLambdaMapping(Mapping: ArgTypes[I])) { |
| 1231 | assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && |
| 1232 | "PTR_AND_OBJ must be also MEMBER_OF." ); |
| 1233 | unsigned Idx = getParentIndex(Type: ArgTypes[I]); |
| 1234 | int TgtIdx = TgtArgsPositions[Idx]; |
| 1235 | assert(TgtIdx != -1 && "Base address must be translated already." ); |
| 1236 | // The parent lambda must be processed already and it must be the last |
| 1237 | // in TgtArgs and TgtOffsets arrays. |
| 1238 | void *HstPtrVal = Args[I]; |
| 1239 | void *HstPtrBegin = ArgBases[I]; |
| 1240 | void *HstPtrBase = Args[Idx]; |
| 1241 | void *TgtPtrBase = |
| 1242 | (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]); |
| 1243 | DP("Parent lambda base " DPxMOD "\n" , DPxPTR(TgtPtrBase)); |
| 1244 | uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; |
| 1245 | void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); |
| 1246 | void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation(); |
| 1247 | TargetPointerResultTy TPR = |
| 1248 | DeviceOrErr->getMappingInfo().getTgtPtrBegin( |
| 1249 | HstPtrVal, ArgSizes[I], /*UpdateRefCount=*/false, |
| 1250 | /*UseHoldRefCount=*/false); |
| 1251 | PointerTgtPtrBegin = TPR.TargetPointer; |
| 1252 | if (!TPR.isPresent()) { |
| 1253 | DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n" , |
| 1254 | DPxPTR(HstPtrVal)); |
| 1255 | continue; |
| 1256 | } |
| 1257 | if (TPR.Flags.IsHostPointer) { |
| 1258 | DP("Unified memory is active, no need to map lambda captured" |
| 1259 | "variable (" DPxMOD ")\n" , |
| 1260 | DPxPTR(HstPtrVal)); |
| 1261 | continue; |
| 1262 | } |
| 1263 | DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n" , |
| 1264 | DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); |
| 1265 | Ret = |
| 1266 | DeviceOrErr->submitData(TgtPtrBegin, &PointerTgtPtrBegin, |
| 1267 | sizeof(void *), AsyncInfo, TPR.getEntry()); |
| 1268 | if (Ret != OFFLOAD_SUCCESS) { |
| 1269 | REPORT("Copying data to device failed.\n" ); |
| 1270 | return OFFLOAD_FAIL; |
| 1271 | } |
| 1272 | } |
| 1273 | continue; |
| 1274 | } |
| 1275 | void *HstPtrBegin = Args[I]; |
| 1276 | void *HstPtrBase = ArgBases[I]; |
| 1277 | void *TgtPtrBegin; |
| 1278 | map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; |
| 1279 | ptrdiff_t TgtBaseOffset; |
| 1280 | TargetPointerResultTy TPR; |
| 1281 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) { |
| 1282 | DP("Forwarding first-private value " DPxMOD " to the target construct\n" , |
| 1283 | DPxPTR(HstPtrBase)); |
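      // A by-value (literal) argument travels in the argument slot itself, so
      // no device allocation or data transfer is needed.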
| 1284 | TgtPtrBegin = HstPtrBase; |
| 1285 | TgtBaseOffset = 0; |
| 1286 | } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { |
| 1287 | TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; |
| 1288 | const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO); |
| 1289 | // If there is a next argument and it depends on the current one, we need |
| 1290 | // to allocate the private memory immediately. If this is not the case, |
| 1291 | // then the argument can be marked for optimization and packed with the |
| 1292 | // other privates. |
| 1293 | const bool AllocImmediately = |
| 1294 | (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF)); |
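      // Illustrative scenario (assumed, not taken from this file): a
      // firstprivate struct immediately followed by a MEMBER_OF entry that
      // attaches one of its pointer members. The member entry needs the
      // struct's device address, so the struct must be allocated right away.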
| 1295 | Ret = PrivateArgumentManager.addArg( |
| 1296 | HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin, |
| 1297 | TgtArgs.size(), HstPtrName, AllocImmediately); |
| 1298 | if (Ret != OFFLOAD_SUCCESS) { |
| 1299 | REPORT("Failed to process %sprivate argument " DPxMOD "\n" , |
| 1300 | (IsFirstPrivate ? "first-" : "" ), DPxPTR(HstPtrBegin)); |
| 1301 | return OFFLOAD_FAIL; |
| 1302 | } |
| 1303 | } else { |
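      // For PTR_AND_OBJ entries the incoming base is the address of the host
      // pointer itself; dereference once to obtain the mapped pointee's base.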
| 1304 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) |
| 1305 | HstPtrBase = *reinterpret_cast<void **>(HstPtrBase); |
| 1306 | TPR = DeviceOrErr->getMappingInfo().getTgtPtrBegin( |
| 1307 | HstPtrBegin, ArgSizes[I], |
| 1308 | /*UpdateRefCount=*/false, |
| 1309 | /*UseHoldRefCount=*/false); |
| 1310 | TgtPtrBegin = TPR.TargetPointer; |
| 1311 | TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; |
| 1312 | #ifdef OMPTARGET_DEBUG |
| 1313 | void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); |
| 1314 | DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n" , |
| 1315 | DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); |
| 1316 | #endif |
| 1317 | } |
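    // Record where this argument lands in TgtArgs so later MEMBER_OF entries
    // (e.g. lambda captures) can find their parent's translated address.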
| 1318 | TgtArgsPositions[I] = TgtArgs.size(); |
| 1319 | TgtArgs.push_back(TgtPtrBegin); |
| 1320 | TgtOffsets.push_back(TgtBaseOffset); |
| 1321 | } |
| 1322 | |
  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");
| 1325 | |
| 1326 | // Pack and transfer first-private arguments |
| 1327 | Ret = PrivateArgumentManager.packAndTransfer(TgtArgs); |
| 1328 | if (Ret != OFFLOAD_SUCCESS) { |
| 1329 | DP("Failed to pack and transfer first private arguments\n" ); |
| 1330 | return OFFLOAD_FAIL; |
| 1331 | } |
| 1332 | |
| 1333 | return OFFLOAD_SUCCESS; |
| 1334 | } |
| 1335 | |
| 1336 | /// Process data after launching the kernel, including transferring data back to |
| 1337 | /// host if needed and deallocating target memory of (first-)private variables. |
| 1338 | static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr, |
| 1339 | int32_t ArgNum, void **ArgBases, void **Args, |
| 1340 | int64_t *ArgSizes, int64_t *ArgTypes, |
| 1341 | map_var_info_t *ArgNames, void **ArgMappers, |
| 1342 | PrivateArgumentManagerTy &PrivateArgumentManager, |
| 1343 | AsyncInfoTy &AsyncInfo) { |
| 1344 | |
| 1345 | auto DeviceOrErr = PM->getDevice(DeviceId); |
| 1346 | if (!DeviceOrErr) |
| 1347 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
| 1348 | |
| 1349 | // Move data from device. |
| 1350 | int Ret = targetDataEnd(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes, |
| 1351 | ArgTypes, ArgNames, ArgMappers, AsyncInfo); |
| 1352 | if (Ret != OFFLOAD_SUCCESS) { |
| 1353 | REPORT("Call to targetDataEnd failed, abort target.\n" ); |
| 1354 | return OFFLOAD_FAIL; |
| 1355 | } |
| 1356 | |
| 1357 | // Free target memory for private arguments after synchronization. |
| 1358 | // TODO: We might want to remove `mutable` in the future by not changing the |
| 1359 | // captured variables somehow. |
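  // The manager is moved into the closure so the target pointers stay alive
  // until the deferred free runs; the lambda is mutable because free()
  // mutates the captured manager's pointer list.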
| 1360 | AsyncInfo.addPostProcessingFunction( |
| 1361 | [PrivateArgumentManager = |
| 1362 | std::move(PrivateArgumentManager)]() mutable -> int { |
| 1363 | int Ret = PrivateArgumentManager.free(); |
| 1364 | if (Ret != OFFLOAD_SUCCESS) { |
| 1365 | REPORT("Failed to deallocate target memory for private args\n" ); |
| 1366 | return OFFLOAD_FAIL; |
| 1367 | } |
| 1368 | return Ret; |
| 1369 | }); |
| 1370 | |
| 1371 | return OFFLOAD_SUCCESS; |
| 1372 | } |
| 1373 | } // namespace |
| 1374 | |
/// Performs the same actions as data_begin when arg_num is non-zero and
/// initiates the run of the offloaded region on the target platform; if
/// arg_num is non-zero, after the region execution is done it also performs
/// the same actions as data_update and data_end above. This function returns
/// zero if it was able to transfer the execution to a target and a non-zero
/// integer otherwise.
| 1381 | int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, |
| 1382 | KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) { |
| 1383 | int32_t DeviceId = Device.DeviceID; |
| 1384 | TableMap *TM = getTableMap(HostPtr); |
| 1385 | // No map for this host pointer found! |
| 1386 | if (!TM) { |
| 1387 | REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n" , |
| 1388 | DPxPTR(HostPtr)); |
| 1389 | return OFFLOAD_FAIL; |
| 1390 | } |
| 1391 | |
  // Get the target table.
| 1393 | __tgt_target_table *TargetTable = nullptr; |
| 1394 | { |
| 1395 | std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); |
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
| 1398 | TargetTable = TM->Table->TargetsTable[DeviceId]; |
| 1399 | } |
  assert(TargetTable && "Global data has not been mapped\n");
| 1401 | |
| 1402 | DP("loop trip count is %" PRIu64 ".\n" , KernelArgs.Tripcount); |
| 1403 | |
| 1404 | // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we |
| 1405 | // need to manifest base pointers prior to launching a kernel. Even if we have |
| 1406 | // mapped an object only partially, e.g. A[N:M], although the kernel is |
| 1407 | // expected to access elements starting at address &A[N] and beyond, we still |
| 1408 | // need to manifest the base of the array &A[0]. In other cases, e.g. the COI |
| 1409 | // API, we need the begin address itself, i.e. &A[N], as the API operates on |
| 1410 | // begin addresses, not bases. That's why we pass args and offsets as two |
| 1411 | // separate entities so that each plugin can do what it needs. This behavior |
| 1412 | // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c. |
| 1413 | SmallVector<void *> TgtArgs; |
| 1414 | SmallVector<ptrdiff_t> TgtOffsets; |
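  // For illustration (hypothetical plugin-side use, not code in this file): a
  // plugin that needs base pointers can reconstruct them as
  //   void *Base = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
  // while a plugin that operates on begin addresses uses TgtArgs[I] directly.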
| 1415 | |
| 1416 | PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo); |
| 1417 | |
| 1418 | int NumClangLaunchArgs = KernelArgs.NumArgs; |
| 1419 | int Ret = OFFLOAD_SUCCESS; |
| 1420 | if (NumClangLaunchArgs) { |
| 1421 | // Process data, such as data mapping, before launching the kernel |
| 1422 | Ret = processDataBefore(Loc, DeviceId, HostPtr, NumClangLaunchArgs, |
| 1423 | KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs, |
| 1424 | KernelArgs.ArgSizes, KernelArgs.ArgTypes, |
| 1425 | KernelArgs.ArgNames, KernelArgs.ArgMappers, TgtArgs, |
| 1426 | TgtOffsets, PrivateArgumentManager, AsyncInfo); |
| 1427 | if (Ret != OFFLOAD_SUCCESS) { |
| 1428 | REPORT("Failed to process data before launching the kernel.\n" ); |
| 1429 | return OFFLOAD_FAIL; |
| 1430 | } |
| 1431 | |
    // Clang might pass more values via the ArgPtrs to the runtime than we
    // pass on to the kernel.
    // TODO: Next time we adjust the KernelArgsTy we should introduce a new
    // NumKernelArgs field.
| 1436 | KernelArgs.NumArgs = TgtArgs.size(); |
| 1437 | } |
| 1438 | |
| 1439 | // Launch device execution. |
| 1440 | void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].Address; |
| 1441 | DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n" , |
| 1442 | TargetTable->EntriesBegin[TM->Index].SymbolName, DPxPTR(TgtEntryPtr), |
| 1443 | TM->Index); |
| 1444 | |
| 1445 | { |
    assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
| 1447 | TIMESCOPE_WITH_DETAILS_AND_IDENT( |
| 1448 | "Kernel Target" , |
| 1449 | "NumArguments=" + std::to_string(KernelArgs.NumArgs) + |
| 1450 | ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) + |
| 1451 | ";TripCount=" + std::to_string(KernelArgs.Tripcount), |
| 1452 | Loc); |
| 1453 | |
| 1454 | #ifdef OMPT_SUPPORT |
| 1455 | /// RAII to establish tool anchors before and after kernel launch |
| 1456 | int32_t NumTeams = KernelArgs.NumTeams[0]; |
| 1457 | // No need to guard this with OMPT_IF_BUILT |
| 1458 | InterfaceRAII TargetSubmitRAII( |
| 1459 | RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams); |
| 1460 | #endif |
| 1461 | |
| 1462 | Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(), |
| 1463 | KernelArgs, AsyncInfo); |
| 1464 | } |
| 1465 | |
| 1466 | if (Ret != OFFLOAD_SUCCESS) { |
| 1467 | REPORT("Executing target region abort target.\n" ); |
| 1468 | return OFFLOAD_FAIL; |
| 1469 | } |
| 1470 | |
| 1471 | if (NumClangLaunchArgs) { |
| 1472 | // Transfer data back and deallocate target memory for (first-)private |
| 1473 | // variables |
| 1474 | Ret = processDataAfter(Loc, DeviceId, HostPtr, NumClangLaunchArgs, |
| 1475 | KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs, |
| 1476 | KernelArgs.ArgSizes, KernelArgs.ArgTypes, |
| 1477 | KernelArgs.ArgNames, KernelArgs.ArgMappers, |
| 1478 | PrivateArgumentManager, AsyncInfo); |
| 1479 | if (Ret != OFFLOAD_SUCCESS) { |
| 1480 | REPORT("Failed to process data after launching the kernel.\n" ); |
| 1481 | return OFFLOAD_FAIL; |
| 1482 | } |
| 1483 | } |
| 1484 | |
| 1485 | return OFFLOAD_SUCCESS; |
| 1486 | } |
| 1487 | |
/// Enables the record-replay mechanism by pre-allocating MemorySize bytes of
/// device memory and informing the record-replayer whether to store the
/// output in a file.
| 1491 | int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr, |
| 1492 | bool IsRecord, bool SaveOutput, |
| 1493 | uint64_t &ReqPtrArgOffset) { |
| 1494 | return Device.RTL->initialize_record_replay(Device.DeviceID, MemorySize, |
| 1495 | VAddr, IsRecord, SaveOutput, |
| 1496 | ReqPtrArgOffset); |
| 1497 | } |
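
// For illustration only (hypothetical values, not from this file): record a
// target region into a pre-allocated 1 GiB device buffer and save the kernel
// output for a later replay run:
//   uint64_t ReqPtrArgOffset = 0;
//   target_activate_rr(Device, /*MemorySize=*/1ull << 30, /*VAddr=*/nullptr,
//                      /*IsRecord=*/true, /*SaveOutput=*/true,
//                      ReqPtrArgOffset);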
| 1498 | |
/// Executes a kernel by loading a pre-recorded device memory image and
/// launching the target kernel with the pre-recorded configuration.
| 1502 | int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, |
| 1503 | void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs, |
| 1504 | ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, |
| 1505 | int32_t ThreadLimit, uint64_t LoopTripCount, |
| 1506 | AsyncInfoTy &AsyncInfo) { |
| 1507 | int32_t DeviceId = Device.DeviceID; |
| 1508 | TableMap *TM = getTableMap(HostPtr); |
| 1509 | // Fail if the table map fails to find the target kernel pointer for the |
| 1510 | // provided host pointer. |
| 1511 | if (!TM) { |
| 1512 | REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n" , |
| 1513 | DPxPTR(HostPtr)); |
| 1514 | return OFFLOAD_FAIL; |
| 1515 | } |
| 1516 | |
| 1517 | // Retrieve the target table of offloading entries. |
| 1518 | __tgt_target_table *TargetTable = nullptr; |
| 1519 | { |
| 1520 | std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); |
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
| 1523 | TargetTable = TM->Table->TargetsTable[DeviceId]; |
| 1524 | } |
  assert(TargetTable && "Global data has not been mapped\n");
| 1526 | |
| 1527 | // Retrieve the target kernel pointer, allocate and store the recorded device |
| 1528 | // memory data, and launch device execution. |
| 1529 | void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].Address; |
| 1530 | DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n" , |
| 1531 | TargetTable->EntriesBegin[TM->Index].SymbolName, DPxPTR(TgtEntryPtr), |
| 1532 | TM->Index); |
| 1533 | |
  void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
                                  TARGET_ALLOC_DEFAULT);
  if (!TgtPtr) {
    REPORT("Failed to allocate device memory for the replay image.\n");
    return OFFLOAD_FAIL;
  }
  if (Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo) !=
      OFFLOAD_SUCCESS) {
    REPORT("Copying the recorded device memory image failed.\n");
    return OFFLOAD_FAIL;
  }
| 1537 | |
| 1538 | KernelArgsTy KernelArgs{}; |
| 1539 | KernelArgs.Version = OMP_KERNEL_ARG_VERSION; |
| 1540 | KernelArgs.NumArgs = NumArgs; |
| 1541 | KernelArgs.Tripcount = LoopTripCount; |
| 1542 | KernelArgs.NumTeams[0] = NumTeams; |
| 1543 | KernelArgs.ThreadLimit[0] = ThreadLimit; |
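  // Replay records only the first (x) dimension of the teams and thread-limit
  // triples; KernelArgs{} above zero-initialized the remaining dimensions.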
| 1544 | |
| 1545 | int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs, |
| 1546 | AsyncInfo); |
| 1547 | |
| 1548 | if (Ret != OFFLOAD_SUCCESS) { |
| 1549 | REPORT("Executing target region abort target.\n" ); |
| 1550 | return OFFLOAD_FAIL; |
| 1551 | } |
| 1552 | |
| 1553 | return OFFLOAD_SUCCESS; |
| 1554 | } |
| 1555 | |