//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "OffloadPolicy.h"
#include "OpenMP/OMPT/Callback.h"
#include "OpenMP/OMPT/Interface.h"
#include "PluginManager.h"
#include "Shared/Debug.h"
#include "Shared/EnvironmentVar.h"
#include "Shared/Utils.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include "Shared/Profile.h"

#include "OpenMP/Mapping.h"
#include "OpenMP/omp.h"

#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/bit.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Object/ObjectFile.h"

#include <cassert>
#include <cstdint>
#include <vector>

using llvm::SmallVector;
#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
#endif

int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (!isQueueEmpty()) {
    switch (SyncType) {
    case SyncTy::BLOCKING:
      // If we have a queue we need to synchronize it now.
      Result = Device.synchronize(*this);
      assert(AsyncInfo.Queue == nullptr &&
             "The device plugin should have nulled the queue to indicate there "
             "are no outstanding actions!");
      break;
    case SyncTy::NON_BLOCKING:
      Result = Device.queryAsync(*this);
      break;
    }
  }

  // Run any pending post-processing function registered on this async object.
  if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
    Result = runPostProcessing();

  return Result;
}

void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }

int32_t AsyncInfoTy::runPostProcessing() {
  size_t Size = PostProcessingFunctions.size();
  for (size_t I = 0; I < Size; ++I) {
    const int Result = PostProcessingFunctions[I]();
    if (Result != OFFLOAD_SUCCESS)
      return Result;
  }

  // Clear the vector up until the last known function, since post-processing
  // procedures might add new procedures themselves.
  const auto *PrevBegin = PostProcessingFunctions.begin();
  PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);

  return OFFLOAD_SUCCESS;
}

bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }

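// Illustrative usage sketch (not part of the runtime; issueTransfers and
// releaseStagingBuffers are hypothetical): callers enqueue asynchronous
// device work on an AsyncInfoTy, register deferred cleanup, and let
// synchronize() run the post-processing hooks once the queue drains.
//
//   AsyncInfoTy AsyncInfo(Device);
//   issueTransfers(AsyncInfo);          // enqueues work on AsyncInfo.Queue
//   AsyncInfo.addPostProcessingFunction([=]() -> int {
//     releaseStagingBuffers();          // only runs after the queue is empty
//     return OFFLOAD_SUCCESS;
//   });
//   if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;
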
/* All begin addresses for partially mapped structs must be aligned, up to 16,
 * in order to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200
 * and the address of p will be a misaligned 0x204 (on the host there was no
 * need to add padding between b and p, so p comes exactly 4 bytes after b).
 * If the device kernel tries to access s1.p, a misaligned address error
 * occurs (as reported by the CUDA plugin). By padding the begin address down
 * to a multiple of 8 and extending the size of the allocated chunk
 * accordingly, the chunk on the device will start at 0x200 with the padding
 * (4 bytes), then &s1.b=0x204 and &s1.p=0x208, as they should be to satisfy
 * the alignment requirements.
 */
static const int64_t MaxAlignment = 16;

/// Return the alignment requirement of partially mapped structs, see
/// MaxAlignment above.
static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
  int LowestOneBit = __builtin_ffsl(reinterpret_cast<uintptr_t>(HstPtrBase));
  uint64_t BaseAlignment = 1 << (LowestOneBit - 1);
  return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
}
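
// Worked example (illustrative): for HstPtrBase = 0x104, __builtin_ffsl
// returns 3 (the lowest set bit is bit 2), so BaseAlignment = 1 << 2 = 4 and
// the function returns min(16, 4) = 4. For a 64-byte-aligned base such as
// 0x140, BaseAlignment = 64 and the result is capped at MaxAlignment = 16.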

void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (OffloadPolicy::get(*PM).Kind) {
  case OffloadPolicy::DISABLED:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case OffloadPolicy::MANDATORY:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
          dumpTargetPointerMappings(Loc, Device);
      } else
        FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
                        "for debugging options.\n");

      if (!PM->getNumActivePlugins()) {
        FAILURE_MESSAGE(
            "No images found compatible with the installed hardware. ");

        llvm::SmallVector<llvm::StringRef> Archs;
        for (auto &Image : PM->deviceImages()) {
          const char *Start = reinterpret_cast<const char *>(
              Image.getExecutableImage().ImageStart);
          uint64_t Length =
              utils::getPtrDiff(Start, Image.getExecutableImage().ImageEnd);
          llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
                                       /*Identifier=*/"");

          auto ObjectOrErr = llvm::object::ObjectFile::createObjectFile(Buffer);
          if (auto Err = ObjectOrErr.takeError()) {
            llvm::consumeError(std::move(Err));
            continue;
          }

          if (auto CPU = (*ObjectOrErr)->tryGetCPUName())
            Archs.push_back(*CPU);
        }
        fprintf(stderr, "Found %zu image(s): (%s)\n", Archs.size(),
                llvm::join(Archs, ",").c_str());
      }

      SourceInfo Info(Loc);
      if (Info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
                Info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
          dumpTargetPointerMappings(Loc, Device);
      }
    }
    break;
  }
}

static int32_t getParentIndex(int64_t Type) {
  return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}
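
// Encoding note: the upper 16 bits of a map-type flag (the MEMBER_OF field)
// hold the 1-based position of the parent entry in the argument list, so
// shifting right by 48 and subtracting one yields the parent's 0-based index.
// An entry that is not a member of any struct has these bits clear, and
// getParentIndex returns -1 for it.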

void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                          const char *Name) {
  DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *Rc = NULL;

  if (DeviceNum == omp_get_initial_device()) {
    Rc = malloc(Size);
    DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
    return Rc;
  }

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  Rc = DeviceOrErr->allocData(Size, nullptr, Kind);
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
  return Rc;
}

void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                        const char *Name) {
  DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
     DPxPTR(DevicePtr));

  if (!DevicePtr) {
    DP("Call to %s with NULL ptr\n", Name);
    return;
  }

  if (DeviceNum == omp_get_initial_device()) {
    free(DevicePtr);
    DP("%s deallocated host ptr\n", Name);
    return;
  }

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL)
    FATAL_MESSAGE(DeviceNum, "%s",
                  "Failed to deallocate device ptr. Set "
                  "OFFLOAD_TRACK_ALLOCATION_TRACES=1 to track allocations.");

  DP("omp_target_free deallocated device ptr\n");
}

void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                         const char *Name) {
  DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *RC = NULL;

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  int32_t Err = 0;
  Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC);
  if (Err) {
    DP("Could not lock ptr %p\n", HostPtr);
    return nullptr;
  }
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(RC));
  return RC;
}

void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
  DP("Call to %s for device %d unlocking\n", Name, DeviceNum);

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr);
  DP("%s returns\n", Name);
}

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                     int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
                     void *ArgMapper, AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy TargetDataFunction) {
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
  (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
                   ArgNames);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  SmallVector<void *> MapperArgsBase(MapperComponents.Components.size());
  SmallVector<void *> MapperArgs(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  SmallVector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
                              MapperArgsBase.data(), MapperArgs.data(),
                              MapperArgSizes.data(), MapperArgTypes.data(),
                              MapperArgNames.data(), /*arg_mappers*/ nullptr,
                              AsyncInfo, /*FromMapper=*/true);

  return Rc;
}
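
// Illustrative example (user code, not part of the runtime): the ArgMapper
// passed above is the compiler-generated function for a user-declared mapper,
// e.g.
//
//   struct Vec { int Len; double *Data; };
//   #pragma omp declare mapper(Vec V) map(V, V.Data[0 : V.Len])
//   #pragma omp target map(tofrom : AVec) // expands via the declared mapper
//
// For each list item the generated function appends one component (base,
// begin, size, type, name) to MapperComponents, and the components are then
// replayed through targetDataBegin/End/Update as if they had been written as
// explicit map clauses.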

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                    void **ArgsBase, void **Args, int64_t *ArgSizes,
                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                    void **ArgMappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;
    TIMESCOPE_WITH_DETAILS_AND_IDENT(
        "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                targetDataBegin);

      if (Rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgsBase[I];
    int64_t DataSize = ArgSizes[I];
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t TgtPadding = 0;
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase);
      TgtPadding = (int64_t)HstPtrBegin % Alignment;
      if (TgtPadding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           TgtPadding, DPxPTR(HstPtrBegin));
      }
    }

    // Address of pointer on the host and device, respectively.
    void *PointerHstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy PointerTpr;
    bool IsHostPtr = false;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters).
    // This may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef =
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);

    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
        Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry:\n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      PointerTpr = Device.getMappingInfo().getTargetPointer(
          HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0, sizeof(void *),
          /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
          /*OwnedTPR=*/nullptr, /*ReleaseHDTTMap=*/false);
      PointerTgtPtrBegin = PointerTpr.TargetPointer;
      IsHostPtr = PointerTpr.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (PointerTpr.Flags.IsNewEntry ? "" : " not"));
      PointerHstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || I != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    // Note that HDTTMap will be released in getTargetPointer.
    auto TPR = Device.getMappingInfo().getTargetPointer(
        HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
        HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getTargetPointer() returning NULL is not an error.
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      ArgsBase[I] = TgtPtrBase;
    }

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
              (void **)PointerHstPtrBegin, HstPtrBase,
              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret =
            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
                              AsyncInfo, PointerTpr.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Check if variable can be used on the device:
    bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF;
    if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 &&
        !IsStructMember && !IsImplicit && !TPR.isPresent() &&
        !TPR.isContained() && !TPR.isHostPointer())
      INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID,
           "variable %s does not have a valid device counterpart\n",
           (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
  }

  return OFFLOAD_SUCCESS;
}
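
// End-to-end illustration (hedged; the exact flag patterns are produced by
// Clang): for
//
//   #pragma omp target map(tofrom : s.x, s.p[0 : N])
//
// the argument list typically starts with a combined entry covering the
// mapped part of s, followed by MEMBER_OF entries for s.x and s.p and a
// PTR_AND_OBJ entry for the pointee s.p[0:N]. The loop above then allocates
// the struct chunk (with the padding described near MaxAlignment), transfers
// the TO parts, and records a shadow pointer so that the device copy of s.p
// points at the device-side pointee.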

namespace {
/// This structure contains information to deallocate a target pointer, i.e.,
/// it is used to fix up the shadow map and potentially delete the entry from
/// the mapping table via \p DeviceTy::deallocTgtPtr.
struct PostProcessingInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;

  /// Size of the data
  int64_t DataSize;

  /// The mapping type (bitfield).
  int64_t ArgType;

  /// The target pointer information.
  TargetPointerResultTy TPR;

  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
                     TargetPointerResultTy &&TPR)
      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
        TPR(std::move(TPR)) {}
};

} // namespace

/// Applies the necessary post-processing procedures to entries listed in \p
/// EntriesInfo after the execution of all device side operations from a target
/// data end. This includes the update of pointers at the host and the removal
/// of the device buffer when needed. It returns OFFLOAD_FAIL or
/// OFFLOAD_SUCCESS according to the success of the operations.
[[nodiscard]] static int
postProcessingTargetDataEnd(DeviceTy *Device,
                            SmallVector<PostProcessingInfo> &EntriesInfo) {
  int Ret = OFFLOAD_SUCCESS;

  for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
    bool DelEntry = !TPR.isHostPointer();

    // If this is the last element from the mapper (end-transfer arguments
    // come in reverse order), do not remove the partial entry; the parent
    // struct still exists.
    if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    // If we marked the entry to be deleted we need to verify no other
    // thread reused it by now. If deletion is still supposed to happen by
    // this thread LR will be set and exclusive access to the HDTT map
    // will avoid another thread reusing the entry now. Note that we do
    // not request (exclusive) access to the HDTT map if DelEntry is
    // not set.
    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
        Device->getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();

    // We cannot use a lock guard because we may end up deleting the mutex.
    // We also explicitly unlocked the entry after it was put into EntriesInfo
    // so it can be reused.
    TPR.getEntry()->lock();
    auto *Entry = TPR.getEntry();

    const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
    if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
      // The thread is not in charge of deletion anymore. Give up access
      // to the HDTT map and unset the deletion flag.
      HDTTMap.destroy();
      DelEntry = false;
    }

    // If we copied back to the host a struct/array containing pointers,
    // we need to restore the original host pointer values from their
    // shadow copies. If the struct is going to be deallocated, remove any
    // remaining shadow pointer entries for this struct.
    const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom) {
      Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
        DP("Restoring original host pointer value " DPxMOD " for host "
           "pointer " DPxMOD "\n",
           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
        return OFFLOAD_SUCCESS;
      });
    }

    // Give up the lock as we either don't need it anymore (e.g., done with
    // TPR), or erase TPR.
    TPR.setEntry(nullptr);

    if (!DelEntry)
      continue;

    Ret = Device->getMappingInfo().eraseMapEntry(HDTTMap, Entry, DataSize);
    // The entry is already removed from the map, we can unlock it now.
    HDTTMap.destroy();
    Ret |= Device->getMappingInfo().deallocTgtPtrAndEntry(Entry, DataSize);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      break;
    }
  }

  delete &EntriesInfo;
  return Ret;
}

/// Internal function to undo the mapping and retrieve the data from the
/// device.
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret = OFFLOAD_SUCCESS;
  auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
        HstPtrBegin, DataSize, UpdateRef, HasHoldModifier, !IsImplicit,
        ForceDelete, /*FromDataEnd=*/true);
    void *TgtPtrBegin = TPR.TargetPointer;
    if (!TPR.isPresent() && !TPR.isHostPointer() &&
        (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not appear
        // in the device data environment then an error occurs and the program
        // terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TPR.isPresent())
      continue;

    // Move data back to the host
    const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom && (HasAlways || TPR.Flags.IsLast) &&
        !TPR.Flags.IsHostPointer && DataSize != 0) {
      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
         DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
      TIMESCOPE_WITH_DETAILS_AND_IDENT(
          "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
      // Wait for any previous transfer if an event is present.
      if (void *Event = TPR.getEntry()->getEvent()) {
        if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
          REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
          return OFFLOAD_FAIL;
        }
      }

      Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo,
                                TPR.getEntry());
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data from device failed.\n");
        return OFFLOAD_FAIL;
      }

      // As we are expecting to delete the entry the d2h copy might race
      // with another one that also tries to delete the entry. This happens
      // as the entry can be reused and the reuse might happen after the
      // copy-back was issued but before it completed. Since the reuse might
      // also copy-back a value we would race.
      if (TPR.Flags.IsLast) {
        if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Add pointer to the buffer for post-synchronize processing.
    PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
                                     std::move(TPR));
    PostProcessingPtrs->back().TPR.getEntry()->unlock();
  }

  // Add post-processing functions
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int {
    return postProcessingTargetDataEnd(Device, *PostProcessingPtrs);
  });

  return Ret;
}

static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
      HstPtrBegin, ArgSize, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TPR.isPresent()) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (TPR.Flags.IsHostPointer) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo,
                                TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }
    if (TPR.getEntry()) {
      int Ret = TPR.getEntry()->foreachShadowPointerInfo(
          [&](ShadowPtrInfoTy &ShadowPtr) {
            DP("Restoring original target pointer value " DPxMOD " for target "
               "pointer " DPxMOD "\n",
               DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
            Ret = Device.submitData(ShadowPtr.TgtPtrAddr,
                                    (void *)&ShadowPtr.TgtPtrVal,
                                    sizeof(void *), AsyncInfo);
            if (Ret != OFFLOAD_SUCCESS) {
              REPORT("Copying data to device failed.\n");
              return OFFLOAD_FAIL;
            }
            return OFFLOAD_SUCCESS;
          });
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Updating shadow map failed\n");
        return Ret;
      }
    }
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo,
                                  TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    // Wait for device-to-host memcopies for whole struct to complete,
    // before restoring the correct host pointer.
    if (auto *Entry = TPR.getEntry()) {
      AsyncInfo.addPostProcessingFunction([=]() -> int {
        int Ret = Entry->foreachShadowPointerInfo(
            [&](const ShadowPtrInfoTy &ShadowPtr) {
              *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
              DP("Restoring original host pointer value " DPxMOD
                 " for host pointer " DPxMOD "\n",
                 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
              return OFFLOAD_SUCCESS;
            });
        Entry->unlock();
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Updating shadow map failed\n");
          return Ret;
        }
        return OFFLOAD_SUCCESS;
      });
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}
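
// Worked example (illustrative): descriptors are ordered outermost first and
// the last entry describes the contiguous run. For a dense double A[5][5][5]
// the descriptor is {Offset=0, Count=5, Stride=200}, {0, 5, 40}, {0, 5, 8}
// (strides in bytes): 5 * 8 == 40 and 5 * 40 == 200, so both inner
// dimensions are mergeable and RemovedDim is 2. For a partial update such as
// A[0:2][1:3][0:4], the checks fail (4 * 8 != 40, 3 * 40 != 200) and each
// dimension must still be walked individually.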

/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}
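
// Background note (a summary of the Clang convention, as exercised in
// processDataBefore below): when a lambda is used inside a target region,
// Clang maps the lambda object itself and emits one additional implicit
// entry per pointer or reference capture, tagged PTR_AND_OBJ | LITERAL |
// IMPLICIT and MEMBER_OF the lambda's own entry. The flag combination above
// therefore identifies those capture fix-up entries, whose device-side
// pointers are patched in processDataBefore.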

namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // host_ptr.
    llvm::offloading::EntryTy *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->Address != HostPtr)
        continue;
      // we got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}
/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transfer them all at once.
  struct FirstPrivateArgInfoTy {
    /// Host pointer begin
    char *HstPtrBegin;
    /// Host pointer end
    char *HstPtrEnd;
    /// The index of the element in \p TgtArgs corresponding to the argument
    int Index;
    /// Alignment of the entry (base of the entry, not after the entry).
    uint32_t Alignment;
    /// Size (without alignment, see padding)
    uint32_t Size;
    /// Padding used to align this argument entry, if necessary.
    uint32_t Padding;
    /// Host pointer name
    map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
                          uint32_t Alignment, uint32_t Padding,
                          map_var_info_t HstPtrName = nullptr)
        : HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
          Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  SmallVector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  SmallVector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A pointer to a \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer it
  // immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet both of the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (i.e., it needs to be mapped to the target
      //    device).
      // We pack all such arguments in order to transfer them all at once and
      // thereby reduce the number of data transfers. We do not include
      // non-first-private arguments, i.e., private arguments that do not need
      // to be mapped to the target device, because plain data allocation can
      // be handled very efficiently by the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      auto *LastFPArgInfo =
          FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back();

      // Compute the start alignment of this entry, add padding if necessary.
      // TODO: Consider sorting instead.
      uint32_t Padding = 0;
      uint32_t StartAlignment =
          LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment;
      if (LastFPArgInfo) {
        // Check if we keep the start alignment or if it is shrunk due to the
        // size of the last element.
        uint32_t Offset = LastFPArgInfo->Size % StartAlignment;
        if (Offset)
          StartAlignment = Offset;
        // We only need as much alignment as the host pointer had (since we
        // don't know the alignment information from the source we might end up
        // overaligning accesses but not too much).
        uint32_t RequiredAlignment =
            llvm::bit_floor(getPartialStructRequiredAlignment(HstPtr));
        if (RequiredAlignment > StartAlignment) {
          Padding = RequiredAlignment - StartAlignment;
          StartAlignment = RequiredAlignment;
        }
      }

      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       StartAlignment, Padding, HstPtrName);
      FirstPrivateArgSize += Padding + ArgSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace placeholder pointers in \p TgtArgs,
  /// and start the transfer.
  int packAndTransfer(SmallVector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto *Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        // First pad the pointer as we (have to) pad it on the device too.
        Itr = std::next(Itr, Info.Padding);
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.Size);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        // Pad the device pointer to get the right alignment.
        TP += Info.Padding;
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.Size;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};
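
// Illustrative lifecycle (a sketch of how processDataBefore/After below drive
// the manager; HstA and the sizes are hypothetical):
//
//   SmallVector<void *> TgtArgs(1, nullptr); // placeholder for argument 0
//   PrivateArgumentManagerTy PAM(Device, AsyncInfo);
//   void *Tgt = nullptr;
//   PAM.addArg(HstA, /*ArgSize=*/64, /*ArgOffset=*/0,
//              /*IsFirstPrivate=*/true, Tgt, /*TgtArgsIndex=*/0);
//   // 64 <= FirstPrivateArgSizeThreshold: recorded for packing, Tgt remains
//   // a placeholder for now.
//   PAM.packAndTransfer(TgtArgs); // one allocation, one H2D copy; patches
//                                 // the placeholder in TgtArgs[0]
//   // ... launch the kernel ...
//   PAM.free(); // deferred to a post-processing hook in processDataAfter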

/// Process data before launching the kernel, including calling targetDataBegin
/// to map and transfer data to target device, transferring (first-)private
/// variables.
static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             SmallVector<void *> &TgtArgs,
                             SmallVector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {

  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // List of (first-)private arrays allocated for this target region
  SmallVector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        TargetPointerResultTy TPR =
            DeviceOrErr->getMappingInfo().getTgtPtrBegin(
                HstPtrVal, ArgSizes[I], /*UpdateRefCount=*/false,
                /*UseHoldRefCount=*/false);
        PointerTgtPtrBegin = TPR.TargetPointer;
        if (!TPR.isPresent()) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (TPR.Flags.IsHostPointer) {
          DP("Unified memory is active, no need to map lambda captured "
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret =
            DeviceOrErr->submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                    sizeof(void *), AsyncInfo, TPR.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    TargetPointerResultTy TPR;
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // other privates.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TPR = DeviceOrErr->getMappingInfo().getTgtPtrBegin(
          HstPtrBegin, ArgSizes[I],
          /*UpdateRefCount=*/false,
          /*UseHoldRefCount=*/false);
      TgtPtrBegin = TPR.TargetPointer;
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back
/// to host if needed and deallocating target memory of (first-)private
/// variables.
static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                            int32_t ArgNum, void **ArgBases, void **Args,
                            int64_t *ArgSizes, int64_t *ArgTypes,
                            map_var_info_t *ArgNames, void **ArgMappers,
                            PrivateArgumentManagerTy &PrivateArgumentManager,
                            AsyncInfoTy &AsyncInfo) {

  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  // Move data from device.
  int Ret = targetDataEnd(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments after synchronization.
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  AsyncInfo.addPostProcessingFunction(
      [PrivateArgumentManager =
           std::move(PrivateArgumentManager)]() mutable -> int {
        int Ret = PrivateArgumentManager.free();
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Failed to deallocate target memory for private args\n");
          return OFFLOAD_FAIL;
        }
        return Ret;
      });

  return OFFLOAD_SUCCESS;
}
} // namespace
/// Performs the same actions as data_begin in case arg_num is non-zero
/// and initiates the run of the offloaded region on the target platform;
/// if arg_num is non-zero, after the region execution is done it also
/// performs the same actions as data_update and data_end above. This function
/// returns 0 if it was able to transfer the execution to a target and an
/// integer different from zero otherwise.
int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
           KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  DP("loop trip count is %" PRIu64 ".\n", KernelArgs.Tripcount);

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we have
  // mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
  SmallVector<void *> TgtArgs;
  SmallVector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int NumClangLaunchArgs = KernelArgs.NumArgs;
  int Ret = OFFLOAD_SUCCESS;
  if (NumClangLaunchArgs) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                            KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                            KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                            KernelArgs.ArgNames, KernelArgs.ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }

    // Clang might pass more values via the ArgPtrs to the runtime than we pass
    // on to the kernel.
    // TODO: Next time we adjust the KernelArgsTy we should introduce a new
    // NumKernelArgs field.
    KernelArgs.NumArgs = TgtArgs.size();
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].Address;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].SymbolName, DPxPTR(TgtEntryPtr),
     TM->Index);

  {
    assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
    TIMESCOPE_WITH_DETAILS_AND_IDENT(
        "Kernel Target",
        "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
            ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
            ";TripCount=" + std::to_string(KernelArgs.Tripcount),
        Loc);

#ifdef OMPT_SUPPORT
    /// RAII to establish tool anchors before and after kernel launch
    int32_t NumTeams = KernelArgs.NumTeams[0];
    // No need to guard this with OMPT_IF_BUILT
    InterfaceRAII TargetSubmitRAII(
        RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams);
#endif

    Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
                              KernelArgs, AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (NumClangLaunchArgs) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                           KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                           KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                           KernelArgs.ArgNames, KernelArgs.ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}

/// Enables the record replay mechanism by pre-allocating MemorySize
/// and informing the record-replayer of whether to store the output
/// in some file.
int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
                       bool IsRecord, bool SaveOutput,
                       uint64_t &ReqPtrArgOffset) {
  return Device.RTL->initialize_record_replay(Device.DeviceID, MemorySize,
                                              VAddr, IsRecord, SaveOutput,
                                              ReqPtrArgOffset);
}

/// Executes a kernel using pre-recorded information for loading to
/// device memory to launch the target kernel with the pre-recorded
/// configuration.
int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                  void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
                  ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
                  int32_t ThreadLimit, uint64_t LoopTripCount,
                  AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  TableMap *TM = getTableMap(HostPtr);
  // Fail if the table map fails to find the target kernel pointer for the
  // provided host pointer.
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // Retrieve the target table of offloading entries.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // Retrieve the target kernel pointer, allocate and store the recorded device
  // memory data, and launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].Address;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].SymbolName, DPxPTR(TgtEntryPtr),
     TM->Index);

  void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
                                  TARGET_ALLOC_DEFAULT);
  Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);

  KernelArgsTy KernelArgs{};
  KernelArgs.Version = OMP_KERNEL_ARG_VERSION;
  KernelArgs.NumArgs = NumArgs;
  KernelArgs.Tripcount = LoopTripCount;
  KernelArgs.NumTeams[0] = NumTeams;
  KernelArgs.ThreadLimit[0] = ThreadLimit;

  int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs,
                                AsyncInfo);

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
