//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "OffloadPolicy.h"
#include "OpenMP/OMPT/Callback.h"
#include "OpenMP/OMPT/Interface.h"
#include "PluginManager.h"
#include "Shared/Debug.h"
#include "Shared/EnvironmentVar.h"
#include "Shared/Utils.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include "Shared/Profile.h"

#include "OpenMP/Mapping.h"
#include "OpenMP/omp.h"

#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/bit.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Object/ObjectFile.h"

#include <cassert>
#include <cstdint>
#include <vector>

using llvm::SmallVector;
#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
#endif

int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (!isQueueEmpty()) {
    switch (SyncType) {
    case SyncTy::BLOCKING:
      // If we have a queue we need to synchronize it now.
      Result = Device.synchronize(*this);
      assert(AsyncInfo.Queue == nullptr &&
             "The device plugin should have nulled the queue to indicate there "
             "are no outstanding actions!");
      break;
    case SyncTy::NON_BLOCKING:
      Result = Device.queryAsync(*this);
      break;
    }
  }

  // Run any pending post-processing function registered on this async object.
  if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
    Result = runPostProcessing();

  return Result;
}

void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }

int32_t AsyncInfoTy::runPostProcessing() {
  size_t Size = PostProcessingFunctions.size();
  for (size_t I = 0; I < Size; ++I) {
    const int Result = PostProcessingFunctions[I]();
    if (Result != OFFLOAD_SUCCESS)
      return Result;
  }

  // Clear the vector up until the last known function, since post-processing
  // procedures might add new procedures themselves.
  const auto *PrevBegin = PostProcessingFunctions.begin();
  PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);

  return OFFLOAD_SUCCESS;
}

bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }

/* All begin addresses for partially mapped structs must be aligned, up to 16,
 * in order to ensure proper alignment of members. E.g.
 *
 *   struct S {
 *     int a;  // 4-aligned
 *     int b;  // 4-aligned
 *     int *p; // 8-aligned
 *   } s1;
 *   ...
 *   #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 *   {
 *     s1.b = 5;
 *     for (int i...) s1.p[i] = ...;
 *   }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200
 * and the address of p will be a misaligned 0x204 (on the host there was no
 * need to add padding between b and p, so p comes exactly 4 bytes after b).
 * If the device kernel tries to access s1.p, a misaligned address error
 * occurs (as reported by the CUDA plugin). By padding the begin address down
 * to a multiple of 8 and extending the size of the allocated chunk
 * accordingly, the chunk on the device will start at 0x200 with the padding
 * (4 bytes), then &s1.b=0x204 and &s1.p=0x208, as they should be to satisfy
 * the alignment requirements.
 */
static const int64_t MaxAlignment = 16;

/// Return the alignment requirement of partially mapped structs, see
/// MaxAlignment above.
static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
  int LowestOneBit = __builtin_ffsl(reinterpret_cast<uintptr_t>(HstPtrBase));
  uint64_t BaseAlignment = 1 << (LowestOneBit - 1);
  return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
}
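
// For illustration of the computation above: a host base address of 0x104 has
// its lowest set bit at bit 2, so BaseAlignment = 4 and the function returns
// min(16, 4) = 4; a base address of 0x140 yields BaseAlignment = 64, which is
// clamped to MaxAlignment = 16.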

/// Map global data and execute pending ctors
static int initLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t DeviceId = Device.DeviceID;
  int Rc = OFFLOAD_SUCCESS;
  {
    std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
    for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
      TranslationTable *TransTable =
          &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
      if (TransTable->HostTable.EntriesBegin ==
          TransTable->HostTable.EntriesEnd) {
        // No host entry so no need to proceed.
        continue;
      }

      if (TransTable->TargetsTable[DeviceId] != 0) {
        // Library entries have already been processed.
        continue;
      }

      // 1) Get the image.
      assert(TransTable->TargetsImages.size() > (size_t)DeviceId &&
             "Not expecting a device ID outside the table's bounds!");
      __tgt_device_image *Img = TransTable->TargetsImages[DeviceId];
      if (!Img) {
        REPORT("No image loaded for device id %d.\n", DeviceId);
        Rc = OFFLOAD_FAIL;
        break;
      }

      // 2) Load the image onto the given device.
      auto BinaryOrErr = Device.loadBinary(Img);
      if (llvm::Error Err = BinaryOrErr.takeError()) {
        REPORT("Failed to load image %s\n",
               llvm::toString(std::move(Err)).c_str());
        Rc = OFFLOAD_FAIL;
        break;
      }

      // 3) Create the translation table.
      llvm::SmallVector<__tgt_offload_entry> &DeviceEntries =
          TransTable->TargetsEntries[DeviceId];
      for (__tgt_offload_entry &Entry :
           llvm::make_range(Img->EntriesBegin, Img->EntriesEnd)) {
        __tgt_device_binary &Binary = *BinaryOrErr;

        __tgt_offload_entry DeviceEntry = Entry;
        if (Entry.size) {
          if (Device.RTL->get_global(Binary, Entry.size, Entry.name,
                                     &DeviceEntry.addr) != OFFLOAD_SUCCESS)
            REPORT("Failed to load symbol %s\n", Entry.name);

          // If unified memory is active, the corresponding global is a device
          // reference to the host global. We need to initialize the pointer on
          // the device to point to the memory on the host.
          if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
              (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
            if (Device.RTL->data_submit(DeviceId, DeviceEntry.addr, Entry.addr,
                                        Entry.size) != OFFLOAD_SUCCESS)
              REPORT("Failed to write symbol for USM %s\n", Entry.name);
          }
        } else if (Entry.addr) {
          if (Device.RTL->get_function(Binary, Entry.name,
                                       &DeviceEntry.addr) != OFFLOAD_SUCCESS)
            REPORT("Failed to load kernel %s\n", Entry.name);
        }
        DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n",
           DPxPTR(Entry.addr), (Entry.size) ? " global" : "", Entry.name,
           DPxPTR(DeviceEntry.addr));

        DeviceEntries.emplace_back(DeviceEntry);
      }

      // Set the storage for the table and get a pointer to it.
      __tgt_target_table DeviceTable{&DeviceEntries[0],
                                     &DeviceEntries[0] + DeviceEntries.size()};
      TransTable->DeviceTables[DeviceId] = DeviceTable;
      __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
          &TransTable->DeviceTables[DeviceId];

      // 4) Verify whether the two table sizes match.
      size_t Hsize =
          TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
      size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

      // Invalid image for these host entries!
      if (Hsize != Tsize) {
        REPORT(
            "Host and Target tables mismatch for device id %d [%zx != %zx].\n",
            DeviceId, Hsize, Tsize);
        TransTable->TargetsImages[DeviceId] = 0;
        TransTable->TargetsTable[DeviceId] = 0;
        Rc = OFFLOAD_FAIL;
        break;
      }

      MappingInfoTy::HDTTMapAccessorTy HDTTMap =
          Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();

      __tgt_target_table *HostTable = &TransTable->HostTable;
      for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                               *CurrHostEntry = HostTable->EntriesBegin,
                               *EntryDeviceEnd = TargetTable->EntriesEnd;
           CurrDeviceEntry != EntryDeviceEnd;
           CurrDeviceEntry++, CurrHostEntry++) {
        if (CurrDeviceEntry->size == 0)
          continue;

        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular"
        // mapping. Add entry to map.
        if (Device.getMappingInfo().getTgtPtrBegin(HDTTMap, CurrHostEntry->addr,
                                                   CurrHostEntry->size))
          continue;

        void *CurrDeviceEntryAddr = CurrDeviceEntry->addr;

        // For indirect mapping, follow the indirection and map the actual
        // target.
        if (CurrDeviceEntry->flags & OMP_DECLARE_TARGET_INDIRECT) {
          AsyncInfoTy AsyncInfo(Device);
          void *DevPtr;
          Device.retrieveData(&DevPtr, CurrDeviceEntryAddr, sizeof(void *),
                              AsyncInfo, /*Entry=*/nullptr, &HDTTMap);
          if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
            return OFFLOAD_FAIL;
          CurrDeviceEntryAddr = DevPtr;
        }

        DP("Add mapping from host " DPxMOD " to device " DPxMOD
           " with size %zu, name \"%s\"\n",
           DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
           CurrDeviceEntry->size, CurrDeviceEntry->name);
        HDTTMap->emplace(new HostDataToTargetTy(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/,
            (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/,
            false /*UseHoldRefCount*/, CurrHostEntry->name,
            true /*IsRefCountINF*/));

        // Notify about the new mapping.
        if (Device.notifyDataMapped(CurrHostEntry->addr, CurrHostEntry->size))
          return OFFLOAD_FAIL;
      }
    }
  }

  if (Rc != OFFLOAD_SUCCESS)
    return Rc;

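  // If the OMPTARGET_DUMP_OFFLOAD_ENTRIES environment variable is set to the
  // ID of this device, dump the table of offload entries that was just
  // populated, e.g. OMPTARGET_DUMP_OFFLOAD_ENTRIES=0 dumps the entries of
  // device 0.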
  static Int32Envar DumpOffloadEntries =
      Int32Envar("OMPTARGET_DUMP_OFFLOAD_ENTRIES", -1);
  if (DumpOffloadEntries.get() == DeviceId)
    Device.dumpOffloadEntries();

  return OFFLOAD_SUCCESS;
}

void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (OffloadPolicy::get(*PM).Kind) {
  case OffloadPolicy::DISABLED:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case OffloadPolicy::MANDATORY:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
          dumpTargetPointerMappings(Loc, Device);
      } else
        FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
                        "for debugging options.\n");

      if (!PM->getNumUsedPlugins()) {
        FAILURE_MESSAGE(
            "No images found compatible with the installed hardware. ");

        llvm::SmallVector<llvm::StringRef> Archs;
        for (auto &Image : PM->deviceImages()) {
          const char *Start = reinterpret_cast<const char *>(
              Image.getExecutableImage().ImageStart);
          uint64_t Length = llvm::omp::target::getPtrDiff(
              Start, Image.getExecutableImage().ImageEnd);
          llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
                                       /*Identifier=*/"");

          auto ObjectOrErr = llvm::object::ObjectFile::createObjectFile(Buffer);
          if (auto Err = ObjectOrErr.takeError()) {
            llvm::consumeError(std::move(Err));
            continue;
          }

          if (auto CPU = (*ObjectOrErr)->tryGetCPUName())
            Archs.push_back(*CPU);
        }
        fprintf(stderr, "Found %zu image(s): (%s)\n", Archs.size(),
                llvm::join(Archs, ",").c_str());
      }

      SourceInfo Info(Loc);
      if (Info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
                Info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
          dumpTargetPointerMappings(Loc, Device);
      }
    }
    break;
  }
}

// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The returned bool indicates whether the offload is to the host device.
// There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (OffloadPolicy::get(*PM).Kind == OffloadPolicy::DISABLED) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0, but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  auto DeviceOrErr = PM->getDevice(DeviceID);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceID, "%s", toString(DeviceOrErr.takeError()).data());

  // Check whether global data has been mapped for this device.
  if (initLibrary(*DeviceOrErr) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  return false;
}

static int32_t getParentIndex(int64_t Type) {
  return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}
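
// The MEMBER_OF field occupies the upper 16 bits of the map-type word and
// stores the 1-based position of the parent argument (0 means "no parent",
// hence the -1 above yields a negative index). As an illustrative example, a
// map type of 0x0001000000000013 (TO | FROM | PTR_AND_OBJ, member of the
// first argument) gives ((0x0001000000000013 & MEMBER_OF) >> 48) - 1 = 0.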

void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                          const char *Name) {
  DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *Rc = NULL;

  if (DeviceNum == omp_get_initial_device()) {
    Rc = malloc(Size);
    DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
    return Rc;
  }

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  Rc = DeviceOrErr->allocData(Size, nullptr, Kind);
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
  return Rc;
}
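
// Note: API entry points such as omp_target_alloc() funnel into
// targetAllocExplicit; omp_target_alloc(Size, DeviceNum) requests default
// device memory (Kind == TARGET_ALLOC_DEFAULT), while the
// llvm_omp_target_alloc_{device,host,shared} extensions pass the
// corresponding TARGET_ALLOC_* kind.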

void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                        const char *Name) {
  DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
     DPxPTR(DevicePtr));

  if (!DevicePtr) {
    DP("Call to %s with NULL ptr\n", Name);
    return;
  }

  if (DeviceNum == omp_get_initial_device()) {
    free(DevicePtr);
    DP("%s deallocated host ptr\n", Name);
    return;
  }

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  DeviceOrErr->deleteData(DevicePtr, Kind);
  DP("omp_target_free deallocated device ptr\n");
}

void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                         const char *Name) {
  DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *RC = NULL;

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  int32_t Err = 0;
  Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC);
  if (Err) {
    DP("Could not lock ptr %p\n", HostPtr);
    return nullptr;
  }
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(RC));
  return RC;
}

void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
  DP("Call to %s for device %d unlocking\n", Name, DeviceNum);

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr);
  DP("%s returns\n", Name);
}

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                     int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
                     void *ArgMapper, AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy TargetDataFunction) {
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
  (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
                   ArgNames);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  SmallVector<void *> MapperArgsBase(MapperComponents.Components.size());
  SmallVector<void *> MapperArgs(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  SmallVector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
                              MapperArgsBase.data(), MapperArgs.data(),
                              MapperArgSizes.data(), MapperArgTypes.data(),
                              MapperArgNames.data(), /*arg_mappers*/ nullptr,
                              AsyncInfo, /*FromMapper=*/true);

  return Rc;
}
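
// As a sketch of how targetDataMapper is reached: a user-defined mapper such
// as
//
//   #pragma omp declare mapper(struct S s) map(to : s.n) \
//                                          map(tofrom : s.p[0:s.n])
//
// is lowered by the compiler to a mapper function that records one component
// (base, begin, size, type, name) per map clause; targetDataMapper then
// replays those components through targetData{Begin,End,Update} as if they
// had been written as plain map clauses.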

/// Internal function to do the mapping and transfer the data to the device.
int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                    void **ArgsBase, void **Args, int64_t *ArgSizes,
                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                    void **ArgMappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // Process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;
    TIMESCOPE_WITH_DETAILS_AND_IDENT(
        "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                targetDataBegin);

      if (Rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgsBase[I];
    int64_t DataSize = ArgSizes[I];
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t TgtPadding = 0;
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase);
      TgtPadding = (int64_t)HstPtrBegin % Alignment;
      if (TgtPadding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           TgtPadding, DPxPTR(HstPtrBegin));
      }
    }

    // Address of pointer on the host and device, respectively.
    void *PointerHstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy PointerTpr;
    bool IsHostPtr = false;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters).
    // This may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef =
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);

    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
        Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry:\n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      PointerTpr = Device.getMappingInfo().getTargetPointer(
          HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0, sizeof(void *),
          /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
          /*OwnedTPR=*/nullptr, /*ReleaseHDTTMap=*/false);
      PointerTgtPtrBegin = PointerTpr.TargetPointer;
      IsHostPtr = PointerTpr.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s "
         "new\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (PointerTpr.Flags.IsNewEntry ? "" : " not"));
      PointerHstPtrBegin = HstPtrBase;
      // Modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update the pointee ref count for the first element of the
      // subelement that comes from a mapper.
      UpdateRef =
          (!FromMapper || I != 0); // Subsequently update ref count of pointee.
    }

    const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    // Note that HDTTMap will be released in getTargetPointer.
    auto TPR = Device.getMappingInfo().getTargetPointer(
        HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
        HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      ArgsBase[I] = TgtPtrBase;
    }

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
              (void **)PointerHstPtrBegin, HstPtrBase,
              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret =
            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
                              AsyncInfo, PointerTpr.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Check if the variable can be used on the device:
    bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF;
    if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 &&
        !IsStructMember && !IsImplicit && !TPR.isPresent() &&
        !TPR.isContained() && !TPR.isHostPointer())
      INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID,
           "variable %s does not have a valid device counterpart\n",
           (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
  }

  return OFFLOAD_SUCCESS;
}

namespace {
/// This structure contains information to deallocate a target pointer, i.e.,
/// it is used to fix up the shadow map and potentially delete the entry from
/// the mapping table via \p DeviceTy::deallocTgtPtr.
struct PostProcessingInfo {
  /// Host pointer used to look up into the map table.
  void *HstPtrBegin;

  /// Size of the data.
  int64_t DataSize;

  /// The mapping type (bitfield).
  int64_t ArgType;

  /// The target pointer information.
  TargetPointerResultTy TPR;

  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
                     TargetPointerResultTy &&TPR)
      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
        TPR(std::move(TPR)) {}
};

} // namespace

/// Applies the necessary post-processing procedures to entries listed in \p
/// EntriesInfo after the execution of all device side operations from a target
/// data end. This includes the update of pointers at the host and removal of
/// device buffer when needed. It returns OFFLOAD_FAIL or OFFLOAD_SUCCESS
/// according to the success of the operations.
[[nodiscard]] static int
postProcessingTargetDataEnd(DeviceTy *Device,
                            SmallVector<PostProcessingInfo> &EntriesInfo) {
  int Ret = OFFLOAD_SUCCESS;

  for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
    bool DelEntry = !TPR.isHostPointer();

    // If this is the last element from the mapper (for end transfers the
    // arguments come in reverse order), do not remove the partial entry;
    // the parent struct still exists.
    if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // Protect parent struct from being deallocated.
    }

    // If we marked the entry to be deleted we need to verify no other
    // thread reused it by now. If deletion is still supposed to happen by
    // this thread LR will be set and exclusive access to the HDTT map
    // will avoid another thread reusing the entry now. Note that we do
    // not request (exclusive) access to the HDTT map if DelEntry is
    // not set.
    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
        Device->getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();

    // We cannot use a lock guard because we may end up deleting the mutex.
    // We also explicitly unlocked the entry after it was put in the
    // EntriesInfo so it can be reused.
    TPR.getEntry()->lock();
    auto *Entry = TPR.getEntry();

    const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
    if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
      // The thread is not in charge of deletion anymore. Give up access
      // to the HDTT map and unset the deletion flag.
      HDTTMap.destroy();
      DelEntry = false;
    }

    // If we copied back to the host a struct/array containing pointers,
    // we need to restore the original host pointer values from their
    // shadow copies. If the struct is going to be deallocated, remove any
    // remaining shadow pointer entries for this struct.
    const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom) {
      Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
        DP("Restoring original host pointer value " DPxMOD " for host "
           "pointer " DPxMOD "\n",
           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
        return OFFLOAD_SUCCESS;
      });
    }

    // Give up the lock as we either don't need it anymore (e.g., done with
    // TPR), or erase TPR.
    TPR.setEntry(nullptr);

    if (!DelEntry)
      continue;

    Ret = Device->getMappingInfo().eraseMapEntry(HDTTMap, Entry, DataSize);
    // The entry is already removed from the map, we can unlock it now.
    HDTTMap.destroy();
    Ret |= Device->getMappingInfo().deallocTgtPtrAndEntry(Entry, DataSize);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      break;
    }
  }

  delete &EntriesInfo;
  return Ret;
}

/// Internal function to undo the mapping and retrieve the data from the
/// device.
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret = OFFLOAD_SUCCESS;
  auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
  // Process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is the address of the pointee.
    TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
        HstPtrBegin, DataSize, UpdateRef, HasHoldModifier, !IsImplicit,
        ForceDelete, /*FromDataEnd=*/true);
    void *TgtPtrBegin = TPR.TargetPointer;
    if (!TPR.isPresent() && !TPR.isHostPointer() &&
        (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not
        // appear in the device data environment then an error occurs and the
        // program terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is
        // also guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TPR.isPresent())
      continue;

    // Move data back to the host.
    const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom && (HasAlways || TPR.Flags.IsLast) &&
        !TPR.Flags.IsHostPointer && DataSize != 0) {
      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
         DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
      TIMESCOPE_WITH_DETAILS_AND_IDENT(
          "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
      // Wait for any previous transfer if an event is present.
      if (void *Event = TPR.getEntry()->getEvent()) {
        if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
          REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
          return OFFLOAD_FAIL;
        }
      }

      Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo,
                                TPR.getEntry());
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data from device failed.\n");
        return OFFLOAD_FAIL;
      }

      // As we are expecting to delete the entry the d2h copy might race
      // with another one that also tries to delete the entry. This happens
      // as the entry can be reused and the reuse might happen after the
      // copy-back was issued but before it completed. Since the reuse might
      // also copy-back a value we would race.
      if (TPR.Flags.IsLast) {
        if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Add pointer to the buffer for post-synchronize processing.
    PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
                                     std::move(TPR));
    PostProcessingPtrs->back().TPR.getEntry()->unlock();
  }

  // Add post-processing functions.
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int {
    return postProcessingTargetDataEnd(Device, *PostProcessingPtrs);
  });

  return Ret;
}

static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
      HstPtrBegin, ArgSize, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TPR.isPresent()) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (TPR.Flags.IsHostPointer) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo,
                                TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }
    if (TPR.getEntry()) {
      int Ret = TPR.getEntry()->foreachShadowPointerInfo(
          [&](ShadowPtrInfoTy &ShadowPtr) {
            DP("Restoring original target pointer value " DPxMOD " for target "
               "pointer " DPxMOD "\n",
               DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
            Ret = Device.submitData(ShadowPtr.TgtPtrAddr,
                                    (void *)&ShadowPtr.TgtPtrVal,
                                    sizeof(void *), AsyncInfo);
            if (Ret != OFFLOAD_SUCCESS) {
              REPORT("Copying data to device failed.\n");
              return OFFLOAD_FAIL;
            }
            return OFFLOAD_SUCCESS;
          });
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Updating shadow map failed\n");
        return Ret;
      }
    }
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo,
                                  TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    // Wait for device-to-host memcopies for the whole struct to complete
    // before restoring the correct host pointer.
    if (auto *Entry = TPR.getEntry()) {
      AsyncInfo.addPostProcessingFunction([=]() -> int {
        int Ret = Entry->foreachShadowPointerInfo(
            [&](const ShadowPtrInfoTy &ShadowPtr) {
              *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
              DP("Restoring original host pointer value " DPxMOD
                 " for host pointer " DPxMOD "\n",
                 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
              return OFFLOAD_SUCCESS;
            });
        Entry->unlock();
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Updating shadow map failed\n");
          return Ret;
        }
        return OFFLOAD_SUCCESS;
      });
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // We only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}
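
// Illustrative example (values made up): for a slice A[0:4][1:2] of a 4x8
// array of doubles, the compiler emits two descriptors, roughly
//
//   NonContig[0] = {Offset: 0, Count: 4, Stride: 64} // rows of 8 doubles
//   NonContig[1] = {Offset: 1, Count: 2, Stride: 8}  // 2 columns of 8 bytes
//
// so each innermost piece is Count * Stride = 16 contiguous bytes. Had the
// whole row been mapped (Count: 8), Count * Stride of the inner dimension
// (64) would equal the outer stride, and getNonContigMergedDimension would
// merge the two dimensions into a single contiguous transfer.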

/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // Process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}
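
// For example, with the map-type flag values from omptarget.h
// (PTR_AND_OBJ = 0x10, LITERAL = 0x100, IMPLICIT = 0x200), a lambda capture
// is encoded as 0x310 and isLambdaMapping(0x310) is true, while a plain
// PTR_AND_OBJ mapping such as 0x13 (TO | FROM | PTR_AND_OBJ) is not a lambda
// mapping.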

namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // Get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // Iterate over all the host table entries to see if we can locate the
    // host_ptr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // We got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transferring them all at once.
  struct FirstPrivateArgInfoTy {
    /// Host pointer begin.
    char *HstPtrBegin;
    /// Host pointer end.
    char *HstPtrEnd;
    /// The index of the element in \p TgtArgs corresponding to the argument.
    int Index;
    /// Alignment of the entry (base of the entry, not after the entry).
    uint32_t Alignment;
    /// Size (without alignment, see padding).
    uint32_t Size;
    /// Padding used to align this argument entry, if necessary.
    uint32_t Padding;
    /// Host pointer name.
    map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
                          uint32_t Alignment, uint32_t Padding,
                          map_var_info_t HstPtrName = nullptr)
        : HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
          Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments.
  SmallVector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed.
  SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed.
  SmallVector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed.
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object.
  DeviceTy &Device;
  /// A pointer to a \p AsyncInfoTy object.
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer
  // it immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;
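
  // To sketch the effect of the threshold: two firstprivate scalars of 8
  // bytes each fall below it and are packed into one 16-byte staging buffer
  // that is transferred with a single submitData call in packAndTransfer,
  // whereas a 4 KiB firstprivate array exceeds it and gets its own immediate
  // allocation and transfer in addArg.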
1257
1258public:
1259 /// Constructor
1260 PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
1261 : Device(Dev), AsyncInfo(AsyncInfo) {}
1262
1263 /// Add a private argument
1264 int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
1265 bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
1266 map_var_info_t HstPtrName = nullptr,
1267 const bool AllocImmediately = false) {
1268 // If the argument is not first-private, or its size is greater than a
1269 // predefined threshold, we will allocate memory and issue the transfer
1270 // immediately.
1271 if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
1272 AllocImmediately) {
1273 TgtPtr = Device.allocData(ArgSize, HstPtr);
1274 if (!TgtPtr) {
1275 DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
1276 (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
1277 return OFFLOAD_FAIL;
1278 }
1279#ifdef OMPTARGET_DEBUG
1280 void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
1281 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
1282 " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
1283 "\n",
1284 ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
1285 DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
1286#endif
1287 // If first-private, copy data from host
1288 if (IsFirstPrivate) {
1289 DP("Submitting firstprivate data to the device.\n");
1290 int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
1291 if (Ret != OFFLOAD_SUCCESS) {
1292 DP("Copying data to device failed, failed.\n");
1293 return OFFLOAD_FAIL;
1294 }
1295 }
1296 TgtPtrs.push_back(TgtPtr);
1297 } else {
1298 DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
1299 DPxPTR(HstPtr), ArgSize);
1300 // When reach this point, the argument must meet all following
1301 // requirements:
1302 // 1. Its size does not exceed the threshold (see the comment for
1303 // FirstPrivateArgSizeThreshold);
1304 // 2. It must be first-private (needs to be mapped to target device).
1305 // We will pack all this kind of arguments to transfer them all at once
1306 // to reduce the number of data transfer. We will not take
1307 // non-first-private arguments, aka. private arguments that doesn't need
1308 // to be mapped to target device, into account because data allocation
1309 // can be very efficient with memory manager.
1310
1311 // Placeholder value
1312 TgtPtr = nullptr;
1313 auto *LastFPArgInfo =
1314 FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back();
1315
1316 // Compute the start alignment of this entry, add padding if necessary.
1317 // TODO: Consider sorting instead.
1318 uint32_t Padding = 0;
1319 uint32_t StartAlignment =
1320 LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment;
1321 if (LastFPArgInfo) {
1322 // Check if we keep the start alignment or if it is shrunk due to the
1323 // size of the last element.
1324 uint32_t Offset = LastFPArgInfo->Size % StartAlignment;
1325 if (Offset)
1326 StartAlignment = Offset;
1327 // We only need as much alignment as the host pointer had (since we
1328 // don't know the alignment information from the source we might end up
1329 // overaligning accesses but not too much).
1330 uint32_t RequiredAlignment =
1331 llvm::bit_floor(Value: getPartialStructRequiredAlignment(HstPtrBase: HstPtr));
1332 if (RequiredAlignment > StartAlignment) {
1333 Padding = RequiredAlignment - StartAlignment;
1334 StartAlignment = RequiredAlignment;
1335 }
1336 }
1337
1338 FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
1339 StartAlignment, Padding, HstPtrName);
1340 FirstPrivateArgSize += Padding + ArgSize;
1341 }
1342
1343 return OFFLOAD_SUCCESS;
1344 }
1345
1346 /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
1347 /// and start the transfer.
1348 int packAndTransfer(SmallVector<void *> &TgtArgs) {
1349 if (!FirstPrivateArgInfo.empty()) {
1350 assert(FirstPrivateArgSize != 0 &&
1351 "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
1352 FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
1353 auto *Itr = FirstPrivateArgBuffer.begin();
1354 // Copy all host data to this buffer
1355 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1356 // First pad the pointer as we (have to) pad it on the device too.
1357 Itr = std::next(Itr, Info.Padding);
1358 std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
1359 Itr = std::next(Itr, Info.Size);
1360 }
1361 // Allocate target memory
1362 void *TgtPtr =
1363 Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
1364 if (TgtPtr == nullptr) {
1365 DP("Failed to allocate target memory for private arguments.\n");
1366 return OFFLOAD_FAIL;
1367 }
1368 TgtPtrs.push_back(TgtPtr);
1369 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
1370 FirstPrivateArgSize, DPxPTR(TgtPtr));
1371 // Transfer data to target device
1372 int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
1373 FirstPrivateArgSize, AsyncInfo);
1374 if (Ret != OFFLOAD_SUCCESS) {
1375 DP("Failed to submit data of private arguments.\n");
1376 return OFFLOAD_FAIL;
1377 }
1378 // Fill in all placeholder pointers
1379 auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
1380 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1381 void *&Ptr = TgtArgs[Info.Index];
1382 assert(Ptr == nullptr && "Target pointer is already set by mistaken");
1383 // Pad the device pointer to get the right alignment.
1384 TP += Info.Padding;
1385 Ptr = reinterpret_cast<void *>(TP);
1386 TP += Info.Size;
1387 DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
1388 "\n",
1389 DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
1390 DPxPTR(Ptr));
1391 }
1392 }
1393
1394 return OFFLOAD_SUCCESS;
1395 }
1396
1397 /// Free all target memory allocated for private arguments
1398 int free() {
1399 for (void *P : TgtPtrs) {
1400 int Ret = Device.deleteData(P);
1401 if (Ret != OFFLOAD_SUCCESS) {
1402 DP("Deallocation of (first-)private arrays failed.\n");
1403 return OFFLOAD_FAIL;
1404 }
1405 }
1406
1407 TgtPtrs.clear();
1408
1409 return OFFLOAD_SUCCESS;
1410 }
1411};
1412
1413/// Process data before launching the kernel, including calling targetDataBegin
1414/// to map and transfer data to target device, transferring (first-)private
1415/// variables.
1416static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
1417 int32_t ArgNum, void **ArgBases, void **Args,
1418 int64_t *ArgSizes, int64_t *ArgTypes,
1419 map_var_info_t *ArgNames, void **ArgMappers,
1420 SmallVector<void *> &TgtArgs,
1421 SmallVector<ptrdiff_t> &TgtOffsets,
1422 PrivateArgumentManagerTy &PrivateArgumentManager,
1423 AsyncInfoTy &AsyncInfo) {
1424
1425 auto DeviceOrErr = PM->getDevice(DeviceId);
1426 if (!DeviceOrErr)
1427 FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
1428
1429 int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
1430 ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1431 if (Ret != OFFLOAD_SUCCESS) {
1432 REPORT("Call to targetDataBegin failed, abort target.\n");
1433 return OFFLOAD_FAIL;
1434 }
1435
1436 // List of (first-)private arrays allocated for this target region
1437 SmallVector<int> TgtArgsPositions(ArgNum, -1);
1438
1439 for (int32_t I = 0; I < ArgNum; ++I) {
1440 if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
1441 // This is not a target parameter, do not push it into TgtArgs.
1442 // Check for lambda mapping.
1443 if (isLambdaMapping(Mapping: ArgTypes[I])) {
1444 assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1445 "PTR_AND_OBJ must be also MEMBER_OF.");
1446 unsigned Idx = getParentIndex(Type: ArgTypes[I]);
1447 int TgtIdx = TgtArgsPositions[Idx];
1448 assert(TgtIdx != -1 && "Base address must be translated already.");
1449 // The parent lambda must be processed already and it must be the last
1450 // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        TargetPointerResultTy TPR =
            DeviceOrErr->getMappingInfo().getTgtPtrBegin(
                HstPtrVal, ArgSizes[I], /*UpdateRefCount=*/false,
                /*UseHoldRefCount=*/false);
        PointerTgtPtrBegin = TPR.TargetPointer;
        if (!TPR.isPresent()) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (TPR.Flags.IsHostPointer) {
          DP("Unified memory is active, no need to map lambda captured "
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret =
            DeviceOrErr->submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                    sizeof(void *), AsyncInfo, TPR.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    TargetPointerResultTy TPR;
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // other privates.
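      // For instance (a sketch, not an exhaustive list of cases): a private
      // struct followed by a PTR_AND_OBJ map of one of its pointer members
      // carries MEMBER_OF on the member's entry, so the struct's device copy
      // must exist before the member can be attached to it.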
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TPR = DeviceOrErr->getMappingInfo().getTgtPtrBegin(
          HstPtrBegin, ArgSizes[I],
          /*UpdateRefCount=*/false,
          /*UseHoldRefCount=*/false);
      TgtPtrBegin = TPR.TargetPointer;
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first-private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back
/// to host if needed and deallocating target memory of (first-)private
/// variables.
static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                            int32_t ArgNum, void **ArgBases, void **Args,
                            int64_t *ArgSizes, int64_t *ArgTypes,
                            map_var_info_t *ArgNames, void **ArgMappers,
                            PrivateArgumentManagerTy &PrivateArgumentManager,
                            AsyncInfoTy &AsyncInfo) {

  auto DeviceOrErr = PM->getDevice(DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());

  // Move data from device.
  int Ret = targetDataEnd(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments after synchronization.
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
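  // The manager is moved into the closure so the device allocations stay
  // alive until the queue has drained; the closure runs once
  // AsyncInfoTy::synchronize() observes an empty queue and invokes the
  // registered post-processing functions.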
  AsyncInfo.addPostProcessingFunction(
      [PrivateArgumentManager =
           std::move(PrivateArgumentManager)]() mutable -> int {
        int Ret = PrivateArgumentManager.free();
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Failed to deallocate target memory for private args\n");
          return OFFLOAD_FAIL;
        }
        return Ret;
      });

  return OFFLOAD_SUCCESS;
}
} // namespace

/// Performs the same actions as data_begin in case arg_num is non-zero and
/// initiates the run of the offloaded region on the target platform. If
/// arg_num is non-zero, after the region execution is done, it also performs
/// the same actions as data_update and data_end above. This function returns
/// 0 if it was able to transfer the execution to a target and a non-zero
/// integer otherwise.
int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
           KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  DP("loop trip count is %" PRIu64 ".\n", KernelArgs.Tripcount);

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we
  // have mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
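  //
  // A small worked example (addresses made up): if A is mapped as A[2:8] and
  // the device copy of that section starts at 0x2000, then TgtArgs[i] holds
  // the begin address 0x2000 (the device &A[2]) and TgtOffsets[i] holds
  // &A[0] - &A[2] = -2 * sizeof(A[0]), so a plugin that needs the base
  // pointer can reconstruct it as TgtArgs[i] + TgtOffsets[i].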
  SmallVector<void *> TgtArgs;
  SmallVector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int NumClangLaunchArgs = KernelArgs.NumArgs;
  int Ret = OFFLOAD_SUCCESS;
  if (NumClangLaunchArgs) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                            KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                            KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                            KernelArgs.ArgNames, KernelArgs.ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }

    // Clang might pass more values via ArgPtrs to the runtime than we pass on
    // to the kernel.
    // TODO: Next time we adjust KernelArgsTy we should introduce a new
    // NumKernelArgs field.
    KernelArgs.NumArgs = TgtArgs.size();
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  {
    assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
    TIMESCOPE_WITH_DETAILS_AND_IDENT(
        "Kernel Target",
        "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
            ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
            ";TripCount=" + std::to_string(KernelArgs.Tripcount),
        Loc);

#ifdef OMPT_SUPPORT
    assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
           "Multi dimensional launch not supported yet.");
    /// RAII to establish tool anchors before and after kernel launch
    int32_t NumTeams = KernelArgs.NumTeams[0];
    // No need to guard this with OMPT_IF_BUILT
    InterfaceRAII TargetSubmitRAII(
        RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams);
#endif

    Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
                              KernelArgs, AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (NumClangLaunchArgs) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                           KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                           KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                           KernelArgs.ArgNames, KernelArgs.ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}
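
// A minimal sketch of a caller (hypothetical host-side code; in practice the
// libomptarget entry points, e.g. the __tgt_target_kernel path, reach this
// function from Clang-generated code):
//
//   AsyncInfoTy AsyncInfo(Device);
//   if (target(Loc, Device, HostPtr, KernelArgs, AsyncInfo) != OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;
//   return AsyncInfo.synchronize(); // wait for the queue to drain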

/// Enables the record replay mechanism by pre-allocating MemorySize bytes and
/// informing the record-replayer of whether to store the output to a file.
int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
                       bool IsRecord, bool SaveOutput,
                       uint64_t &ReqPtrArgOffset) {
  return Device.RTL->initialize_record_replay(Device.DeviceID, MemorySize,
                                              VAddr, IsRecord, SaveOutput,
                                              ReqPtrArgOffset);
}
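
// For illustration only (all values made up): record a region into a 1 GiB
// pre-allocated arena and ask the replayer to also save the kernel output.
//
//   uint64_t ReqPtrArgOffset = 0;
//   target_activate_rr(Device, /*MemorySize=*/1024ULL * 1024 * 1024,
//                      /*VAddr=*/nullptr, /*IsRecord=*/true,
//                      /*SaveOutput=*/true, ReqPtrArgOffset);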

/// Executes a kernel by loading the pre-recorded device memory image and
/// launching the target kernel with the pre-recorded configuration.
int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                  void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
                  ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
                  int32_t ThreadLimit, uint64_t LoopTripCount,
                  AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  TableMap *TM = getTableMap(HostPtr);
  // Fail if the table map fails to find the target kernel pointer for the
  // provided host pointer.
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // Retrieve the target table of offloading entries.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // Retrieve the target kernel pointer, allocate and store the recorded device
  // memory data, and launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
                                  TARGET_ALLOC_DEFAULT);
  Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);

  KernelArgsTy KernelArgs = {0};
  KernelArgs.Version = OMP_KERNEL_ARG_VERSION;
  KernelArgs.NumArgs = NumArgs;
  KernelArgs.Tripcount = LoopTripCount;
  KernelArgs.NumTeams[0] = NumTeams;
  KernelArgs.ThreadLimit[0] = ThreadLimit;

  int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs,
                                AsyncInfo);

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

