1//===- PluginInterface.cpp - Target independent plugin device interface ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9//===----------------------------------------------------------------------===//
10
11#include "PluginInterface.h"
12
13#include "Shared/APITypes.h"
14#include "Shared/Debug.h"
15#include "Shared/Environment.h"
16
17#include "ErrorReporting.h"
18#include "GlobalHandler.h"
19#include "JIT.h"
20#include "Shared/Utils.h"
21#include "Utils/ELF.h"
22#include "omptarget.h"
23
24#ifdef OMPT_SUPPORT
25#include "OpenMP/OMPT/Callback.h"
26#include "omp-tools.h"
27#endif
28
29#include "llvm/Bitcode/BitcodeReader.h"
30#include "llvm/Frontend/OpenMP/OMPConstants.h"
31#include "llvm/Support/Error.h"
32#include "llvm/Support/JSON.h"
33#include "llvm/Support/MathExtras.h"
34#include "llvm/Support/MemoryBuffer.h"
35#include "llvm/Support/Signals.h"
36#include "llvm/Support/raw_ostream.h"
37
38#include <cstdint>
39#include <limits>
40
41using namespace llvm;
42using namespace omp;
43using namespace target;
44using namespace plugin;
45using namespace error;
46
47// TODO: Fix any thread safety issues for multi-threaded kernel recording.
48namespace llvm::omp::target::plugin {
49struct RecordReplayTy {
50
51 // Describes the state of the record replay mechanism.
52 enum RRStatusTy { RRDeactivated = 0, RRRecording, RRReplaying };
53
54private:
55 // Memory pointers for recording, replaying memory.
56 void *MemoryStart = nullptr;
57 void *MemoryPtr = nullptr;
58 size_t MemorySize = 0;
59 size_t TotalSize = 0;
60 GenericDeviceTy *Device = nullptr;
61 std::mutex AllocationLock;
62
63 RRStatusTy Status = RRDeactivated;
64 bool ReplaySaveOutput = false;
65 bool UsedVAMap = false;
66 uintptr_t MemoryOffset = 0;
67
68 // A list of all globals mapped to the device.
69 struct GlobalEntry {
70 const char *Name;
71 uint64_t Size;
72 void *Addr;
73 };
74 llvm::SmallVector<GlobalEntry> GlobalEntries{};
75
76 void *suggestAddress(uint64_t MaxMemoryAllocation) {
77 // Get a valid pointer address for this system
78 void *Addr =
79 Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
80 Device->free(Addr);
81 // Align Address to MaxMemoryAllocation
82 Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation);
83 return Addr;
84 }
85
86 Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
87 size_t ASize = MaxMemoryAllocation;
88
89 if (!VAddr && isRecording())
90 VAddr = suggestAddress(MaxMemoryAllocation);
91
92 DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation, VAddr);
93
94 if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
95 return Err;
96
97 if (isReplaying() && VAddr != MemoryStart) {
98 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
99 "record-Replay cannot assign the"
100 "requested recorded address (%p, %p)",
101 VAddr, MemoryStart);
102 }
103
104 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
105 "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart);
106
107 MemoryPtr = MemoryStart;
108 MemorySize = 0;
109 TotalSize = ASize;
110 UsedVAMap = true;
111 return Plugin::success();
112 }
113
114 Error preAllocateHeuristic(uint64_t MaxMemoryAllocation,
115 uint64_t RequiredMemoryAllocation, void *VAddr) {
116 const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
117 constexpr size_t STEP = 1024 * 1024 * 1024ULL;
118 MemoryStart = nullptr;
119 for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) {
120 MemoryStart =
121 Device->allocate(TotalSize, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
122 if (MemoryStart)
123 break;
124 }
125 if (!MemoryStart)
126 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
127 "allocating record/replay memory");
128
129 if (VAddr && VAddr != MemoryStart)
130 MemoryOffset = uintptr_t(VAddr) - uintptr_t(MemoryStart);
131
132 MemoryPtr = MemoryStart;
133 MemorySize = 0;
134
135 // Check if we need adjustment.
136 if (MemoryOffset > 0 &&
137 TotalSize >= RequiredMemoryAllocation + MemoryOffset) {
138 // If we are off but "before" the required address and with enough space,
139 // we just "allocate" the offset to match the required address.
140 MemoryPtr = (char *)MemoryPtr + MemoryOffset;
141 MemorySize += MemoryOffset;
142 MemoryOffset = 0;
143 assert(MemoryPtr == VAddr && "Expected offset adjustment to work");
144 } else if (MemoryOffset) {
145 // If we are off and in a situation we cannot just "waste" memory to force
146 // a match, we hope adjusting the arguments is sufficient.
147 REPORT(
148 "WARNING Failed to allocate replay memory at required location %p, "
149 "got %p, trying to offset argument pointers by %" PRIi64 "\n",
150 VAddr, MemoryStart, MemoryOffset);
151 }
152
153 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
154 "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
155 MemoryStart);
156
157 return Plugin::success();
158 }
159
160 Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
161 if (Device->supportVAManagement()) {
162 auto Err = preAllocateVAMemory(MaxMemoryAllocation: DeviceMemorySize, VAddr: ReqVAddr);
163 if (Err) {
164 REPORT("WARNING VA mapping failed, fallback to heuristic: "
165 "(Error: %s)\n",
166 toString(E: std::move(Err)).data());
167 }
168 }
169
170 uint64_t DevMemSize;
171 if (Device->getDeviceMemorySize(DevMemSize))
172 return Plugin::error(ErrorCode::UNKNOWN,
173 "cannot determine Device Memory Size");
174
175 return preAllocateHeuristic(MaxMemoryAllocation: DevMemSize, RequiredMemoryAllocation: DeviceMemorySize, VAddr: ReqVAddr);
176 }
177
178 void dumpDeviceMemory(StringRef Filename) {
179 ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
180 WritableMemoryBuffer::getNewUninitMemBuffer(Size: MemorySize);
181 if (!DeviceMemoryMB)
182 report_fatal_error(reason: "Error creating MemoryBuffer for device memory");
183
184 auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
185 MemoryStart, MemorySize, nullptr);
186 if (Err)
187 report_fatal_error(reason: "Error retrieving data for target pointer");
188
189 StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize);
190 std::error_code EC;
191 raw_fd_ostream OS(Filename, EC);
192 if (EC)
193 report_fatal_error(reason: "Error dumping memory to file " + Filename + " :" +
194 EC.message());
195 OS << DeviceMemory;
196 OS.close();
197 }
198
199public:
200 bool isRecording() const { return Status == RRStatusTy::RRRecording; }
201 bool isReplaying() const { return Status == RRStatusTy::RRReplaying; }
202 bool isRecordingOrReplaying() const {
203 return (Status != RRStatusTy::RRDeactivated);
204 }
205 void setStatus(RRStatusTy Status) { this->Status = Status; }
206 bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
207 void addEntry(const char *Name, uint64_t Size, void *Addr) {
208 GlobalEntries.emplace_back(Args: GlobalEntry{.Name: Name, .Size: Size, .Addr: Addr});
209 }
210
211 void saveImage(const char *Name, const DeviceImageTy &Image) {
212 SmallString<128> ImageName = {Name, ".image"};
213 std::error_code EC;
214 raw_fd_ostream OS(ImageName, EC);
215 if (EC)
216 report_fatal_error(reason: "Error saving image : " + StringRef(EC.message()));
217 if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) {
218 size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd,
219 TgtImageBitcode->ImageStart);
220 MemoryBufferRef MBR = MemoryBufferRef(
221 StringRef((const char *)TgtImageBitcode->ImageStart, Size), "");
222 OS << MBR.getBuffer();
223 } else {
224 OS << Image.getMemoryBuffer().getBuffer();
225 }
226 OS.close();
227 }
228
229 void dumpGlobals(StringRef Filename, DeviceImageTy &Image) {
230 int32_t Size = 0;
231
232 for (auto &OffloadEntry : GlobalEntries) {
233 if (!OffloadEntry.Size)
234 continue;
235 // Get the total size of the string and entry including the null byte.
236 Size += std::strlen(s: OffloadEntry.Name) + 1 + sizeof(uint32_t) +
237 OffloadEntry.Size;
238 }
239
240 ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
241 WritableMemoryBuffer::getNewUninitMemBuffer(Size);
242 if (!GlobalsMB)
243 report_fatal_error(reason: "Error creating MemoryBuffer for globals memory");
244
245 void *BufferPtr = GlobalsMB.get()->getBufferStart();
246 for (auto &OffloadEntry : GlobalEntries) {
247 if (!OffloadEntry.Size)
248 continue;
249
250 int32_t NameLength = std::strlen(s: OffloadEntry.Name) + 1;
251 memcpy(dest: BufferPtr, src: OffloadEntry.Name, n: NameLength);
252 BufferPtr = utils::advancePtr(BufferPtr, NameLength);
253
254 *((uint32_t *)(BufferPtr)) = OffloadEntry.Size;
255 BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t));
256
257 auto Err = Plugin::success();
258 {
259 if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.Addr,
260 OffloadEntry.Size, nullptr))
261 report_fatal_error(reason: "Error retrieving data for global");
262 }
263 if (Err)
264 report_fatal_error(reason: "Error retrieving data for global");
265 BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size);
266 }
267 assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
268 "Buffer over/under-filled.");
269 assert(Size == utils::getPtrDiff(BufferPtr,
270 GlobalsMB->get()->getBufferStart()) &&
271 "Buffer size mismatch");
272
273 StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size);
274 std::error_code EC;
275 raw_fd_ostream OS(Filename, EC);
276 OS << GlobalsMemory;
277 OS.close();
278 }
279
280 void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams,
281 int32_t NumArgs, uint64_t NumTeamsClause,
282 uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
283 json::Object JsonKernelInfo;
284 JsonKernelInfo["Name"] = Name;
285 JsonKernelInfo["NumArgs"] = NumArgs;
286 JsonKernelInfo["NumTeamsClause"] = NumTeamsClause;
287 JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause;
288 JsonKernelInfo["LoopTripCount"] = LoopTripCount;
289 JsonKernelInfo["DeviceMemorySize"] = MemorySize;
290 JsonKernelInfo["DeviceId"] = Device->getDeviceId();
291 JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;
292
293 json::Array JsonArgPtrs;
294 for (int I = 0; I < NumArgs; ++I)
295 JsonArgPtrs.push_back(E: (intptr_t)LaunchParams.Ptrs[I]);
296 JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
297
298 json::Array JsonArgOffsets;
299 for (int I = 0; I < NumArgs; ++I)
300 JsonArgOffsets.push_back(E: 0);
301 JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
302
303 SmallString<128> JsonFilename = {Name, ".json"};
304 std::error_code EC;
305 raw_fd_ostream JsonOS(JsonFilename.str(), EC);
306 if (EC)
307 report_fatal_error(reason: "Error saving kernel json file : " +
308 StringRef(EC.message()));
309 JsonOS << json::Value(std::move(JsonKernelInfo));
310 JsonOS.close();
311 }
312
313 void saveKernelInput(const char *Name, DeviceImageTy &Image) {
314 SmallString<128> GlobalsFilename = {Name, ".globals"};
315 dumpGlobals(GlobalsFilename, Image);
316
317 SmallString<128> MemoryFilename = {Name, ".memory"};
318 dumpDeviceMemory(Filename: MemoryFilename);
319 }
320
321 void saveKernelOutputInfo(const char *Name) {
322 SmallString<128> OutputFilename = {
323 Name, (isRecording() ? ".original.output" : ".replay.output")};
324 dumpDeviceMemory(Filename: OutputFilename);
325 }
326
327 void *alloc(uint64_t Size) {
328 assert(MemoryStart && "Expected memory has been pre-allocated");
329 void *Alloc = nullptr;
330 constexpr int Alignment = 16;
331 // Assumes alignment is a power of 2.
332 int64_t AlignedSize = (Size + (Alignment - 1)) & (~(Alignment - 1));
333 std::lock_guard<std::mutex> LG(AllocationLock);
334 Alloc = MemoryPtr;
335 MemoryPtr = (char *)MemoryPtr + AlignedSize;
336 MemorySize += AlignedSize;
337 DP("Memory Allocator return " DPxMOD "\n", DPxPTR(Alloc));
338 return Alloc;
339 }
340
341 Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
342 RRStatusTy Status, bool SaveOutput, uint64_t &ReqPtrArgOffset) {
343 this->Device = Device;
344 this->Status = Status;
345 this->ReplaySaveOutput = SaveOutput;
346
347 if (auto Err = preallocateDeviceMemory(DeviceMemorySize: MemSize, ReqVAddr: VAddr))
348 return Err;
349
350 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
351 "Record Replay Initialized (%p)"
352 " as starting address, %lu Memory Size"
353 " and set on status %s\n",
354 MemoryStart, TotalSize,
355 Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
356
357 // Tell the user to offset pointer arguments as the memory allocation does
358 // not match.
359 ReqPtrArgOffset = MemoryOffset;
360 return Plugin::success();
361 }
362
363 void deinit() {
364 if (UsedVAMap) {
365 if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
366 report_fatal_error(reason: "Error on releasing virtual memory space");
367 } else {
368 Device->free(MemoryStart);
369 }
370 }
371};
372} // namespace llvm::omp::target::plugin
373
374// Extract the mapping of host function pointers to device function pointers
375// from the entry table. Functions marked as 'indirect' in OpenMP will have
376// offloading entries generated for them which map the host's function pointer
377// to a global containing the corresponding function pointer on the device.
378static Expected<std::pair<void *, uint64_t>>
379setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device,
380 DeviceImageTy &Image) {
381 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
382
383 llvm::ArrayRef<llvm::offloading::EntryTy> Entries(
384 Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd);
385 llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
386 for (const auto &Entry : Entries) {
387 if (Entry.Kind != object::OffloadKind::OFK_OpenMP || Entry.Size == 0 ||
388 !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT))
389 continue;
390
391 assert(Entry.Size == sizeof(void *) && "Global not a function pointer?");
392 auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
393
394 GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size);
395 if (auto Err =
396 Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
397 return std::move(Err);
398
399 HstPtr = Entry.Address;
400 if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(),
401 Entry.Size, nullptr))
402 return std::move(Err);
403 }
404
405 // If we do not have any indirect globals we exit early.
406 if (IndirectCallTable.empty())
407 return std::pair{nullptr, 0};
408
409 // Sort the array to allow for more efficient lookup of device pointers.
410 llvm::sort(C&: IndirectCallTable,
411 Comp: [](const auto &x, const auto &y) { return x.first < y.first; });
412
413 uint64_t TableSize =
414 IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
415 void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE);
416 if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(),
417 TableSize, nullptr))
418 return std::move(Err);
419 return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
420}
421
// Wrap the caller-provided __tgt_async_info, or fall back to the internal
// LocalAsyncInfo when none is given. finalize() later detects the fallback
// (AsyncInfoPtr == &LocalAsyncInfo) to trigger synchronous completion.
AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
                                       __tgt_async_info *AsyncInfoPtr)
    : Device(Device),
      AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {}
426
427void AsyncInfoWrapperTy::finalize(Error &Err) {
428 assert(AsyncInfoPtr && "AsyncInfoWrapperTy already finalized");
429
430 // If we used a local async info object we want synchronous behavior. In that
431 // case, and assuming the current status code is correct, we will synchronize
432 // explicitly when the object is deleted. Update the error with the result of
433 // the synchronize operation.
434 if (AsyncInfoPtr == &LocalAsyncInfo && LocalAsyncInfo.Queue && !Err)
435 Err = Device.synchronize(&LocalAsyncInfo);
436
437 // Invalidate the wrapper object.
438 AsyncInfoPtr = nullptr;
439}
440
441Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
442 DeviceImageTy &Image) {
443
444 ImagePtr = &Image;
445
446 // Retrieve kernel environment object for the kernel.
447 std::string EnvironmentName = std::string(Name) + "_kernel_environment";
448 GenericGlobalHandlerTy &GHandler = GenericDevice.Plugin.getGlobalHandler();
449 if (GHandler.isSymbolInImage(GenericDevice, Image, EnvironmentName)) {
450 GlobalTy KernelEnv(EnvironmentName, sizeof(KernelEnvironment),
451 &KernelEnvironment);
452 if (auto Err =
453 GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv))
454 return Err;
455 } else {
456 KernelEnvironment = KernelEnvironmentTy{};
457 DP("Failed to read kernel environment for '%s' Using default Bare (0) "
458 "execution mode\n",
459 getName());
460 }
461
462 // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
463 MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0
464 ? std::min(KernelEnvironment.Configuration.MaxThreads,
465 int32_t(GenericDevice.getThreadLimit()))
466 : GenericDevice.getThreadLimit();
467
468 // Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref;
469 PreferredNumThreads =
470 KernelEnvironment.Configuration.MinThreads > 0
471 ? std::max(KernelEnvironment.Configuration.MinThreads,
472 int32_t(GenericDevice.getDefaultNumThreads()))
473 : GenericDevice.getDefaultNumThreads();
474
475 return initImpl(GenericDevice, Image);
476}
477
// Build and upload the device-side kernel launch environment, returning its
// device address. Non-error results besides a real pointer: nullptr when no
// environment applies (replay, or compiler too old), and the all-ones
// sentinel (~0) when the kernel's configuration requires no environment —
// presumably so callers can distinguish the two cases; confirm with callers.
Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
    GenericDeviceTy &GenericDevice, uint32_t Version,
    AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  // Ctor/Dtor have no arguments, replaying uses the original kernel launch
  // environment. Older versions of the compiler do not generate a kernel
  // launch environment.
  if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
      Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
    return nullptr;

  // A zero reduction data size or buffer length means no reduction buffer is
  // configured; signal "no environment needed" with the ~0 sentinel.
  if (!KernelEnvironment.Configuration.ReductionDataSize ||
      !KernelEnvironment.Configuration.ReductionBufferLength)
    return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);

  // TODO: Check if the kernel needs a launch environment.
  auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
                                            /*HostPtr=*/nullptr,
                                            TargetAllocTy::TARGET_ALLOC_DEVICE);
  if (!AllocOrErr)
    return AllocOrErr.takeError();

  // Remember to free the memory later.
  AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);

  /// Use the KLE in the __tgt_async_info to ensure a stable address for the
  /// async data transfer.
  auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
  LocalKLE = KernelLaunchEnvironment;
  {
    // Allocate the device-side reduction buffer and record its address in the
    // environment copy that is about to be uploaded.
    auto AllocOrErr = GenericDevice.dataAlloc(
        KernelEnvironment.Configuration.ReductionDataSize *
            KernelEnvironment.Configuration.ReductionBufferLength,
        /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
    if (!AllocOrErr)
      return AllocOrErr.takeError();
    LocalKLE.ReductionBuffer = *AllocOrErr;
    // Remember to free the memory later.
    AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
  }

  INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
       "Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
       ", Size=%" PRId64 ", Name=KernelLaunchEnv\n",
       DPxPTR(&LocalKLE), DPxPTR(*AllocOrErr),
       sizeof(KernelLaunchEnvironmentTy));

  // Asynchronously upload the populated environment to the device.
  auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE,
                                      sizeof(KernelLaunchEnvironmentTy),
                                      AsyncInfoWrapper);
  if (Err)
    return Err;
  return static_cast<KernelLaunchEnvironmentTy *>(*AllocOrErr);
}
532
// Emit an informational message describing the final launch configuration
// (blocks, threads, execution mode), then delegate to the per-plugin
// printLaunchInfoDetails() hook for any additional output.
Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice,
                                       KernelArgsTy &KernelArgs,
                                       uint32_t NumThreads[3],
                                       uint32_t NumBlocks[3]) const {
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
       "Launching kernel %s with [%u,%u,%u] blocks and [%u,%u,%u] threads in "
       "%s mode\n",
       getName(), NumBlocks[0], NumBlocks[1], NumBlocks[2], NumThreads[0],
       NumThreads[1], NumThreads[2], getExecutionModeName());
  return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads,
                                NumBlocks);
}
545
// Default implementation of the launch-details hook: prints nothing and
// reports success. Presumably overridden by specific plugins to add extra
// launch diagnostics — declared virtual in the header; confirm there.
Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                              KernelArgsTy &KernelArgs,
                                              uint32_t NumThreads[3],
                                              uint32_t NumBlocks[3]) const {
  return Plugin::success();
}
552
// Top-level kernel launch: build the launch environment and argument vector,
// resolve the final thread/block configuration, record the launch if
// record/replay is active, then hand off to the plugin-specific launchImpl().
Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
                              ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
                              AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  llvm::SmallVector<void *, 16> Args;
  llvm::SmallVector<void *, 16> Ptrs;

  // May return nullptr or the ~0 sentinel as valid non-error results; see
  // getKernelLaunchEnvironment for their meaning.
  auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
      GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
  if (!KernelLaunchEnvOrErr)
    return KernelLaunchEnvOrErr.takeError();

  KernelLaunchParamsTy LaunchParams;

  // Kernel languages don't use indirection.
  if (KernelArgs.Flags.IsCUDA) {
    LaunchParams =
        *reinterpret_cast<KernelLaunchParamsTy *>(KernelArgs.ArgPtrs);
  } else {
    // Note: prepareArgs takes NumArgs by reference and increments it when a
    // launch environment is prepended as the first argument.
    LaunchParams =
        prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
                    Args, Ptrs, *KernelLaunchEnvOrErr);
  }

  uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
                            KernelArgs.ThreadLimit[1],
                            KernelArgs.ThreadLimit[2]};
  uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
                           KernelArgs.NumTeams[2]};
  // In bare mode the clause values are used verbatim; otherwise apply the
  // device/kernel heuristics to pick the final x-dimension counts.
  if (!isBareMode()) {
    NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
    NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
                                NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
  }

  // Record the kernel description after we modified the argument count and num
  // blocks/threads.
  RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
  if (RecordReplay.isRecording()) {
    RecordReplay.saveImage(getName(), getImage());
    RecordReplay.saveKernelInput(getName(), getImage());
    RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
                                 NumBlocks[0], NumThreads[0],
                                 KernelArgs.Tripcount);
  }

  if (auto Err =
          printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
    return Err;

  return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
                    LaunchParams, AsyncInfoWrapper);
}
605
606KernelLaunchParamsTy GenericKernelTy::prepareArgs(
607 GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
608 uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
609 llvm::SmallVectorImpl<void *> &Ptrs,
610 KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const {
611 uint32_t KLEOffset = !!KernelLaunchEnvironment;
612 NumArgs += KLEOffset;
613
614 if (NumArgs == 0)
615 return KernelLaunchParamsTy{};
616
617 Args.resize(NumArgs);
618 Ptrs.resize(NumArgs);
619
620 if (KernelLaunchEnvironment) {
621 Args[0] = KernelLaunchEnvironment;
622 Ptrs[0] = &Args[0];
623 }
624
625 for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
626 Args[I] =
627 (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
628 Ptrs[I] = &Args[I];
629 }
630 return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
631}
632
633uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
634 uint32_t ThreadLimitClause[3]) const {
635 assert(!isBareMode() && "bare kernel should not call this function");
636
637 assert(ThreadLimitClause[1] == 1 && ThreadLimitClause[2] == 1 &&
638 "Multi dimensional launch not supported yet.");
639
640 if (ThreadLimitClause[0] > 0 && isGenericMode())
641 ThreadLimitClause[0] += GenericDevice.getWarpSize();
642
643 return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0)
644 ? ThreadLimitClause[0]
645 : PreferredNumThreads);
646}
647
// Resolve the final block (team) count for the launch. A positive num_teams
// clause wins (capped at the device block limit). Otherwise the count is
// derived from the loop trip count: in SPMD mode one iteration per thread
// (possibly lowering NumThreads, passed by reference, to regain outer
// parallelism on low trip counts); in generic/generic-SPMD mode one team per
// distribute iteration. The result is capped by the device defaults/limits.
uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
                                       uint32_t NumTeamsClause[3],
                                       uint64_t LoopTripCount,
                                       uint32_t &NumThreads,
                                       bool IsNumThreadsFromUser) const {
  assert(!isBareMode() && "bare kernel should not call this function");

  assert(NumTeamsClause[1] == 1 && NumTeamsClause[2] == 1 &&
         "Multi dimensional launch not supported yet.");

  if (NumTeamsClause[0] > 0) {
    // TODO: We need to honor any value and consequently allow more than the
    // block limit. For this we might need to start multiple kernels or let the
    // blocks start again until the requested number has been started.
    return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
  }

  uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
  uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
  if (LoopTripCount > 0) {
    if (isSPMDMode()) {
      // We have a combined construct, i.e. `target teams distribute
      // parallel for [simd]`. We launch so many teams so that each thread
      // will execute one iteration of the loop; rounded up to the nearest
      // integer. However, if that results in too few teams, we artificially
      // reduce the thread count per team to increase the outer parallelism.
      auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop();
      MinThreads = std::min(MinThreads, NumThreads);

      // Honor the thread_limit clause; only lower the number of threads.
      [[maybe_unused]] auto OldNumThreads = NumThreads;
      if (LoopTripCount >= DefaultNumBlocks * NumThreads ||
          IsNumThreadsFromUser) {
        // Enough parallelism for teams and threads.
        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
        assert(IsNumThreadsFromUser ||
               TripCountNumBlocks >= DefaultNumBlocks &&
                   "Expected sufficient outer parallelism.");
      } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) {
        // Enough parallelism for teams, limit threads.

        // This case is hard; for now, we force "full warps":
        // First, compute a thread count assuming DefaultNumBlocks.
        auto NumThreadsDefaultBlocks =
            (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks;
        // Now get a power of two that is larger or equal.
        auto NumThreadsDefaultBlocksP2 =
            llvm::PowerOf2Ceil(NumThreadsDefaultBlocks);
        // Do not increase a thread limit given be the user.
        NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2));
        assert(NumThreads >= MinThreads &&
               "Expected sufficient inner parallelism.");
        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
      } else {
        // Not enough parallelism for teams and threads, limit both.
        NumThreads = std::min(NumThreads, MinThreads);
        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
      }

      assert(NumThreads * TripCountNumBlocks >= LoopTripCount &&
             "Expected sufficient parallelism");
      assert(OldNumThreads >= NumThreads &&
             "Number of threads cannot be increased!");
    } else {
      assert((isGenericMode() || isGenericSPMDMode()) &&
             "Unexpected execution mode!");
      // If we reach this point, then we have a non-combined construct, i.e.
      // `teams distribute` with a nested `parallel for` and each team is
      // assigned one iteration of the `distribute` loop. E.g.:
      //
      // #pragma omp target teams distribute
      // for(...loop_tripcount...) {
      //   #pragma omp parallel for
      //   for(...) {}
      // }
      //
      // Threads within a team will execute the iterations of the `parallel`
      // loop.
      TripCountNumBlocks = LoopTripCount;
    }
  }

  uint32_t PreferredNumBlocks = TripCountNumBlocks;
  // If the loops are long running we rather reuse blocks than spawn too many.
  if (GenericDevice.getReuseBlocksForHighTripCount())
    PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
  return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
}
736
// Construct the generic device: read the OpenMP/libomptarget environment
// variables that do not require an initialized device, and (with OMPT) bind
// the OMPT device-event callbacks to this instance.
GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
                                 int32_t NumDevices,
                                 const llvm::omp::GV &OMPGridValues)
    : Plugin(Plugin), MemoryManager(nullptr), OMP_TeamLimit("OMP_TEAM_LIMIT"),
      OMP_NumTeams("OMP_NUM_TEAMS"),
      OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
      OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
      OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
      // Do not initialize the following two envars since they depend on the
      // device initialization. These cannot be consulted until the device is
      // initialized correctly. We initialize them in GenericDeviceTy::init().
      OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
      // By default, the initial number of streams and events is 1.
      OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
      OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
      DeviceId(DeviceId), GridValues(OMPGridValues),
      PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
      PinnedAllocs(*this), RPCServer(nullptr) {
#ifdef OMPT_SUPPORT
  OmptInitialized.store(false);
  // Bind the callbacks to this device's member functions
#define bindOmptCallback(Name, Type, Code)                                     \
  if (ompt::Initialized && ompt::lookupCallbackByCode) {                       \
    ompt::lookupCallbackByCode((ompt_callbacks_t)(Code),                       \
                               ((ompt_callback_t *)&(Name##_fn)));             \
    DP("OMPT: class bound %s=%p\n", #Name, ((void *)(uint64_t)Name##_fn));     \
  }

  // Expands bindOmptCallback once per OMPT device event.
  FOREACH_OMPT_DEVICE_EVENT(bindOmptCallback);
#undef bindOmptCallback

#endif
}
770
// Initialize the device: run the plugin-specific initImpl(), fire the OMPT
// device-initialize callback once, read the envars that require a live device
// (stack/heap size), clamp grid limits to the OMP_* envars, and optionally
// enable the memory manager.
Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
  if (auto Err = initImpl(Plugin))
    return Err;

#ifdef OMPT_SUPPORT
  if (ompt::Initialized) {
    // compare_exchange guards against firing device_initialize twice.
    bool ExpectedStatus = false;
    if (OmptInitialized.compare_exchange_strong(ExpectedStatus, true))
      performOmptCallback(device_initialize, Plugin.getUserId(DeviceId),
                          /*type=*/getComputeUnitKind().c_str(),
                          /*device=*/reinterpret_cast<ompt_device_t *>(this),
                          /*lookup=*/ompt::lookupCallbackByName,
                          /*documentation=*/nullptr);
  }
#endif

  // Read and reinitialize the envars that depend on the device initialization.
  // Notice these two envars may change the stack size and heap size of the
  // device, so they need the device properly initialized.
  auto StackSizeEnvarOrErr = UInt64Envar::create(
      "LIBOMPTARGET_STACK_SIZE",
      [this](uint64_t &V) -> Error { return getDeviceStackSize(V); },
      [this](uint64_t V) -> Error { return setDeviceStackSize(V); });
  if (!StackSizeEnvarOrErr)
    return StackSizeEnvarOrErr.takeError();
  OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);

  auto HeapSizeEnvarOrErr = UInt64Envar::create(
      "LIBOMPTARGET_HEAP_SIZE",
      [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
      [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
  if (!HeapSizeEnvarOrErr)
    return HeapSizeEnvarOrErr.takeError();
  OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);

  // Update the maximum number of teams and threads after the device
  // initialization sets the corresponding hardware limit.
  if (OMP_NumTeams > 0)
    GridValues.GV_Max_Teams =
        std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams));

  if (OMP_TeamsThreadLimit > 0)
    GridValues.GV_Max_WG_Size =
        std::min(GridValues.GV_Max_WG_Size, uint32_t(OMP_TeamsThreadLimit));

  // Enable the memory manager if required.
  auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
  if (EnableMM)
    MemoryManager = new MemoryManagerTy(*this, ThresholdMM);

  return Plugin::success();
}
823
// Unload a device image: run its global destructors, fold its memory-pool
// tracking statistics into the device totals (debug builds of the tracker
// only), flush any device-side PGO profiling data, then release the image via
// the plugin-specific unloadBinaryImpl().
Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
  if (auto Err = callGlobalDestructors(Plugin, *Image))
    return Err;

  if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
    GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
    // Initial min is ~0U so combine() can take the minimum correctly.
    DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
    GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
                           sizeof(DeviceMemoryPoolTrackingTy),
                           &ImageDeviceMemoryPoolTracking);
    // Best effort: images without the tracker global are simply skipped.
    if (auto Err =
            GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
      consumeError(std::move(Err));
    }
    DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
  }

  GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
  auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
  if (!ProfOrErr)
    return ProfOrErr.takeError();

  if (!ProfOrErr->empty()) {
    // Dump out profdata
    if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
        uint32_t(DeviceDebugKind::PGODump))
      ProfOrErr->dump();

    // Write data to profiling file
    if (auto Err = ProfOrErr->write())
      return Err;
  }

  return unloadBinaryImpl(Image);
}
859
860Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
861 for (auto &I : LoadedImages)
862 if (auto Err = unloadBinary(I))
863 return Err;
864 LoadedImages.clear();
865
866 if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
867 // TODO: Write this by default into a file.
868 printf("\n\n|-----------------------\n"
869 "| Device memory tracker:\n"
870 "|-----------------------\n"
871 "| #Allocations: %lu\n"
872 "| Byes allocated: %lu\n"
873 "| Minimal allocation: %lu\n"
874 "| Maximal allocation: %lu\n"
875 "|-----------------------\n\n\n",
876 DeviceMemoryPoolTracking.NumAllocations,
877 DeviceMemoryPoolTracking.AllocationTotal,
878 DeviceMemoryPoolTracking.AllocationMin,
879 DeviceMemoryPoolTracking.AllocationMax);
880 }
881
882 // Delete the memory manager before deinitializing the device. Otherwise,
883 // we may delete device allocations after the device is deinitialized.
884 if (MemoryManager)
885 delete MemoryManager;
886 MemoryManager = nullptr;
887
888 RecordReplayTy &RecordReplay = Plugin.getRecordReplay();
889 if (RecordReplay.isRecordingOrReplaying())
890 RecordReplay.deinit();
891
892 if (RPCServer)
893 if (auto Err = RPCServer->deinitDevice(*this))
894 return Err;
895
896#ifdef OMPT_SUPPORT
897 if (ompt::Initialized) {
898 bool ExpectedStatus = true;
899 if (OmptInitialized.compare_exchange_strong(ExpectedStatus, false))
900 performOmptCallback(device_finalize, Plugin.getUserId(DeviceId));
901 }
902#endif
903
904 return deinitImpl();
905}
// Load a target image onto the device: JIT-process it if needed, allocate the
// image object, set up the device environment, memory pool and RPC server for
// it, notify OMPT, and run the image's global constructors.
Expected<DeviceImageTy *>
GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
                            const __tgt_device_image *InputTgtImage) {
  assert(InputTgtImage && "Expected non-null target image");
  DP("Load data from image " DPxMOD "\n", DPxPTR(InputTgtImage->ImageStart));

  // JIT-compile the image if it is IR; otherwise this returns the input image.
  auto PostJITImageOrErr = Plugin.getJIT().process(*InputTgtImage, *this);
  if (!PostJITImageOrErr) {
    auto Err = PostJITImageOrErr.takeError();
    REPORT("Failure to jit IR image %p on device %d: %s\n", InputTgtImage,
           DeviceId, toStringWithoutConsuming(Err).data());
    return Plugin::error(ErrorCode::COMPILE_FAILURE, std::move(Err),
                         "failure to jit IR image");
  }

  // Load the binary and allocate the image object. Use the next available id
  // for the image id, which is the number of previously loaded images.
  auto ImageOrErr =
      loadBinaryImpl(PostJITImageOrErr.get(), LoadedImages.size());
  if (!ImageOrErr)
    return ImageOrErr.takeError();

  DeviceImageTy *Image = *ImageOrErr;
  assert(Image != nullptr && "Invalid image");
  // Remember the original bitcode image if the JIT produced a new binary.
  if (InputTgtImage != PostJITImageOrErr.get())
    Image->setTgtImageBitcode(InputTgtImage);

  // Add the image to list.
  LoadedImages.push_back(Image);

  // Setup the device environment if needed.
  if (auto Err = setupDeviceEnvironment(Plugin, *Image))
    return std::move(Err);

  // Setup the global device memory pool if needed. Skipped while replaying,
  // since replay manages device memory itself.
  if (!Plugin.getRecordReplay().isReplaying() &&
      shouldSetupDeviceMemoryPool()) {
    uint64_t HeapSize;
    auto SizeOrErr = getDeviceHeapSize(HeapSize);
    if (SizeOrErr) {
      // Non-fatal: continue without a global device memory pool.
      REPORT("No global device memory pool due to error: %s\n",
             toString(std::move(SizeOrErr)).data());
    } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
      return std::move(Err);
  }

  if (auto Err = setupRPCServer(Plugin, *Image))
    return std::move(Err);

#ifdef OMPT_SUPPORT
  // Notify OMPT tools that an image was loaded onto this device.
  if (ompt::Initialized) {
    size_t Bytes =
        utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
    performOmptCallback(
        device_load, Plugin.getUserId(DeviceId),
        /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr,
        /*ImgSize=*/Bytes, /*HostAddr=*/InputTgtImage->ImageStart,
        /*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0);
  }
#endif

  // Call any global constructors present on the device.
  if (auto Err = callGlobalConstructors(Plugin, *Image))
    return std::move(Err);

  // Return the pointer to the table of entries.
  return Image;
}
974
// Populate and write the `__omp_rtl_device_environment` global on the device
// so the device runtime knows its configuration (debug kind, device id,
// dynamic memory size, indirect call table, ...). A missing symbol is
// tolerated and execution continues.
Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
                                              DeviceImageTy &Image) {
  // There are some plugins that do not need this step.
  if (!shouldSetupDeviceEnvironment())
    return Plugin::success();

  // Obtain a table mapping host function pointers to device function pointers.
  auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image);
  if (!CallTablePairOrErr)
    return CallTablePairOrErr.takeError();

  DeviceEnvironmentTy DeviceEnvironment;
  DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind;
  DeviceEnvironment.NumDevices = Plugin.getNumDevices();
  // TODO: The device ID used here is not the real device ID used by OpenMP.
  DeviceEnvironment.DeviceNum = DeviceId;
  DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
  DeviceEnvironment.ClockFrequency = getClockFrequency();
  DeviceEnvironment.IndirectCallTable =
      reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
  DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
  DeviceEnvironment.HardwareParallelism = getHardwareParallelism();

  // Create the metainfo of the device environment global.
  GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
                        sizeof(DeviceEnvironmentTy), &DeviceEnvironment);

  // Write device environment values to the device.
  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) {
    // Tolerate a missing environment symbol; the image may not need it.
    DP("Missing symbol %s, continue execution anyway.\n",
       DevEnvGlobal.getName().data());
    consumeError(std::move(Err));
  }
  return Plugin::success();
}
1011
1012Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
1013 DeviceImageTy &Image,
1014 uint64_t PoolSize) {
1015 // Free the old pool, if any.
1016 if (DeviceMemoryPool.Ptr) {
1017 if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
1018 TargetAllocTy::TARGET_ALLOC_DEVICE))
1019 return Err;
1020 }
1021
1022 DeviceMemoryPool.Size = PoolSize;
1023 auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
1024 TargetAllocTy::TARGET_ALLOC_DEVICE);
1025 if (AllocOrErr) {
1026 DeviceMemoryPool.Ptr = *AllocOrErr;
1027 } else {
1028 auto Err = AllocOrErr.takeError();
1029 REPORT("Failure to allocate device memory for global memory pool: %s\n",
1030 toString(std::move(Err)).data());
1031 DeviceMemoryPool.Ptr = nullptr;
1032 DeviceMemoryPool.Size = 0;
1033 }
1034
1035 // Create the metainfo of the device environment global.
1036 GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
1037 if (!GHandler.isSymbolInImage(*this, Image,
1038 "__omp_rtl_device_memory_pool_tracker")) {
1039 DP("Skip the memory pool as there is no tracker symbol in the image.");
1040 return Error::success();
1041 }
1042
1043 GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
1044 sizeof(DeviceMemoryPoolTrackingTy),
1045 &DeviceMemoryPoolTracking);
1046 if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
1047 return Err;
1048
1049 // Create the metainfo of the device environment global.
1050 GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
1051 sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
1052
1053 // Write device environment values to the device.
1054 return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
1055}
1056
// Initialize and start the RPC server for this device if the loaded image
// actually uses RPC. On success the device remembers the server so it can be
// deinitialized later.
Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
                                      DeviceImageTy &Image) {
  // The plugin either does not need an RPC server or it is unavailable.
  if (!shouldSetupRPCServer())
    return Plugin::success();

  // Check if this device needs to run an RPC server.
  RPCServerTy &Server = Plugin.getRPCServer();
  auto UsingOrErr =
      Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image);
  if (!UsingOrErr)
    return UsingOrErr.takeError();

  // The image does not use RPC; nothing to set up.
  if (!UsingOrErr.get())
    return Plugin::success();

  if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
    return Err;

  // Launch the thread that services RPC requests from the device.
  if (auto Err = Server.startThread())
    return Err;

  RPCServer = &Server;
  DP("Running an RPC server on device %d\n", getDeviceId());
  return Plugin::success();
}
1083
// Insert a new pinned-buffer entry into the map, rejecting duplicates and
// partial overlaps with the entry that follows it in address order.
// NOTE(review): only the *next* entry is checked here; callers are expected to
// have ruled out intersections with preceding entries (e.g. via
// findIntersecting) before inserting — confirm at call sites.
Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
                                         size_t Size, bool ExternallyLocked) {
  // Insert the new entry into the map.
  auto Res = Allocs.insert({HstPtr, DevAccessiblePtr, Size, ExternallyLocked});
  if (!Res.second)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "cannot insert locked buffer entry");

  // Check whether the next entry overlaps with the inserted entry.
  auto It = std::next(Res.first);
  if (It == Allocs.end())
    return Plugin::success();

  const EntryTy *NextEntry = &(*It);
  if (intersects(NextEntry->HstPtr, NextEntry->Size, HstPtr, Size))
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "partial overlapping not allowed in locked buffers");

  return Plugin::success();
}
1104
1105Error PinnedAllocationMapTy::eraseEntry(const EntryTy &Entry) {
1106 // Erase the existing entry. Notice this requires an additional map lookup,
1107 // but this should not be a performance issue. Using iterators would make
1108 // the code more difficult to read.
1109 size_t Erased = Allocs.erase({Entry.HstPtr});
1110 if (!Erased)
1111 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1112 "cannot erase locked buffer entry");
1113 return Plugin::success();
1114}
1115
1116Error PinnedAllocationMapTy::registerEntryUse(const EntryTy &Entry,
1117 void *HstPtr, size_t Size) {
1118 if (!contains(Entry.HstPtr, Entry.Size, HstPtr, Size))
1119 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1120 "partial overlapping not allowed in locked buffers");
1121
1122 ++Entry.References;
1123 return Plugin::success();
1124}
1125
1126Expected<bool> PinnedAllocationMapTy::unregisterEntryUse(const EntryTy &Entry) {
1127 if (Entry.References == 0)
1128 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1129 "invalid number of references");
1130
1131 // Return whether this was the last user.
1132 return (--Entry.References == 0);
1133}
1134
1135Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr,
1136 void *DevAccessiblePtr,
1137 size_t Size) {
1138 assert(HstPtr && "Invalid pointer");
1139 assert(DevAccessiblePtr && "Invalid pointer");
1140 assert(Size && "Invalid size");
1141
1142 std::lock_guard<std::shared_mutex> Lock(Mutex);
1143
1144 // No pinned allocation should intersect.
1145 const EntryTy *Entry = findIntersecting(HstPtr);
1146 if (Entry)
1147 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1148 "cannot insert entry due to an existing one");
1149
1150 // Now insert the new entry.
1151 return insertEntry(HstPtr, DevAccessiblePtr, Size);
1152}
1153
1154Error PinnedAllocationMapTy::unregisterHostBuffer(void *HstPtr) {
1155 assert(HstPtr && "Invalid pointer");
1156
1157 std::lock_guard<std::shared_mutex> Lock(Mutex);
1158
1159 const EntryTy *Entry = findIntersecting(HstPtr);
1160 if (!Entry)
1161 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1162 "cannot find locked buffer");
1163
1164 // The address in the entry should be the same we are unregistering.
1165 if (Entry->HstPtr != HstPtr)
1166 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1167 "unexpected host pointer in locked buffer entry");
1168
1169 // Unregister from the entry.
1170 auto LastUseOrErr = unregisterEntryUse(*Entry);
1171 if (!LastUseOrErr)
1172 return LastUseOrErr.takeError();
1173
1174 // There should be no other references to the pinned allocation.
1175 if (!(*LastUseOrErr))
1176 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1177 "the locked buffer is still being used");
1178
1179 // Erase the entry from the map.
1180 return eraseEntry(*Entry);
1181}
1182
// Lock a host buffer and return its device-accessible pointer. If an already
// locked buffer covers the requested range, reuse it (adding a reference) and
// return the pointer adjusted by the offset into that buffer; otherwise lock
// the buffer through the plugin and record a new entry.
Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr,
                                                       size_t Size) {
  assert(HstPtr && "Invalid pointer");
  assert(Size && "Invalid size");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  const EntryTy *Entry = findIntersecting(HstPtr);

  if (Entry) {
    // An already registered intersecting buffer was found. Register a new use.
    if (auto Err = registerEntryUse(*Entry, HstPtr, Size))
      return std::move(Err);

    // Return the device accessible pointer with the correct offset.
    return utils::advancePtr(Entry->DevAccessiblePtr,
                             utils::getPtrDiff(HstPtr, Entry->HstPtr));
  }

  // No intersecting registered allocation found in the map. First, lock the
  // host buffer and retrieve the device accessible pointer.
  auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
  if (!DevAccessiblePtrOrErr)
    return DevAccessiblePtrOrErr.takeError();

  // Now insert the new entry into the map.
  if (auto Err = insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size))
    return std::move(Err);

  // Return the device accessible pointer.
  return *DevAccessiblePtrOrErr;
}
1215
// Release one use of a locked host buffer. When the last user goes away, the
// buffer is unlocked through the plugin (unless it was locked externally) and
// its entry is erased.
Error PinnedAllocationMapTy::unlockHostBuffer(void *HstPtr) {
  assert(HstPtr && "Invalid pointer");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  const EntryTy *Entry = findIntersecting(HstPtr);
  if (!Entry)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "cannot find locked buffer");

  // Unregister from the locked buffer. No need to do anything if there are
  // others using the allocation.
  auto LastUseOrErr = unregisterEntryUse(*Entry);
  if (!LastUseOrErr)
    return LastUseOrErr.takeError();

  // No need to do anything if there are others using the allocation.
  if (!(*LastUseOrErr))
    return Plugin::success();

  // This was the last user of the allocation. Unlock the original locked buffer
  // if it was locked by the plugin. Do not unlock it if it was locked by an
  // external entity. Unlock the buffer using the host pointer of the entry.
  if (!Entry->ExternallyLocked)
    if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
      return Err;

  // Erase the entry from the map.
  return eraseEntry(*Entry);
}
1246
// Called when a host buffer is mapped to the device. Reuses an existing entry
// when possible; detects buffers pinned externally through vendor APIs and
// records them; otherwise optionally locks the buffer through the plugin
// (controlled by LockMappedBuffers / IgnoreLockMappedFailures).
Error PinnedAllocationMapTy::lockMappedHostBuffer(void *HstPtr, size_t Size) {
  assert(HstPtr && "Invalid pointer");
  assert(Size && "Invalid size");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  // If previously registered, just register a new user on the entry.
  const EntryTy *Entry = findIntersecting(HstPtr);
  if (Entry)
    return registerEntryUse(*Entry, HstPtr, Size);

  size_t BaseSize;
  void *BaseHstPtr, *BaseDevAccessiblePtr;

  // Check if it was externally pinned by a vendor-specific API.
  auto IsPinnedOrErr = Device.isPinnedPtrImpl(HstPtr, BaseHstPtr,
                                              BaseDevAccessiblePtr, BaseSize);
  if (!IsPinnedOrErr)
    return IsPinnedOrErr.takeError();

  // If pinned, just insert the entry representing the whole pinned buffer.
  if (*IsPinnedOrErr)
    return insertEntry(BaseHstPtr, BaseDevAccessiblePtr, BaseSize,
                       /*Externallylocked=*/true);

  // Not externally pinned. Do nothing if locking of mapped buffers is disabled.
  if (!LockMappedBuffers)
    return Plugin::success();

  // Otherwise, lock the buffer and insert the new entry.
  auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
  if (!DevAccessiblePtrOrErr) {
    // Errors may be tolerated.
    if (!IgnoreLockMappedFailures)
      return DevAccessiblePtrOrErr.takeError();

    consumeError(DevAccessiblePtrOrErr.takeError());
    return Plugin::success();
  }

  return insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size);
}
1289
// Called when a host buffer is unmapped from the device: the dual of
// lockMappedHostBuffer. Whether a missing entry is an error depends on the
// LockMappedBuffers / IgnoreLockMappedFailures configuration.
Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
  assert(HstPtr && "Invalid pointer");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  // Check whether there is any intersecting entry.
  const EntryTy *Entry = findIntersecting(HstPtr);

  // No entry but automatic locking of mapped buffers is disabled, so
  // nothing to do.
  if (!Entry && !LockMappedBuffers)
    return Plugin::success();

  // No entry, automatic locking is enabled, but the locking may have failed, so
  // do nothing.
  if (!Entry && IgnoreLockMappedFailures)
    return Plugin::success();

  // No entry, but the automatic locking is enabled, so this is an error.
  if (!Entry)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "locked buffer not found");

  // There is entry, so unregister a user and check whether it was the last one.
  auto LastUseOrErr = unregisterEntryUse(*Entry);
  if (!LastUseOrErr)
    return LastUseOrErr.takeError();

  // If it is not the last one, there is nothing to do.
  if (!(*LastUseOrErr))
    return Plugin::success();

  // Otherwise, if it was the last and the buffer was locked by the plugin,
  // unlock it.
  if (!Entry->ExternallyLocked)
    if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
      return Err;

  // Finally erase the entry from the map.
  return eraseEntry(*Entry);
}
1331
// Block until all operations queued on the async info have completed, then
// release any device allocations that were tied to this queue's lifetime.
Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) {
  if (!AsyncInfo || !AsyncInfo->Queue)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "invalid async info queue");

  if (auto Err = synchronizeImpl(*AsyncInfo))
    return Err;

  // Free allocations whose lifetime was bound to the completion of this queue.
  for (auto *Ptr : AsyncInfo->AssociatedAllocations)
    if (auto Err = dataDelete(Ptr, TargetAllocTy::TARGET_ALLOC_DEVICE))
      return Err;
  AsyncInfo->AssociatedAllocations.clear();

  return Plugin::success();
}
1347
1348Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
1349 if (!AsyncInfo || !AsyncInfo->Queue)
1350 return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1351 "invalid async info queue");
1352
1353 return queryAsyncImpl(*AsyncInfo);
1354}
1355
// Default implementation: virtual-address mapping is an optional capability,
// so plugins that support it must override this.
Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
  return Plugin::error(ErrorCode::UNSUPPORTED,
                       "device does not support VA Management");
}
1360
// Default implementation: the counterpart of memoryVAMap; overridden only by
// plugins that support virtual-address management.
Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
  return Plugin::error(ErrorCode::UNSUPPORTED,
                       "device does not support VA Management");
}
1365
1366Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
1367 return Plugin::error(
1368 ErrorCode::UNIMPLEMENTED,
1369 "missing getDeviceMemorySize implementation (required by RR-heuristic");
1370}
1371
// Allocate `Size` bytes of the requested kind. Device-kind allocations go
// through the memory manager when it is enabled; otherwise (and for host and
// shared kinds) the raw device allocator is used. Under record-replay, all
// allocations come from the record-replay memory region instead.
Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
                                            TargetAllocTy Kind) {
  void *Alloc = nullptr;

  if (Plugin.getRecordReplay().isRecordingOrReplaying())
    return Plugin.getRecordReplay().alloc(Size);

  switch (Kind) {
  case TARGET_ALLOC_DEFAULT:
  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
  case TARGET_ALLOC_DEVICE:
    if (MemoryManager) {
      Alloc = MemoryManager->allocate(Size, HostPtr);
      if (!Alloc)
        return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
                             "failed to allocate from memory manager");
      break;
    }
    // No memory manager: fall back to the plain device allocator below.
    [[fallthrough]];
  case TARGET_ALLOC_HOST:
  case TARGET_ALLOC_SHARED:
    Alloc = allocate(Size, HostPtr, Kind);
    if (!Alloc)
      return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
                           "failed to allocate from device allocator");
  }

  // Report error if the memory manager or the device allocator did not return
  // any memory buffer.
  if (!Alloc)
    return Plugin::error(ErrorCode::UNIMPLEMENTED,
                         "invalid target data allocation kind or requested "
                         "allocator not implemented yet");

  // Register allocated buffer as pinned memory if the type is host memory.
  if (Kind == TARGET_ALLOC_HOST)
    if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
      return std::move(Err);

  // Keep track of the allocation stack if we track allocation traces.
  if (OMPX_TrackAllocationTraces) {
    std::string StackTrace;
    llvm::raw_string_ostream OS(StackTrace);
    llvm::sys::PrintStackTrace(OS);

    AllocationTraceInfoTy *ATI = new AllocationTraceInfoTy();
    ATI->AllocationTrace = std::move(StackTrace);
    ATI->DevicePtr = Alloc;
    ATI->HostPtr = HostPtr;
    ATI->Size = Size;
    ATI->Kind = Kind;

    // Chain the new trace in front of any previous trace for the same address
    // (addresses can be reused after a free).
    auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
    auto *&MapATI = (*AllocationTraceMap)[Alloc];
    ATI->LastAllocationInfo = MapATI;
    MapATI = ATI;
  }

  return Alloc;
}
1432
// Free a device pointer previously returned by dataAlloc, routing through the
// memory manager or the raw deallocator to mirror the allocation path. When
// allocation tracing is enabled, diagnose double frees, frees of unknown
// pointers, and kind mismatches before deallocating.
Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
  // Free is a noop when recording or replaying.
  if (Plugin.getRecordReplay().isRecordingOrReplaying())
    return Plugin::success();

  // Keep track of the deallocation stack if we track allocation traces.
  if (OMPX_TrackAllocationTraces) {
    AllocationTraceInfoTy *ATI = nullptr;
    {
      // Scope the accessor so the map lock is dropped before reporting.
      auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
      ATI = (*AllocationTraceMap)[TgtPtr];
    }

    std::string StackTrace;
    llvm::raw_string_ostream OS(StackTrace);
    llvm::sys::PrintStackTrace(OS);

    // NOTE(review): the code below relies on this report call not returning
    // when ATI is null (see the comment after it) — confirm ErrorReporter
    // terminates execution here.
    if (!ATI)
      ErrorReporter::reportDeallocationOfNonAllocatedPtr(TgtPtr, Kind, ATI,
                                                         StackTrace);

    // ATI is not null, thus we can lock it to inspect and modify it further.
    std::lock_guard<std::mutex> LG(ATI->Lock);
    if (!ATI->DeallocationTrace.empty())
      ErrorReporter::reportDeallocationOfDeallocatedPtr(TgtPtr, Kind, ATI,
                                                        StackTrace);

    if (ATI->Kind != Kind)
      ErrorReporter::reportDeallocationOfWrongPtrKind(TgtPtr, Kind, ATI,
                                                      StackTrace);

    // Record the deallocation site for future double-free diagnostics.
    ATI->DeallocationTrace = StackTrace;

#undef DEALLOCATION_ERROR
  }

  int Res;
  switch (Kind) {
  case TARGET_ALLOC_DEFAULT:
  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
  case TARGET_ALLOC_DEVICE:
    if (MemoryManager) {
      Res = MemoryManager->free(TgtPtr);
      if (Res)
        return Plugin::error(
            ErrorCode::OUT_OF_RESOURCES,
            "failure to deallocate device pointer %p via memory manager",
            TgtPtr);
      break;
    }
    // No memory manager: free through the plain device deallocator below.
    [[fallthrough]];
  case TARGET_ALLOC_HOST:
  case TARGET_ALLOC_SHARED:
    Res = free(TgtPtr, Kind);
    if (Res)
      return Plugin::error(
          ErrorCode::UNKNOWN,
          "failure to deallocate device pointer %p via device deallocator",
          TgtPtr);
  }

  // Unregister deallocated pinned memory buffer if the type is host memory.
  if (Kind == TARGET_ALLOC_HOST)
    if (auto Err = PinnedAllocs.unregisterHostBuffer(TgtPtr))
      return Err;

  return Plugin::success();
}
1501
1502Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
1503 int64_t Size, __tgt_async_info *AsyncInfo) {
1504 AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1505
1506 auto Err = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
1507 AsyncInfoWrapper.finalize(Err);
1508 return Err;
1509}
1510
1511Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
1512 int64_t Size, __tgt_async_info *AsyncInfo) {
1513 AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1514
1515 auto Err = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
1516 AsyncInfoWrapper.finalize(Err);
1517 return Err;
1518}
1519
1520Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
1521 void *DstPtr, int64_t Size,
1522 __tgt_async_info *AsyncInfo) {
1523 AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1524
1525 auto Err = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper);
1526 AsyncInfoWrapper.finalize(Err);
1527 return Err;
1528}
1529
// Launch a kernel on this device. Records the launch (with an optional stack
// trace) for diagnostics, runs the kernel through the generic kernel object,
// and saves output when record-replay requests it.
Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
                                    ptrdiff_t *ArgOffsets,
                                    KernelArgsTy &KernelArgs,
                                    __tgt_async_info *AsyncInfo) {
  // Under record-replay, force synchronous execution by dropping the queue.
  AsyncInfoWrapperTy AsyncInfoWrapper(
      *this,
      Plugin.getRecordReplay().isRecordingOrReplaying() ? nullptr : AsyncInfo);

  GenericKernelTy &GenericKernel =
      *reinterpret_cast<GenericKernelTy *>(EntryPtr);

  {
    // Record this launch (and its stack trace, when tracking is enabled) for
    // later error reporting.
    std::string StackTrace;
    if (OMPX_TrackNumKernelLaunches) {
      llvm::raw_string_ostream OS(StackTrace);
      llvm::sys::PrintStackTrace(OS);
    }

    auto KernelTraceInfoRecord = KernelLaunchTraces.getExclusiveAccessor();
    (*KernelTraceInfoRecord)
        .emplace(&GenericKernel, std::move(StackTrace), AsyncInfo);
  }

  auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
                                  AsyncInfoWrapper);

  // 'finalize' here to guarantee next record-replay actions are in-sync
  AsyncInfoWrapper.finalize(Err);

  RecordReplayTy &RecordReplay = Plugin.getRecordReplay();
  if (RecordReplay.isRecordingOrReplaying() &&
      RecordReplay.isSaveOutputEnabled())
    RecordReplay.saveKernelOutputInfo(GenericKernel.getName());

  return Err;
}
1566
// Allocate a fresh async info object for the caller and let the plugin
// initialize its underlying queue/resources.
Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
  assert(AsyncInfoPtr && "Invalid async info");

  // Ownership of the new object is transferred to the caller.
  *AsyncInfoPtr = new __tgt_async_info();

  AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);

  auto Err = initAsyncInfoImpl(AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}
1578
// Fill in the device info structure; simply forwards to the plugin-specific
// implementation after validating the argument.
Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
  assert(DeviceInfo && "Invalid device info");

  return initDeviceInfoImpl(DeviceInfo);
}
1584
1585Error GenericDeviceTy::printInfo() {
1586 auto Info = obtainInfoImpl();
1587
1588 // Get the vendor-specific info entries describing the device properties.
1589 if (auto Err = Info.takeError())
1590 return Err;
1591
1592 // Print all info entries.
1593 Info->print();
1594
1595 return Plugin::success();
1596}
1597
// Create a device event; forwards to the plugin-specific implementation.
Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
  return createEventImpl(EventPtrStorage);
}
1601
// Destroy a device event; forwards to the plugin-specific implementation.
Error GenericDeviceTy::destroyEvent(void *EventPtr) {
  return destroyEventImpl(EventPtr);
}
1605
1606Error GenericDeviceTy::recordEvent(void *EventPtr,
1607 __tgt_async_info *AsyncInfo) {
1608 AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1609
1610 auto Err = recordEventImpl(EventPtr, AsyncInfoWrapper);
1611 AsyncInfoWrapper.finalize(Err);
1612 return Err;
1613}
1614
1615Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
1616 AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1617
1618 auto Err = waitEventImpl(EventPtr, AsyncInfoWrapper);
1619 AsyncInfoWrapper.finalize(Err);
1620 return Err;
1621}
1622
// Block the host until the event completes; forwards to the plugin-specific
// implementation.
Error GenericDeviceTy::syncEvent(void *EventPtr) {
  return syncEventImpl(EventPtr);
}
1626
// Whether this device should use automatic zero-copy; plugin-specific.
bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
1628
1629Error GenericPluginTy::init() {
1630 if (Initialized)
1631 return Plugin::success();
1632
1633 auto NumDevicesOrErr = initImpl();
1634 if (!NumDevicesOrErr)
1635 return NumDevicesOrErr.takeError();
1636 Initialized = true;
1637
1638 NumDevices = *NumDevicesOrErr;
1639 if (NumDevices == 0)
1640 return Plugin::success();
1641
1642 assert(Devices.size() == 0 && "Plugin already initialized");
1643 Devices.resize(NumDevices, nullptr);
1644
1645 GlobalHandler = createGlobalHandler();
1646 assert(GlobalHandler && "Invalid global handler");
1647
1648 RPCServer = new RPCServerTy(*this);
1649 assert(RPCServer && "Invalid RPC server");
1650
1651 RecordReplay = new RecordReplayTy();
1652 assert(RecordReplay && "Invalid RR interface");
1653
1654 return Plugin::success();
1655}
1656
// Deinitialize the plugin: tear down all active devices first, then the
// shared facilities (global handler, RPC server, record-replay), and finally
// run the plugin-specific deinitialization.
Error GenericPluginTy::deinit() {
  assert(Initialized && "Plugin was not initialized!");

  // Deinitialize all active devices.
  for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
    if (Devices[DeviceId]) {
      if (auto Err = deinitDevice(DeviceId))
        return Err;
    }
    assert(!Devices[DeviceId] && "Device was not deinitialized");
  }

  // There is no global handler if no device is available.
  if (GlobalHandler)
    delete GlobalHandler;

  // Shut down the RPC server thread before destroying the server object.
  if (RPCServer) {
    if (Error Err = RPCServer->shutDown())
      return Err;
    delete RPCServer;
  }

  if (RecordReplay)
    delete RecordReplay;

  // Perform last deinitializations on the plugin.
  if (Error Err = deinitImpl())
    return Err;
  Initialized = false;

  return Plugin::success();
}
1689
// Create and initialize the device object for `DeviceId`. The slot must not
// already hold a device.
Error GenericPluginTy::initDevice(int32_t DeviceId) {
  assert(!Devices[DeviceId] && "Device already initialized");

  // Create the device and save the reference.
  GenericDeviceTy *Device = createDevice(*this, DeviceId, NumDevices);
  assert(Device && "Invalid device");

  // Save the device reference into the list.
  Devices[DeviceId] = Device;

  // Initialize the device and its resources.
  return Device->init(*this);
}
1703
1704Error GenericPluginTy::deinitDevice(int32_t DeviceId) {
1705 // The device may be already deinitialized.
1706 if (Devices[DeviceId] == nullptr)
1707 return Plugin::success();
1708
1709 // Deinitialize the device and release its resources.
1710 if (auto Err = Devices[DeviceId]->deinit(*this))
1711 return Err;
1712
1713 // Delete the device and invalidate its reference.
1714 delete Devices[DeviceId];
1715 Devices[DeviceId] = nullptr;
1716
1717 return Plugin::success();
1718}
1719
1720Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const {
1721 // First check if this image is a regular ELF file.
1722 if (!utils::elf::isELF(Image))
1723 return false;
1724
1725 // Check if this image is an ELF with a matching machine value.
1726 auto MachineOrErr = utils::elf::checkMachine(Image, getMagicElfBits());
1727 if (!MachineOrErr)
1728 return MachineOrErr.takeError();
1729
1730 return MachineOrErr;
1731}
1732
// Check whether the given image is LLVM bitcode whose target triple matches
// this plugin's architecture. A non-bitcode image is not a match but also not
// an error.
Expected<bool> GenericPluginTy::checkBitcodeImage(StringRef Image) const {
  if (identify_magic(Image) != file_magic::bitcode)
    return false;

  // Lazily parse the module; only the triple is needed, not function bodies.
  LLVMContext Context;
  auto ModuleOrErr = getLazyBitcodeModule(MemoryBufferRef(Image, ""), Context,
                                          /*ShouldLazyLoadMetadata=*/true);
  if (!ModuleOrErr)
    return ModuleOrErr.takeError();
  Module &M = **ModuleOrErr;

  return M.getTargetTriple().getArch() == getTripleArch();
}
1746
// C-interface query: whether the plugin has been initialized.
int32_t GenericPluginTy::is_initialized() const { return Initialized; }
1748
// C-interface query: whether this plugin can, in principle, handle the given
// image (ELF with a matching machine, or bitcode with a matching triple).
// Errors during the check are reported in debug output and treated as "not
// compatible".
int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) {
  StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
                   utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));

  auto HandleError = [&](Error Err) -> bool {
    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
    DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
    return false;
  };
  switch (identify_magic(Buffer)) {
  case file_magic::elf:
  case file_magic::elf_relocatable:
  case file_magic::elf_executable:
  case file_magic::elf_shared_object:
  case file_magic::elf_core: {
    auto MatchOrErr = checkELFImage(Buffer);
    if (Error Err = MatchOrErr.takeError())
      return HandleError(std::move(Err));
    return *MatchOrErr;
  }
  case file_magic::bitcode: {
    auto MatchOrErr = checkBitcodeImage(Buffer);
    if (Error Err = MatchOrErr.takeError())
      return HandleError(std::move(Err));
    return *MatchOrErr;
  }
  default:
    // Unknown image formats are never compatible.
    return false;
  }
}
1779
1780int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId,
1781 __tgt_device_image *Image) {
1782 StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
1783 utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));
1784
1785 auto HandleError = [&](Error Err) -> bool {
1786 [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
1787 DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
1788 return false;
1789 };
1790 switch (identify_magic(Buffer)) {
1791 case file_magic::elf:
1792 case file_magic::elf_relocatable:
1793 case file_magic::elf_executable:
1794 case file_magic::elf_shared_object:
1795 case file_magic::elf_core: {
1796 auto MatchOrErr = checkELFImage(Buffer);
1797 if (Error Err = MatchOrErr.takeError())
1798 return HandleError(std::move(Err));
1799 if (!*MatchOrErr)
1800 return false;
1801
1802 // Perform plugin-dependent checks for the specific architecture if needed.
1803 auto CompatibleOrErr = isELFCompatible(DeviceId, Buffer);
1804 if (Error Err = CompatibleOrErr.takeError())
1805 return HandleError(std::move(Err));
1806 return *CompatibleOrErr;
1807 }
1808 case file_magic::bitcode: {
1809 auto MatchOrErr = checkBitcodeImage(Buffer);
1810 if (Error Err = MatchOrErr.takeError())
1811 return HandleError(std::move(Err));
1812 return *MatchOrErr;
1813 }
1814 default:
1815 return false;
1816 }
1817}
1818
1819int32_t GenericPluginTy::is_device_initialized(int32_t DeviceId) const {
1820 return isValidDeviceId(DeviceId) && Devices[DeviceId] != nullptr;
1821}
1822
1823int32_t GenericPluginTy::init_device(int32_t DeviceId) {
1824 auto Err = initDevice(DeviceId);
1825 if (Err) {
1826 REPORT("Failure to initialize device %d: %s\n", DeviceId,
1827 toString(std::move(Err)).data());
1828 return OFFLOAD_FAIL;
1829 }
1830
1831 return OFFLOAD_SUCCESS;
1832}
1833
1834int32_t GenericPluginTy::number_of_devices() { return getNumDevices(); }
1835
1836int32_t GenericPluginTy::is_data_exchangable(int32_t SrcDeviceId,
1837 int32_t DstDeviceId) {
1838 return isDataExchangable(SrcDeviceId, DstDeviceId);
1839}
1840
1841int32_t GenericPluginTy::initialize_record_replay(int32_t DeviceId,
1842 int64_t MemorySize,
1843 void *VAddr, bool isRecord,
1844 bool SaveOutput,
1845 uint64_t &ReqPtrArgOffset) {
1846 GenericDeviceTy &Device = getDevice(DeviceId);
1847 RecordReplayTy::RRStatusTy Status =
1848 isRecord ? RecordReplayTy::RRStatusTy::RRRecording
1849 : RecordReplayTy::RRStatusTy::RRReplaying;
1850
1851 if (auto Err = RecordReplay->init(&Device, MemorySize, VAddr, Status,
1852 SaveOutput, ReqPtrArgOffset)) {
1853 REPORT("WARNING RR did not initialize RR-properly with %lu bytes"
1854 "(Error: %s)\n",
1855 MemorySize, toString(std::move(Err)).data());
1856 RecordReplay->setStatus(RecordReplayTy::RRStatusTy::RRDeactivated);
1857
1858 if (!isRecord) {
1859 return OFFLOAD_FAIL;
1860 }
1861 }
1862 return OFFLOAD_SUCCESS;
1863}
1864
1865int32_t GenericPluginTy::load_binary(int32_t DeviceId,
1866 __tgt_device_image *TgtImage,
1867 __tgt_device_binary *Binary) {
1868 GenericDeviceTy &Device = getDevice(DeviceId);
1869
1870 auto ImageOrErr = Device.loadBinary(*this, TgtImage);
1871 if (!ImageOrErr) {
1872 auto Err = ImageOrErr.takeError();
1873 REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
1874 DeviceId, toString(std::move(Err)).data());
1875 return OFFLOAD_FAIL;
1876 }
1877
1878 DeviceImageTy *Image = *ImageOrErr;
1879 assert(Image != nullptr && "Invalid Image");
1880
1881 *Binary = __tgt_device_binary{reinterpret_cast<uint64_t>(Image)};
1882
1883 return OFFLOAD_SUCCESS;
1884}
1885
1886void *GenericPluginTy::data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
1887 int32_t Kind) {
1888 auto AllocOrErr =
1889 getDevice(DeviceId).dataAlloc(Size, HostPtr, (TargetAllocTy)Kind);
1890 if (!AllocOrErr) {
1891 auto Err = AllocOrErr.takeError();
1892 REPORT("Failure to allocate device memory: %s\n",
1893 toString(std::move(Err)).data());
1894 return nullptr;
1895 }
1896 assert(*AllocOrErr && "Null pointer upon successful allocation");
1897
1898 return *AllocOrErr;
1899}
1900
1901int32_t GenericPluginTy::data_delete(int32_t DeviceId, void *TgtPtr,
1902 int32_t Kind) {
1903 auto Err =
1904 getDevice(DeviceId).dataDelete(TgtPtr, static_cast<TargetAllocTy>(Kind));
1905 if (Err) {
1906 REPORT("Failure to deallocate device pointer %p: %s\n", TgtPtr,
1907 toString(std::move(Err)).data());
1908 return OFFLOAD_FAIL;
1909 }
1910
1911 return OFFLOAD_SUCCESS;
1912}
1913
1914int32_t GenericPluginTy::data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
1915 void **LockedPtr) {
1916 auto LockedPtrOrErr = getDevice(DeviceId).dataLock(Ptr, Size);
1917 if (!LockedPtrOrErr) {
1918 auto Err = LockedPtrOrErr.takeError();
1919 REPORT("Failure to lock memory %p: %s\n", Ptr,
1920 toString(std::move(Err)).data());
1921 return OFFLOAD_FAIL;
1922 }
1923
1924 if (!(*LockedPtrOrErr)) {
1925 REPORT("Failure to lock memory %p: obtained a null locked pointer\n", Ptr);
1926 return OFFLOAD_FAIL;
1927 }
1928 *LockedPtr = *LockedPtrOrErr;
1929
1930 return OFFLOAD_SUCCESS;
1931}
1932
1933int32_t GenericPluginTy::data_unlock(int32_t DeviceId, void *Ptr) {
1934 auto Err = getDevice(DeviceId).dataUnlock(Ptr);
1935 if (Err) {
1936 REPORT("Failure to unlock memory %p: %s\n", Ptr,
1937 toString(std::move(Err)).data());
1938 return OFFLOAD_FAIL;
1939 }
1940
1941 return OFFLOAD_SUCCESS;
1942}
1943
1944int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr,
1945 int64_t Size) {
1946 auto Err = getDevice(DeviceId).notifyDataMapped(HstPtr, Size);
1947 if (Err) {
1948 REPORT("Failure to notify data mapped %p: %s\n", HstPtr,
1949 toString(std::move(Err)).data());
1950 return OFFLOAD_FAIL;
1951 }
1952
1953 return OFFLOAD_SUCCESS;
1954}
1955
1956int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr) {
1957 auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr);
1958 if (Err) {
1959 REPORT("Failure to notify data unmapped %p: %s\n", HstPtr,
1960 toString(std::move(Err)).data());
1961 return OFFLOAD_FAIL;
1962 }
1963
1964 return OFFLOAD_SUCCESS;
1965}
1966
1967int32_t GenericPluginTy::data_submit(int32_t DeviceId, void *TgtPtr,
1968 void *HstPtr, int64_t Size) {
1969 return data_submit_async(DeviceId, TgtPtr, HstPtr, Size,
1970 /*AsyncInfoPtr=*/nullptr);
1971}
1972
1973int32_t GenericPluginTy::data_submit_async(int32_t DeviceId, void *TgtPtr,
1974 void *HstPtr, int64_t Size,
1975 __tgt_async_info *AsyncInfoPtr) {
1976 auto Err = getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size, AsyncInfoPtr);
1977 if (Err) {
1978 REPORT("Failure to copy data from host to device. Pointers: host "
1979 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
1980 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
1981 toString(std::move(Err)).data());
1982 return OFFLOAD_FAIL;
1983 }
1984
1985 return OFFLOAD_SUCCESS;
1986}
1987
1988int32_t GenericPluginTy::data_retrieve(int32_t DeviceId, void *HstPtr,
1989 void *TgtPtr, int64_t Size) {
1990 return data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size,
1991 /*AsyncInfoPtr=*/nullptr);
1992}
1993
1994int32_t GenericPluginTy::data_retrieve_async(int32_t DeviceId, void *HstPtr,
1995 void *TgtPtr, int64_t Size,
1996 __tgt_async_info *AsyncInfoPtr) {
1997 auto Err =
1998 getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr, Size, AsyncInfoPtr);
1999 if (Err) {
2000 REPORT("Failure to copy data from device to host. Pointers: host "
2001 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
2002 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
2003 toString(std::move(Err)).data());
2004 return OFFLOAD_FAIL;
2005 }
2006
2007 return OFFLOAD_SUCCESS;
2008}
2009
2010int32_t GenericPluginTy::data_exchange(int32_t SrcDeviceId, void *SrcPtr,
2011 int32_t DstDeviceId, void *DstPtr,
2012 int64_t Size) {
2013 return data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr, Size,
2014 /*AsyncInfoPtr=*/nullptr);
2015}
2016
2017int32_t GenericPluginTy::data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
2018 int DstDeviceId, void *DstPtr,
2019 int64_t Size,
2020 __tgt_async_info *AsyncInfo) {
2021 GenericDeviceTy &SrcDevice = getDevice(SrcDeviceId);
2022 GenericDeviceTy &DstDevice = getDevice(DstDeviceId);
2023 auto Err = SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo);
2024 if (Err) {
2025 REPORT("Failure to copy data from device (%d) to device (%d). Pointers: "
2026 "host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
2027 SrcDeviceId, DstDeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size,
2028 toString(std::move(Err)).data());
2029 return OFFLOAD_FAIL;
2030 }
2031
2032 return OFFLOAD_SUCCESS;
2033}
2034
2035int32_t GenericPluginTy::launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
2036 void **TgtArgs, ptrdiff_t *TgtOffsets,
2037 KernelArgsTy *KernelArgs,
2038 __tgt_async_info *AsyncInfoPtr) {
2039 auto Err = getDevice(DeviceId).launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets,
2040 *KernelArgs, AsyncInfoPtr);
2041 if (Err) {
2042 REPORT("Failure to run target region " DPxMOD " in device %d: %s\n",
2043 DPxPTR(TgtEntryPtr), DeviceId, toString(std::move(Err)).data());
2044 return OFFLOAD_FAIL;
2045 }
2046
2047 return OFFLOAD_SUCCESS;
2048}
2049
2050int32_t GenericPluginTy::synchronize(int32_t DeviceId,
2051 __tgt_async_info *AsyncInfoPtr) {
2052 auto Err = getDevice(DeviceId).synchronize(AsyncInfoPtr);
2053 if (Err) {
2054 REPORT("Failure to synchronize stream %p: %s\n", AsyncInfoPtr->Queue,
2055 toString(std::move(Err)).data());
2056 return OFFLOAD_FAIL;
2057 }
2058
2059 return OFFLOAD_SUCCESS;
2060}
2061
2062int32_t GenericPluginTy::query_async(int32_t DeviceId,
2063 __tgt_async_info *AsyncInfoPtr) {
2064 auto Err = getDevice(DeviceId).queryAsync(AsyncInfoPtr);
2065 if (Err) {
2066 REPORT("Failure to query stream %p: %s\n", AsyncInfoPtr->Queue,
2067 toString(std::move(Err)).data());
2068 return OFFLOAD_FAIL;
2069 }
2070
2071 return OFFLOAD_SUCCESS;
2072}
2073
2074void GenericPluginTy::print_device_info(int32_t DeviceId) {
2075 if (auto Err = getDevice(DeviceId).printInfo())
2076 REPORT("Failure to print device %d info: %s\n", DeviceId,
2077 toString(std::move(Err)).data());
2078}
2079
2080int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) {
2081 auto Err = getDevice(DeviceId).createEvent(EventPtr);
2082 if (Err) {
2083 REPORT("Failure to create event: %s\n", toString(std::move(Err)).data());
2084 return OFFLOAD_FAIL;
2085 }
2086
2087 return OFFLOAD_SUCCESS;
2088}
2089
2090int32_t GenericPluginTy::record_event(int32_t DeviceId, void *EventPtr,
2091 __tgt_async_info *AsyncInfoPtr) {
2092 auto Err = getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr);
2093 if (Err) {
2094 REPORT("Failure to record event %p: %s\n", EventPtr,
2095 toString(std::move(Err)).data());
2096 return OFFLOAD_FAIL;
2097 }
2098
2099 return OFFLOAD_SUCCESS;
2100}
2101
2102int32_t GenericPluginTy::wait_event(int32_t DeviceId, void *EventPtr,
2103 __tgt_async_info *AsyncInfoPtr) {
2104 auto Err = getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr);
2105 if (Err) {
2106 REPORT("Failure to wait event %p: %s\n", EventPtr,
2107 toString(std::move(Err)).data());
2108 return OFFLOAD_FAIL;
2109 }
2110
2111 return OFFLOAD_SUCCESS;
2112}
2113
2114int32_t GenericPluginTy::sync_event(int32_t DeviceId, void *EventPtr) {
2115 auto Err = getDevice(DeviceId).syncEvent(EventPtr);
2116 if (Err) {
2117 REPORT("Failure to synchronize event %p: %s\n", EventPtr,
2118 toString(std::move(Err)).data());
2119 return OFFLOAD_FAIL;
2120 }
2121
2122 return OFFLOAD_SUCCESS;
2123}
2124
2125int32_t GenericPluginTy::destroy_event(int32_t DeviceId, void *EventPtr) {
2126 auto Err = getDevice(DeviceId).destroyEvent(EventPtr);
2127 if (Err) {
2128 REPORT("Failure to destroy event %p: %s\n", EventPtr,
2129 toString(std::move(Err)).data());
2130 return OFFLOAD_FAIL;
2131 }
2132
2133 return OFFLOAD_SUCCESS;
2134}
2135
2136void GenericPluginTy::set_info_flag(uint32_t NewInfoLevel) {
2137 std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
2138 InfoLevel.store(NewInfoLevel);
2139}
2140
2141int32_t GenericPluginTy::init_async_info(int32_t DeviceId,
2142 __tgt_async_info **AsyncInfoPtr) {
2143 assert(AsyncInfoPtr && "Invalid async info");
2144
2145 auto Err = getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr);
2146 if (Err) {
2147 REPORT("Failure to initialize async info at " DPxMOD " on device %d: %s\n",
2148 DPxPTR(*AsyncInfoPtr), DeviceId, toString(std::move(Err)).data());
2149 return OFFLOAD_FAIL;
2150 }
2151
2152 return OFFLOAD_SUCCESS;
2153}
2154
2155int32_t GenericPluginTy::init_device_info(int32_t DeviceId,
2156 __tgt_device_info *DeviceInfo,
2157 const char **ErrStr) {
2158 *ErrStr = "";
2159
2160 auto Err = getDevice(DeviceId).initDeviceInfo(DeviceInfo);
2161 if (Err) {
2162 REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n",
2163 DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data());
2164 return OFFLOAD_FAIL;
2165 }
2166
2167 return OFFLOAD_SUCCESS;
2168}
2169
2170int32_t GenericPluginTy::set_device_identifier(int32_t UserId,
2171 int32_t DeviceId) {
2172 UserDeviceIds[DeviceId] = UserId;
2173
2174 return OFFLOAD_SUCCESS;
2175}
2176
2177int32_t GenericPluginTy::use_auto_zero_copy(int32_t DeviceId) {
2178 return getDevice(DeviceId).useAutoZeroCopy();
2179}
2180
2181int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size,
2182 const char *Name, void **DevicePtr) {
2183 assert(Binary.handle && "Invalid device binary handle");
2184 DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
2185
2186 GenericDeviceTy &Device = Image.getDevice();
2187
2188 GlobalTy DeviceGlobal(Name, Size);
2189 GenericGlobalHandlerTy &GHandler = getGlobalHandler();
2190 if (auto Err =
2191 GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
2192 REPORT("Failure to look up global address: %s\n",
2193 toString(std::move(Err)).data());
2194 return OFFLOAD_FAIL;
2195 }
2196
2197 *DevicePtr = DeviceGlobal.getPtr();
2198 assert(DevicePtr && "Invalid device global's address");
2199
2200 // Save the loaded globals if we are recording.
2201 RecordReplayTy &RecordReplay = Device.Plugin.getRecordReplay();
2202 if (RecordReplay.isRecording())
2203 RecordReplay.addEntry(Name, Size, *DevicePtr);
2204
2205 return OFFLOAD_SUCCESS;
2206}
2207
2208int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
2209 const char *Name, void **KernelPtr) {
2210 assert(Binary.handle && "Invalid device binary handle");
2211 DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
2212
2213 GenericDeviceTy &Device = Image.getDevice();
2214
2215 auto KernelOrErr = Device.constructKernel(Name);
2216 if (Error Err = KernelOrErr.takeError()) {
2217 REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
2218 return OFFLOAD_FAIL;
2219 }
2220
2221 GenericKernelTy &Kernel = *KernelOrErr;
2222 if (auto Err = Kernel.init(Device, Image)) {
2223 REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
2224 return OFFLOAD_FAIL;
2225 }
2226
2227 // Note that this is not the kernel's device address.
2228 *KernelPtr = &Kernel;
2229 return OFFLOAD_SUCCESS;
2230}
2231

// Source file: offload/plugins-nextgen/common/src/PluginInterface.cpp