//===- PluginInterface.cpp - Target independent plugin device interface ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PluginInterface.h"

#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"

#include "ErrorReporting.h"
#include "GlobalHandler.h"
#include "JIT.h"
#include "Shared/Utils.h"
#include "Utils/ELF.h"
#include "omptarget.h"

#ifdef OMPT_SUPPORT
#include "OpenMP/OMPT/Callback.h"
#include "omp-tools.h"
#endif

#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"

#include <cstdint>
#include <limits>

using namespace llvm;
using namespace omp;
using namespace target;
using namespace plugin;
using namespace error;

// TODO: Fix any thread safety issues for multi-threaded kernel recording.
namespace llvm::omp::target::plugin {
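// Implements the record-and-replay machinery: it pre-allocates one large
// device memory region, bump-allocates every device allocation from it (so
// device addresses are deterministic), and can dump the kernel inputs (image,
// globals, device memory) while recording, or restore them at the recorded
// virtual addresses while replaying.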
struct RecordReplayTy {

  // Describes the state of the record replay mechanism.
  enum RRStatusTy { RRDeactivated = 0, RRRecording, RRReplaying };

private:
  // Memory pointers for recording, replaying memory.
  void *MemoryStart = nullptr;
  void *MemoryPtr = nullptr;
  size_t MemorySize = 0;
  size_t TotalSize = 0;
  GenericDeviceTy *Device = nullptr;
  std::mutex AllocationLock;

  RRStatusTy Status = RRDeactivated;
  bool ReplaySaveOutput = false;
  bool UsedVAMap = false;
  uintptr_t MemoryOffset = 0;

  // A list of all globals mapped to the device.
  struct GlobalEntry {
    const char *Name;
    uint64_t Size;
    void *Addr;
  };
  llvm::SmallVector<GlobalEntry> GlobalEntries{};

  void *suggestAddress(uint64_t MaxMemoryAllocation) {
    // Get a valid pointer address for this system.
    void *Addr =
        Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
    Device->free(Addr);
    // Align the address to MaxMemoryAllocation.
    Addr = (void *)utils::alignPtr(Addr, MaxMemoryAllocation);
    return Addr;
  }

  Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
    size_t ASize = MaxMemoryAllocation;

    if (!VAddr && isRecording())
      VAddr = suggestAddress(MaxMemoryAllocation);

    DP("Request %" PRIu64 " bytes allocated at %p\n", MaxMemoryAllocation,
       VAddr);

    if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
      return Err;

    if (isReplaying() && VAddr != MemoryStart) {
      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                           "record-replay cannot assign the "
                           "requested recorded address (%p, %p)",
                           VAddr, MemoryStart);
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
         "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart);

    MemoryPtr = MemoryStart;
    MemorySize = 0;
    TotalSize = ASize;
    UsedVAMap = true;
    return Plugin::success();
  }

  Error preAllocateHeuristic(uint64_t MaxMemoryAllocation,
                             uint64_t RequiredMemoryAllocation, void *VAddr) {
    const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
    constexpr size_t STEP = 1024 * 1024 * 1024ULL;
    MemoryStart = nullptr;
    for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) {
      MemoryStart = Device->allocate(TotalSize, /*HstPtr=*/nullptr,
                                     TARGET_ALLOC_DEFAULT);
      if (MemoryStart)
        break;
    }
    if (!MemoryStart)
      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                           "allocating record/replay memory");

    if (VAddr && VAddr != MemoryStart)
      MemoryOffset = uintptr_t(VAddr) - uintptr_t(MemoryStart);

    MemoryPtr = MemoryStart;
    MemorySize = 0;

    // Check if we need adjustment.
    if (MemoryOffset > 0 &&
        TotalSize >= RequiredMemoryAllocation + MemoryOffset) {
      // If we are off but "before" the required address and with enough space,
      // we just "allocate" the offset to match the required address.
      MemoryPtr = (char *)MemoryPtr + MemoryOffset;
      MemorySize += MemoryOffset;
      MemoryOffset = 0;
      assert(MemoryPtr == VAddr && "Expected offset adjustment to work");
    } else if (MemoryOffset) {
      // If we are off and in a situation we cannot just "waste" memory to
      // force a match, we hope adjusting the arguments is sufficient.
      REPORT(
          "WARNING Failed to allocate replay memory at required location %p, "
          "got %p, trying to offset argument pointers by %" PRIi64 "\n",
          VAddr, MemoryStart, MemoryOffset);
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
         "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
         MemoryStart);

    return Plugin::success();
  }

  Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
    if (Device->supportVAManagement()) {
      auto Err = preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
      if (!Err)
        return Plugin::success();
      REPORT("WARNING VA mapping failed, fallback to heuristic: "
             "(Error: %s)\n",
             toString(std::move(Err)).data());
    }

    uint64_t DevMemSize;
    if (Device->getDeviceMemorySize(DevMemSize))
      return Plugin::error(ErrorCode::UNKNOWN,
                           "cannot determine device memory size");

    return preAllocateHeuristic(DevMemSize, DeviceMemorySize, ReqVAddr);
  }

  void dumpDeviceMemory(StringRef Filename) {
    ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
        WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
    if (!DeviceMemoryMB)
      report_fatal_error("Error creating MemoryBuffer for device memory");

    auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
                                    MemoryStart, MemorySize, nullptr);
    if (Err)
      report_fatal_error("Error retrieving data for target pointer");

    StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize);
    std::error_code EC;
    raw_fd_ostream OS(Filename, EC);
    if (EC)
      report_fatal_error("Error dumping memory to file " + Filename + " :" +
                         EC.message());
    OS << DeviceMemory;
    OS.close();
  }

public:
  bool isRecording() const { return Status == RRStatusTy::RRRecording; }
  bool isReplaying() const { return Status == RRStatusTy::RRReplaying; }
  bool isRecordingOrReplaying() const {
    return (Status != RRStatusTy::RRDeactivated);
  }
  void setStatus(RRStatusTy Status) { this->Status = Status; }
  bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
  void addEntry(const char *Name, uint64_t Size, void *Addr) {
    GlobalEntries.emplace_back(GlobalEntry{Name, Size, Addr});
  }

  void saveImage(const char *Name, const DeviceImageTy &Image) {
    SmallString<128> ImageName = {Name, ".image"};
    std::error_code EC;
    raw_fd_ostream OS(ImageName, EC);
    if (EC)
      report_fatal_error("Error saving image : " + StringRef(EC.message()));
    if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) {
      size_t Size = utils::getPtrDiff(TgtImageBitcode->ImageEnd,
                                      TgtImageBitcode->ImageStart);
      MemoryBufferRef MBR = MemoryBufferRef(
          StringRef((const char *)TgtImageBitcode->ImageStart, Size), "");
      OS << MBR.getBuffer();
    } else {
      OS << Image.getMemoryBuffer().getBuffer();
    }
    OS.close();
  }

  void dumpGlobals(StringRef Filename, DeviceImageTy &Image) {
    int32_t Size = 0;

    for (auto &OffloadEntry : GlobalEntries) {
      if (!OffloadEntry.Size)
        continue;
      // Get the total size of the string and entry including the null byte.
      Size += std::strlen(OffloadEntry.Name) + 1 + sizeof(uint32_t) +
              OffloadEntry.Size;
    }

    ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
        WritableMemoryBuffer::getNewUninitMemBuffer(Size);
    if (!GlobalsMB)
      report_fatal_error("Error creating MemoryBuffer for globals memory");

    void *BufferPtr = GlobalsMB.get()->getBufferStart();
    for (auto &OffloadEntry : GlobalEntries) {
      if (!OffloadEntry.Size)
        continue;

      int32_t NameLength = std::strlen(OffloadEntry.Name) + 1;
      memcpy(BufferPtr, OffloadEntry.Name, NameLength);
      BufferPtr = utils::advancePtr(BufferPtr, NameLength);

      *((uint32_t *)(BufferPtr)) = OffloadEntry.Size;
      BufferPtr = utils::advancePtr(BufferPtr, sizeof(uint32_t));

      if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.Addr,
                                          OffloadEntry.Size, nullptr))
        report_fatal_error("Error retrieving data for global");
      BufferPtr = utils::advancePtr(BufferPtr, OffloadEntry.Size);
    }
    assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
           "Buffer over/under-filled.");
    assert(Size == utils::getPtrDiff(BufferPtr,
                                     GlobalsMB->get()->getBufferStart()) &&
           "Buffer size mismatch");

    StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size);
    std::error_code EC;
    raw_fd_ostream OS(Filename, EC);
    OS << GlobalsMemory;
    OS.close();
  }

  void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams,
                       int32_t NumArgs, uint64_t NumTeamsClause,
                       uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
    json::Object JsonKernelInfo;
    JsonKernelInfo["Name"] = Name;
    JsonKernelInfo["NumArgs"] = NumArgs;
    JsonKernelInfo["NumTeamsClause"] = NumTeamsClause;
    JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause;
    JsonKernelInfo["LoopTripCount"] = LoopTripCount;
    JsonKernelInfo["DeviceMemorySize"] = MemorySize;
    JsonKernelInfo["DeviceId"] = Device->getDeviceId();
    JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;

    json::Array JsonArgPtrs;
    for (int I = 0; I < NumArgs; ++I)
      JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]);
    JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));

    json::Array JsonArgOffsets;
    for (int I = 0; I < NumArgs; ++I)
      JsonArgOffsets.push_back(0);
    JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));

    SmallString<128> JsonFilename = {Name, ".json"};
    std::error_code EC;
    raw_fd_ostream JsonOS(JsonFilename.str(), EC);
    if (EC)
      report_fatal_error("Error saving kernel json file : " +
                         StringRef(EC.message()));
    JsonOS << json::Value(std::move(JsonKernelInfo));
    JsonOS.close();
  }
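
  // For illustration only, a descriptor written by saveKernelDescr might look
  // like this (all values hypothetical, ArgOffsets is always zeroed above):
  //   {"Name": "foo", "NumArgs": 2, "NumTeamsClause": 0,
  //    "ThreadLimitClause": 0, "LoopTripCount": 1024,
  //    "DeviceMemorySize": 65536, "DeviceId": 0,
  //    "BumpAllocVAStart": 140737488355328,
  //    "ArgPtrs": [140737488355328, 140737488355344], "ArgOffsets": [0, 0]}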

  void saveKernelInput(const char *Name, DeviceImageTy &Image) {
    SmallString<128> GlobalsFilename = {Name, ".globals"};
    dumpGlobals(GlobalsFilename, Image);

    SmallString<128> MemoryFilename = {Name, ".memory"};
    dumpDeviceMemory(MemoryFilename);
  }

  void saveKernelOutputInfo(const char *Name) {
    SmallString<128> OutputFilename = {
        Name, (isRecording() ? ".original.output" : ".replay.output")};
    dumpDeviceMemory(OutputFilename);
  }

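  // Bump-allocate from the pre-allocated region: allocations are 16-byte
  // aligned and never individually freed, which keeps device addresses
  // deterministic across record and replay runs.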
  void *alloc(uint64_t Size) {
    assert(MemoryStart && "Expected memory has been pre-allocated");
    void *Alloc = nullptr;
    constexpr int Alignment = 16;
    // Assumes the alignment is a power of 2.
    int64_t AlignedSize = (Size + (Alignment - 1)) & (~(Alignment - 1));
    std::lock_guard<std::mutex> LG(AllocationLock);
    Alloc = MemoryPtr;
    MemoryPtr = (char *)MemoryPtr + AlignedSize;
    MemorySize += AlignedSize;
    DP("Memory allocator returns " DPxMOD "\n", DPxPTR(Alloc));
    return Alloc;
  }

  Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
             RRStatusTy Status, bool SaveOutput, uint64_t &ReqPtrArgOffset) {
    this->Device = Device;
    this->Status = Status;
    this->ReplaySaveOutput = SaveOutput;

    if (auto Err = preallocateDeviceMemory(MemSize, VAddr))
      return Err;

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
         "Record Replay Initialized (%p)"
         " as starting address, %zu Memory Size"
         " and set on status %s\n",
         MemoryStart, TotalSize,
         Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");

    // Tell the user to offset pointer arguments as the memory allocation does
    // not match.
    ReqPtrArgOffset = MemoryOffset;
    return Plugin::success();
  }

  void deinit() {
    if (UsedVAMap) {
      if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
        report_fatal_error("Error on releasing virtual memory space");
    } else {
      Device->free(MemoryStart);
    }
  }
};
} // namespace llvm::omp::target::plugin

// Extract the mapping of host function pointers to device function pointers
// from the entry table. Functions marked as 'indirect' in OpenMP will have
// offloading entries generated for them which map the host's function pointer
// to a global containing the corresponding function pointer on the device.
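// The table written to the device is a flat array of {host, device} pointer
// pairs sorted by host pointer, so a host function pointer can be translated
// efficiently on the device (e.g., by binary search).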
static Expected<std::pair<void *, uint64_t>>
setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device,
                       DeviceImageTy &Image) {
  GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();

  llvm::ArrayRef<llvm::offloading::EntryTy> Entries(
      Image.getTgtImage()->EntriesBegin, Image.getTgtImage()->EntriesEnd);
  llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
  for (const auto &Entry : Entries) {
    if (Entry.Kind != object::OffloadKind::OFK_OpenMP || Entry.Size == 0 ||
        !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT))
      continue;

    assert(Entry.Size == sizeof(void *) && "Global not a function pointer?");
    auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();

    GlobalTy DeviceGlobal(Entry.SymbolName, Entry.Size);
    if (auto Err =
            Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
      return std::move(Err);

    HstPtr = Entry.Address;
    if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(),
                                       Entry.Size, nullptr))
      return std::move(Err);
  }

  // If we do not have any indirect globals we exit early.
  if (IndirectCallTable.empty())
    return std::pair{nullptr, 0};

  // Sort the array to allow for more efficient lookup of device pointers.
  llvm::sort(IndirectCallTable,
             [](const auto &x, const auto &y) { return x.first < y.first; });

  uint64_t TableSize =
      IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
  void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE);
  if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(),
                                   TableSize, nullptr))
    return std::move(Err);
  return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
}

AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
                                       __tgt_async_info *AsyncInfoPtr)
    : Device(Device),
      AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {}

void AsyncInfoWrapperTy::finalize(Error &Err) {
  assert(AsyncInfoPtr && "AsyncInfoWrapperTy already finalized");

  // If we used a local async info object we want synchronous behavior. In that
  // case, and assuming the current status code is correct, we will synchronize
  // explicitly when the object is deleted. Update the error with the result of
  // the synchronize operation.
  if (AsyncInfoPtr == &LocalAsyncInfo && LocalAsyncInfo.Queue && !Err)
    Err = Device.synchronize(&LocalAsyncInfo);

  // Invalidate the wrapper object.
  AsyncInfoPtr = nullptr;
}
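
// A minimal usage sketch, mirroring the pattern used by dataSubmit and
// friends below: wrap the caller's __tgt_async_info, run the asynchronous
// implementation, then call finalize() exactly once so a locally created
// async info is synchronized before control returns to the caller.
//
//   AsyncInfoWrapperTy Wrapper(Device, AsyncInfo);
//   auto Err = Device.dataSubmitImpl(TgtPtr, HstPtr, Size, Wrapper);
//   Wrapper.finalize(Err);
//   return Err;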

Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
                            DeviceImageTy &Image) {

  ImagePtr = &Image;

  // Retrieve kernel environment object for the kernel.
  std::string EnvironmentName = std::string(Name) + "_kernel_environment";
  GenericGlobalHandlerTy &GHandler = GenericDevice.Plugin.getGlobalHandler();
  if (GHandler.isSymbolInImage(GenericDevice, Image, EnvironmentName)) {
    GlobalTy KernelEnv(EnvironmentName, sizeof(KernelEnvironment),
                       &KernelEnvironment);
    if (auto Err =
            GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv))
      return Err;
  } else {
    KernelEnvironment = KernelEnvironmentTy{};
    DP("Failed to read kernel environment for '%s'. Using default Bare (0) "
       "execution mode\n",
       Name);
  }

  // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
  MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0
                      ? std::min(KernelEnvironment.Configuration.MaxThreads,
                                 int32_t(GenericDevice.getThreadLimit()))
                      : GenericDevice.getThreadLimit();

  // Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref;
  PreferredNumThreads =
      KernelEnvironment.Configuration.MinThreads > 0
          ? std::max(KernelEnvironment.Configuration.MinThreads,
                     int32_t(GenericDevice.getDefaultNumThreads()))
          : GenericDevice.getDefaultNumThreads();

  return initImpl(GenericDevice, Image);
}

Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
    GenericDeviceTy &GenericDevice, uint32_t Version,
    AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  // Ctor/Dtor have no arguments, replaying uses the original kernel launch
  // environment. Older versions of the compiler do not generate a kernel
  // launch environment.
  if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
      Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
    return nullptr;

  if (!KernelEnvironment.Configuration.ReductionDataSize ||
      !KernelEnvironment.Configuration.ReductionBufferLength)
    return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);

  // TODO: Check if the kernel needs a launch environment.
  auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
                                            /*HostPtr=*/nullptr,
                                            TargetAllocTy::TARGET_ALLOC_DEVICE);
  if (!AllocOrErr)
    return AllocOrErr.takeError();

  // Remember to free the memory later.
  AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);

  // Use the KLE in the __tgt_async_info to ensure a stable address for the
  // async data transfer.
  auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
  LocalKLE = KernelLaunchEnvironment;
  {
    auto AllocOrErr = GenericDevice.dataAlloc(
        KernelEnvironment.Configuration.ReductionDataSize *
            KernelEnvironment.Configuration.ReductionBufferLength,
        /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
    if (!AllocOrErr)
      return AllocOrErr.takeError();
    LocalKLE.ReductionBuffer = *AllocOrErr;
    // Remember to free the memory later.
    AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
  }

  INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
       "Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
       ", Size=%" PRId64 ", Name=KernelLaunchEnv\n",
       DPxPTR(&LocalKLE), DPxPTR(*AllocOrErr),
       sizeof(KernelLaunchEnvironmentTy));

  auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE,
                                      sizeof(KernelLaunchEnvironmentTy),
                                      AsyncInfoWrapper);
  if (Err)
    return Err;
  return static_cast<KernelLaunchEnvironmentTy *>(*AllocOrErr);
}

Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice,
                                       KernelArgsTy &KernelArgs,
                                       uint32_t NumThreads[3],
                                       uint32_t NumBlocks[3]) const {
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
       "Launching kernel %s with [%u,%u,%u] blocks and [%u,%u,%u] threads in "
       "%s mode\n",
       getName(), NumBlocks[0], NumBlocks[1], NumBlocks[2], NumThreads[0],
       NumThreads[1], NumThreads[2], getExecutionModeName());
  return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads,
                                NumBlocks);
}

Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                              KernelArgsTy &KernelArgs,
                                              uint32_t NumThreads[3],
                                              uint32_t NumBlocks[3]) const {
  return Plugin::success();
}

Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
                              ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
                              AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  llvm::SmallVector<void *, 16> Args;
  llvm::SmallVector<void *, 16> Ptrs;

  auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
      GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
  if (!KernelLaunchEnvOrErr)
    return KernelLaunchEnvOrErr.takeError();

  KernelLaunchParamsTy LaunchParams;

  // Kernel languages don't use indirection.
  if (KernelArgs.Flags.IsCUDA) {
    LaunchParams =
        *reinterpret_cast<KernelLaunchParamsTy *>(KernelArgs.ArgPtrs);
  } else {
    LaunchParams =
        prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
                    Args, Ptrs, *KernelLaunchEnvOrErr);
  }

  uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
                            KernelArgs.ThreadLimit[1],
                            KernelArgs.ThreadLimit[2]};
  uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
                           KernelArgs.NumTeams[2]};
  if (!isBareMode()) {
    NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
    NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
                                NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
  }

  // Record the kernel description after we modified the argument count and num
  // blocks/threads.
  RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
  if (RecordReplay.isRecording()) {
    RecordReplay.saveImage(getName(), getImage());
    RecordReplay.saveKernelInput(getName(), getImage());
    RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
                                 NumBlocks[0], NumThreads[0],
                                 KernelArgs.Tripcount);
  }

  if (auto Err =
          printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
    return Err;

  return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
                    LaunchParams, AsyncInfoWrapper);
}

KernelLaunchParamsTy GenericKernelTy::prepareArgs(
    GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
    uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
    llvm::SmallVectorImpl<void *> &Ptrs,
    KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const {
  uint32_t KLEOffset = !!KernelLaunchEnvironment;
  NumArgs += KLEOffset;

  if (NumArgs == 0)
    return KernelLaunchParamsTy{};

  Args.resize(NumArgs);
  Ptrs.resize(NumArgs);

  if (KernelLaunchEnvironment) {
    Args[0] = KernelLaunchEnvironment;
    Ptrs[0] = &Args[0];
  }

  for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
    Args[I] =
        (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
    Ptrs[I] = &Args[I];
  }
  return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
}
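
// For illustration: with a kernel launch environment KLE and two user
// arguments, the prepared layout is
//   Args = {KLE, ArgPtrs[0] + ArgOffsets[0], ArgPtrs[1] + ArgOffsets[1]}
// with Ptrs[I] = &Args[I], i.e. the kernel receives an array of pointers to
// the (offset-adjusted) argument values.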

uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
                                        uint32_t ThreadLimitClause[3]) const {
  assert(!isBareMode() && "bare kernel should not call this function");

  assert(ThreadLimitClause[1] == 1 && ThreadLimitClause[2] == 1 &&
         "Multi dimensional launch not supported yet.");

  if (ThreadLimitClause[0] > 0 && isGenericMode())
    ThreadLimitClause[0] += GenericDevice.getWarpSize();

  return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0)
                                     ? ThreadLimitClause[0]
                                     : PreferredNumThreads);
}
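
// Example (illustrative): with MaxNumThreads = 1024, a thread_limit(128)
// clause, and a warp size of 32 in generic mode, the clause value is bumped
// to 160 (presumably to leave room for the extra warp generic mode uses for
// the main thread), yielding min(1024, 160) = 160 threads.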

uint32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
                                       uint32_t NumTeamsClause[3],
                                       uint64_t LoopTripCount,
                                       uint32_t &NumThreads,
                                       bool IsNumThreadsFromUser) const {
  assert(!isBareMode() && "bare kernel should not call this function");

  assert(NumTeamsClause[1] == 1 && NumTeamsClause[2] == 1 &&
         "Multi dimensional launch not supported yet.");

  if (NumTeamsClause[0] > 0) {
    // TODO: We need to honor any value and consequently allow more than the
    // block limit. For this we might need to start multiple kernels or let the
    // blocks start again until the requested number has been started.
    return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
  }

  uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
  uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
  if (LoopTripCount > 0) {
    if (isSPMDMode()) {
      // We have a combined construct, i.e. `target teams distribute
      // parallel for [simd]`. We launch enough teams so that each thread
      // will execute one iteration of the loop, rounded up to the nearest
      // integer. However, if that results in too few teams, we artificially
      // reduce the thread count per team to increase the outer parallelism.
      auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop();
      MinThreads = std::min(MinThreads, NumThreads);

      // Honor the thread_limit clause; only lower the number of threads.
      [[maybe_unused]] auto OldNumThreads = NumThreads;
      if (LoopTripCount >= DefaultNumBlocks * NumThreads ||
          IsNumThreadsFromUser) {
        // Enough parallelism for teams and threads.
        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
        assert((IsNumThreadsFromUser ||
                TripCountNumBlocks >= DefaultNumBlocks) &&
               "Expected sufficient outer parallelism.");
      } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) {
        // Enough parallelism for teams, limit threads.

        // This case is hard; for now, we force "full warps":
        // First, compute a thread count assuming DefaultNumBlocks.
        auto NumThreadsDefaultBlocks =
            (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks;
        // Now get a power of two that is larger or equal.
        auto NumThreadsDefaultBlocksP2 =
            llvm::PowerOf2Ceil(NumThreadsDefaultBlocks);
        // Do not increase a thread limit given by the user.
        NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2));
        assert(NumThreads >= MinThreads &&
               "Expected sufficient inner parallelism.");
        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
      } else {
        // Not enough parallelism for teams and threads, limit both.
        NumThreads = std::min(NumThreads, MinThreads);
        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
      }
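
      // Worked example (illustrative): with LoopTripCount = 10000,
      // NumThreads = 256, and DefaultNumBlocks = 32, the first case applies
      // (10000 >= 32 * 256 = 8192) and we launch ceil(10000 / 256) = 40 teams.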

      assert(NumThreads * TripCountNumBlocks >= LoopTripCount &&
             "Expected sufficient parallelism");
      assert(OldNumThreads >= NumThreads &&
             "Number of threads cannot be increased!");
    } else {
      assert((isGenericMode() || isGenericSPMDMode()) &&
             "Unexpected execution mode!");
      // If we reach this point, then we have a non-combined construct, i.e.
      // `teams distribute` with a nested `parallel for` and each team is
      // assigned one iteration of the `distribute` loop. E.g.:
      //
      // #pragma omp target teams distribute
      // for(...loop_tripcount...) {
      //   #pragma omp parallel for
      //   for(...) {}
      // }
      //
      // Threads within a team will execute the iterations of the `parallel`
      // loop.
      TripCountNumBlocks = LoopTripCount;
    }
  }

  uint32_t PreferredNumBlocks = TripCountNumBlocks;
  // If the loops are long running, we would rather reuse blocks than spawn too
  // many.
  if (GenericDevice.getReuseBlocksForHighTripCount())
    PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
  return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
}

GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
                                 int32_t NumDevices,
                                 const llvm::omp::GV &OMPGridValues)
    : Plugin(Plugin), MemoryManager(nullptr), OMP_TeamLimit("OMP_TEAM_LIMIT"),
      OMP_NumTeams("OMP_NUM_TEAMS"),
      OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
      OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
      OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
      // Do not initialize the following two envars since they depend on the
      // device initialization. These cannot be consulted until the device is
      // initialized correctly. We initialize them in GenericDeviceTy::init().
      OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
      // By default, the initial number of streams and events is 1.
      OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
      OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
      DeviceId(DeviceId), GridValues(OMPGridValues),
      PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
      PinnedAllocs(*this), RPCServer(nullptr) {
#ifdef OMPT_SUPPORT
  OmptInitialized.store(false);
  // Bind the callbacks to this device's member functions.
#define bindOmptCallback(Name, Type, Code)                                    \
  if (ompt::Initialized && ompt::lookupCallbackByCode) {                      \
    ompt::lookupCallbackByCode((ompt_callbacks_t)(Code),                      \
                               ((ompt_callback_t *)&(Name##_fn)));            \
    DP("OMPT: class bound %s=%p\n", #Name, ((void *)(uint64_t)Name##_fn));    \
  }

  FOREACH_OMPT_DEVICE_EVENT(bindOmptCallback);
#undef bindOmptCallback

#endif
}

Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
  if (auto Err = initImpl(Plugin))
    return Err;

#ifdef OMPT_SUPPORT
  if (ompt::Initialized) {
    bool ExpectedStatus = false;
    if (OmptInitialized.compare_exchange_strong(ExpectedStatus, true))
      performOmptCallback(device_initialize, Plugin.getUserId(DeviceId),
                          /*type=*/getComputeUnitKind().c_str(),
                          /*device=*/reinterpret_cast<ompt_device_t *>(this),
                          /*lookup=*/ompt::lookupCallbackByName,
                          /*documentation=*/nullptr);
  }
#endif

  // Read and reinitialize the envars that depend on the device initialization.
  // Notice these two envars may change the stack size and heap size of the
  // device, so they need the device properly initialized.
  auto StackSizeEnvarOrErr = UInt64Envar::create(
      "LIBOMPTARGET_STACK_SIZE",
      [this](uint64_t &V) -> Error { return getDeviceStackSize(V); },
      [this](uint64_t V) -> Error { return setDeviceStackSize(V); });
  if (!StackSizeEnvarOrErr)
    return StackSizeEnvarOrErr.takeError();
  OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);

  auto HeapSizeEnvarOrErr = UInt64Envar::create(
      "LIBOMPTARGET_HEAP_SIZE",
      [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
      [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
  if (!HeapSizeEnvarOrErr)
    return HeapSizeEnvarOrErr.takeError();
  OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
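
  // For example, a user can override both limits from the environment before
  // launching an application (the byte values here are illustrative):
  //   LIBOMPTARGET_STACK_SIZE=8192 LIBOMPTARGET_HEAP_SIZE=1073741824 ./app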

  // Update the maximum number of teams and threads after the device
  // initialization sets the corresponding hardware limit.
  if (OMP_NumTeams > 0)
    GridValues.GV_Max_Teams =
        std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams));

  if (OMP_TeamsThreadLimit > 0)
    GridValues.GV_Max_WG_Size =
        std::min(GridValues.GV_Max_WG_Size, uint32_t(OMP_TeamsThreadLimit));

  // Enable the memory manager if required.
  auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
  if (EnableMM)
    MemoryManager = new MemoryManagerTy(*this, ThresholdMM);

  return Plugin::success();
}

Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
  for (DeviceImageTy *Image : LoadedImages)
    if (auto Err = callGlobalDestructors(Plugin, *Image))
      return Err;

  if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
    GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
    for (auto *Image : LoadedImages) {
      DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
      GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
                             sizeof(DeviceMemoryPoolTrackingTy),
                             &ImageDeviceMemoryPoolTracking);
      if (auto Err =
              GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
        consumeError(std::move(Err));
        continue;
      }
      DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
    }

    // TODO: Write this by default into a file.
    printf("\n\n|-----------------------\n"
           "| Device memory tracker:\n"
           "|-----------------------\n"
           "| #Allocations: %lu\n"
           "| Bytes allocated: %lu\n"
           "| Minimal allocation: %lu\n"
           "| Maximal allocation: %lu\n"
           "|-----------------------\n\n\n",
           DeviceMemoryPoolTracking.NumAllocations,
           DeviceMemoryPoolTracking.AllocationTotal,
           DeviceMemoryPoolTracking.AllocationMin,
           DeviceMemoryPoolTracking.AllocationMax);
  }

  for (auto *Image : LoadedImages) {
    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
    auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
    if (!ProfOrErr)
      return ProfOrErr.takeError();

    if (ProfOrErr->empty())
      continue;

    // Dump out profdata.
    if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
        uint32_t(DeviceDebugKind::PGODump))
      ProfOrErr->dump();

    // Write data to profiling file.
    if (auto Err = ProfOrErr->write())
      return Err;
  }

  // Delete the memory manager before deinitializing the device. Otherwise,
  // we may delete device allocations after the device is deinitialized.
  if (MemoryManager)
    delete MemoryManager;
  MemoryManager = nullptr;

  RecordReplayTy &RecordReplay = Plugin.getRecordReplay();
  if (RecordReplay.isRecordingOrReplaying())
    RecordReplay.deinit();

  if (RPCServer)
    if (auto Err = RPCServer->deinitDevice(*this))
      return Err;

#ifdef OMPT_SUPPORT
  if (ompt::Initialized) {
    bool ExpectedStatus = true;
    if (OmptInitialized.compare_exchange_strong(ExpectedStatus, false))
      performOmptCallback(device_finalize, Plugin.getUserId(DeviceId));
  }
#endif

  return deinitImpl();
}

Expected<DeviceImageTy *>
GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
                            const __tgt_device_image *InputTgtImage) {
  assert(InputTgtImage && "Expected non-null target image");
  DP("Load data from image " DPxMOD "\n", DPxPTR(InputTgtImage->ImageStart));

  auto PostJITImageOrErr = Plugin.getJIT().process(*InputTgtImage, *this);
  if (!PostJITImageOrErr) {
    auto Err = PostJITImageOrErr.takeError();
    REPORT("Failure to jit IR image %p on device %d: %s\n", InputTgtImage,
           DeviceId, toString(std::move(Err)).data());
    return nullptr;
  }

  // Load the binary and allocate the image object. Use the next available id
  // for the image id, which is the number of previously loaded images.
  auto ImageOrErr =
      loadBinaryImpl(PostJITImageOrErr.get(), LoadedImages.size());
  if (!ImageOrErr)
    return ImageOrErr.takeError();

  DeviceImageTy *Image = *ImageOrErr;
  assert(Image != nullptr && "Invalid image");
  if (InputTgtImage != PostJITImageOrErr.get())
    Image->setTgtImageBitcode(InputTgtImage);

  // Add the image to the list.
  LoadedImages.push_back(Image);

  // Setup the device environment if needed.
  if (auto Err = setupDeviceEnvironment(Plugin, *Image))
    return std::move(Err);

  // Setup the global device memory pool if needed.
  if (!Plugin.getRecordReplay().isReplaying() &&
      shouldSetupDeviceMemoryPool()) {
    uint64_t HeapSize;
    auto SizeOrErr = getDeviceHeapSize(HeapSize);
    if (SizeOrErr) {
      REPORT("No global device memory pool due to error: %s\n",
             toString(std::move(SizeOrErr)).data());
    } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
      return std::move(Err);
  }

  if (auto Err = setupRPCServer(Plugin, *Image))
    return std::move(Err);

#ifdef OMPT_SUPPORT
  if (ompt::Initialized) {
    size_t Bytes =
        utils::getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
    performOmptCallback(
        device_load, Plugin.getUserId(DeviceId),
        /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr,
        /*ImgSize=*/Bytes, /*HostAddr=*/InputTgtImage->ImageStart,
        /*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0);
  }
#endif

  // Call any global constructors present on the device.
  if (auto Err = callGlobalConstructors(Plugin, *Image))
    return std::move(Err);

  // Return the pointer to the table of entries.
  return Image;
}

Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
                                              DeviceImageTy &Image) {
  // There are some plugins that do not need this step.
  if (!shouldSetupDeviceEnvironment())
    return Plugin::success();

  // Obtain a table mapping host function pointers to device function pointers.
  auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image);
  if (!CallTablePairOrErr)
    return CallTablePairOrErr.takeError();

  DeviceEnvironmentTy DeviceEnvironment;
  DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind;
  DeviceEnvironment.NumDevices = Plugin.getNumDevices();
  // TODO: The device ID used here is not the real device ID used by OpenMP.
  DeviceEnvironment.DeviceNum = DeviceId;
  DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
  DeviceEnvironment.ClockFrequency = getClockFrequency();
  DeviceEnvironment.IndirectCallTable =
      reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
  DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
  DeviceEnvironment.HardwareParallelism = getHardwareParallelism();

  // Create the metainfo of the device environment global.
  GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
                        sizeof(DeviceEnvironmentTy), &DeviceEnvironment);

  // Write device environment values to the device.
  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) {
    DP("Missing symbol %s, continuing execution anyway.\n",
       DevEnvGlobal.getName().data());
    consumeError(std::move(Err));
  }
  return Plugin::success();
}

Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
                                             DeviceImageTy &Image,
                                             uint64_t PoolSize) {
  // Free the old pool, if any.
  if (DeviceMemoryPool.Ptr) {
    if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
                              TargetAllocTy::TARGET_ALLOC_DEVICE))
      return Err;
  }

  DeviceMemoryPool.Size = PoolSize;
  auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
                              TargetAllocTy::TARGET_ALLOC_DEVICE);
  if (AllocOrErr) {
    DeviceMemoryPool.Ptr = *AllocOrErr;
  } else {
    auto Err = AllocOrErr.takeError();
    REPORT("Failure to allocate device memory for global memory pool: %s\n",
           toString(std::move(Err)).data());
    DeviceMemoryPool.Ptr = nullptr;
    DeviceMemoryPool.Size = 0;
  }

  // Skip the pool tracker if the image does not provide the tracker symbol.
  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
  if (!GHandler.isSymbolInImage(*this, Image,
                                "__omp_rtl_device_memory_pool_tracker")) {
    DP("Skip the memory pool as there is no tracker symbol in the image.\n");
    return Plugin::success();
  }

  GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
                         sizeof(DeviceMemoryPoolTrackingTy),
                         &DeviceMemoryPoolTracking);
  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
    return Err;

  // Create the metainfo of the device memory pool global.
  GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
                        sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);

  // Write the device memory pool values to the device.
  return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
}

Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
                                      DeviceImageTy &Image) {
  // The plugin either does not need an RPC server or it is unavailable.
  if (!shouldSetupRPCServer())
    return Plugin::success();

  // Check if this device needs to run an RPC server.
  RPCServerTy &Server = Plugin.getRPCServer();
  auto UsingOrErr =
      Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image);
  if (!UsingOrErr)
    return UsingOrErr.takeError();

  if (!UsingOrErr.get())
    return Plugin::success();

  if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
    return Err;

  if (auto Err = Server.startThread())
    return Err;

  RPCServer = &Server;
  DP("Running an RPC server on device %d\n", getDeviceId());
  return Plugin::success();
}

Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
                                         size_t Size, bool ExternallyLocked) {
  // Insert the new entry into the map.
  auto Res = Allocs.insert({HstPtr, DevAccessiblePtr, Size, ExternallyLocked});
  if (!Res.second)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "cannot insert locked buffer entry");

  // Check whether the next entry overlaps with the inserted entry.
  auto It = std::next(Res.first);
  if (It == Allocs.end())
    return Plugin::success();

  const EntryTy *NextEntry = &(*It);
  if (intersects(NextEntry->HstPtr, NextEntry->Size, HstPtr, Size))
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "partial overlapping not allowed in locked buffers");

  return Plugin::success();
}

Error PinnedAllocationMapTy::eraseEntry(const EntryTy &Entry) {
  // Erase the existing entry. Notice this requires an additional map lookup,
  // but this should not be a performance issue. Using iterators would make
  // the code more difficult to read.
  size_t Erased = Allocs.erase({Entry.HstPtr});
  if (!Erased)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "cannot erase locked buffer entry");
  return Plugin::success();
}

Error PinnedAllocationMapTy::registerEntryUse(const EntryTy &Entry,
                                              void *HstPtr, size_t Size) {
  if (!contains(Entry.HstPtr, Entry.Size, HstPtr, Size))
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "partial overlapping not allowed in locked buffers");

  ++Entry.References;
  return Plugin::success();
}

Expected<bool> PinnedAllocationMapTy::unregisterEntryUse(const EntryTy &Entry) {
  if (Entry.References == 0)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "invalid number of references");

  // Return whether this was the last user.
  return (--Entry.References == 0);
}

Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr,
                                                void *DevAccessiblePtr,
                                                size_t Size) {
  assert(HstPtr && "Invalid pointer");
  assert(DevAccessiblePtr && "Invalid pointer");
  assert(Size && "Invalid size");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  // No pinned allocation should intersect.
  const EntryTy *Entry = findIntersecting(HstPtr);
  if (Entry)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "cannot insert entry due to an existing one");

  // Now insert the new entry.
  return insertEntry(HstPtr, DevAccessiblePtr, Size);
}

Error PinnedAllocationMapTy::unregisterHostBuffer(void *HstPtr) {
  assert(HstPtr && "Invalid pointer");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  const EntryTy *Entry = findIntersecting(HstPtr);
  if (!Entry)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "cannot find locked buffer");

  // The address in the entry should be the same we are unregistering.
  if (Entry->HstPtr != HstPtr)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "unexpected host pointer in locked buffer entry");

  // Unregister from the entry.
  auto LastUseOrErr = unregisterEntryUse(*Entry);
  if (!LastUseOrErr)
    return LastUseOrErr.takeError();

  // There should be no other references to the pinned allocation.
  if (!(*LastUseOrErr))
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "the locked buffer is still being used");

  // Erase the entry from the map.
  return eraseEntry(*Entry);
}

Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr,
                                                       size_t Size) {
  assert(HstPtr && "Invalid pointer");
  assert(Size && "Invalid size");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  const EntryTy *Entry = findIntersecting(HstPtr);

  if (Entry) {
    // An already registered intersecting buffer was found. Register a new use.
    if (auto Err = registerEntryUse(*Entry, HstPtr, Size))
      return std::move(Err);

    // Return the device accessible pointer with the correct offset.
    return utils::advancePtr(Entry->DevAccessiblePtr,
                             utils::getPtrDiff(HstPtr, Entry->HstPtr));
  }

  // No intersecting registered allocation found in the map. First, lock the
  // host buffer and retrieve the device accessible pointer.
  auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
  if (!DevAccessiblePtrOrErr)
    return DevAccessiblePtrOrErr.takeError();

  // Now insert the new entry into the map.
  if (auto Err = insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size))
    return std::move(Err);

  // Return the device accessible pointer.
  return *DevAccessiblePtrOrErr;
}
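
// Example (illustrative): if [P, P + 1024) is already locked and maps to the
// device accessible pointer D, locking the contained range [P + 256, P + 512)
// only bumps the entry's reference count and returns D + 256. Requests that
// merely partially overlap an existing entry are rejected by registerEntryUse.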

Error PinnedAllocationMapTy::unlockHostBuffer(void *HstPtr) {
  assert(HstPtr && "Invalid pointer");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  const EntryTy *Entry = findIntersecting(HstPtr);
  if (!Entry)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "cannot find locked buffer");

  // Unregister from the locked buffer. No need to do anything if there are
  // others using the allocation.
  auto LastUseOrErr = unregisterEntryUse(*Entry);
  if (!LastUseOrErr)
    return LastUseOrErr.takeError();

  // No need to do anything if there are others using the allocation.
  if (!(*LastUseOrErr))
    return Plugin::success();

  // This was the last user of the allocation. Unlock the original locked
  // buffer if it was locked by the plugin. Do not unlock it if it was locked
  // by an external entity. Unlock the buffer using the host pointer of the
  // entry.
  if (!Entry->ExternallyLocked)
    if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
      return Err;

  // Erase the entry from the map.
  return eraseEntry(*Entry);
}

Error PinnedAllocationMapTy::lockMappedHostBuffer(void *HstPtr, size_t Size) {
  assert(HstPtr && "Invalid pointer");
  assert(Size && "Invalid size");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  // If previously registered, just register a new user on the entry.
  const EntryTy *Entry = findIntersecting(HstPtr);
  if (Entry)
    return registerEntryUse(*Entry, HstPtr, Size);

  size_t BaseSize;
  void *BaseHstPtr, *BaseDevAccessiblePtr;

  // Check if it was externally pinned by a vendor-specific API.
  auto IsPinnedOrErr = Device.isPinnedPtrImpl(HstPtr, BaseHstPtr,
                                              BaseDevAccessiblePtr, BaseSize);
  if (!IsPinnedOrErr)
    return IsPinnedOrErr.takeError();

  // If pinned, just insert the entry representing the whole pinned buffer.
  if (*IsPinnedOrErr)
    return insertEntry(BaseHstPtr, BaseDevAccessiblePtr, BaseSize,
                       /*ExternallyLocked=*/true);

  // Not externally pinned. Do nothing if locking of mapped buffers is
  // disabled.
  if (!LockMappedBuffers)
    return Plugin::success();

  // Otherwise, lock the buffer and insert the new entry.
  auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
  if (!DevAccessiblePtrOrErr) {
    // Errors may be tolerated.
    if (!IgnoreLockMappedFailures)
      return DevAccessiblePtrOrErr.takeError();

    consumeError(DevAccessiblePtrOrErr.takeError());
    return Plugin::success();
  }

  return insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size);
}

Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
  assert(HstPtr && "Invalid pointer");

  std::lock_guard<std::shared_mutex> Lock(Mutex);

  // Check whether there is any intersecting entry.
  const EntryTy *Entry = findIntersecting(HstPtr);

  // No entry and automatic locking of mapped buffers is disabled, so there is
  // nothing to do.
  if (!Entry && !LockMappedBuffers)
    return Plugin::success();

  // No entry, automatic locking is enabled, but the locking may have failed,
  // so do nothing.
  if (!Entry && IgnoreLockMappedFailures)
    return Plugin::success();

  // No entry, but automatic locking is enabled, so this is an error.
  if (!Entry)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "locked buffer not found");

  // There is an entry, so unregister a user and check whether it was the last
  // one.
  auto LastUseOrErr = unregisterEntryUse(*Entry);
  if (!LastUseOrErr)
    return LastUseOrErr.takeError();

  // If it is not the last one, there is nothing to do.
  if (!(*LastUseOrErr))
    return Plugin::success();

  // Otherwise, if it was the last and the buffer was locked by the plugin,
  // unlock it.
  if (!Entry->ExternallyLocked)
    if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
      return Err;

  // Finally erase the entry from the map.
  return eraseEntry(*Entry);
}

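// Synchronize all pending operations on the queue and then release any device
// allocations whose lifetime was deferred to this point via
// freeAllocationAfterSynchronization().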
Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) {
  if (!AsyncInfo || !AsyncInfo->Queue)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "invalid async info queue");

  if (auto Err = synchronizeImpl(*AsyncInfo))
    return Err;

  for (auto *Ptr : AsyncInfo->AssociatedAllocations)
    if (auto Err = dataDelete(Ptr, TargetAllocTy::TARGET_ALLOC_DEVICE))
      return Err;
  AsyncInfo->AssociatedAllocations.clear();

  return Plugin::success();
}

Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
  if (!AsyncInfo || !AsyncInfo->Queue)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "invalid async info queue");

  return queryAsyncImpl(*AsyncInfo);
}

Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
  return Plugin::error(ErrorCode::UNSUPPORTED,
                       "device does not support VA management");
}

Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
  return Plugin::error(ErrorCode::UNSUPPORTED,
                       "device does not support VA management");
}

Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
  return Plugin::error(
      ErrorCode::UNIMPLEMENTED,
      "missing getDeviceMemorySize implementation (required by RR-heuristic)");
}

Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
                                            TargetAllocTy Kind) {
  void *Alloc = nullptr;

  if (Plugin.getRecordReplay().isRecordingOrReplaying())
    return Plugin.getRecordReplay().alloc(Size);

  switch (Kind) {
  case TARGET_ALLOC_DEFAULT:
  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
  case TARGET_ALLOC_DEVICE:
    if (MemoryManager) {
      Alloc = MemoryManager->allocate(Size, HostPtr);
      if (!Alloc)
        return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
                             "failed to allocate from memory manager");
      break;
    }
    [[fallthrough]];
  case TARGET_ALLOC_HOST:
  case TARGET_ALLOC_SHARED:
    Alloc = allocate(Size, HostPtr, Kind);
    if (!Alloc)
      return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
                           "failed to allocate from device allocator");
  }

  // Report an error if the memory manager or the device allocator did not
  // return any memory buffer.
  if (!Alloc)
    return Plugin::error(ErrorCode::UNIMPLEMENTED,
                         "invalid target data allocation kind or requested "
                         "allocator not implemented yet");

  // Register the allocated buffer as pinned memory if the type is host memory.
  if (Kind == TARGET_ALLOC_HOST)
    if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
      return std::move(Err);

  // Keep track of the allocation stack if we track allocation traces.
  if (OMPX_TrackAllocationTraces) {
    std::string StackTrace;
    llvm::raw_string_ostream OS(StackTrace);
    llvm::sys::PrintStackTrace(OS);

    AllocationTraceInfoTy *ATI = new AllocationTraceInfoTy();
    ATI->AllocationTrace = std::move(StackTrace);
    ATI->DevicePtr = Alloc;
    ATI->HostPtr = HostPtr;
    ATI->Size = Size;
    ATI->Kind = Kind;

    auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
    auto *&MapATI = (*AllocationTraceMap)[Alloc];
    ATI->LastAllocationInfo = MapATI;
    MapATI = ATI;
  }

  return Alloc;
}

Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
  // Free is a noop when recording or replaying.
  if (Plugin.getRecordReplay().isRecordingOrReplaying())
    return Plugin::success();

  // Keep track of the deallocation stack if we track allocation traces.
  if (OMPX_TrackAllocationTraces) {
    AllocationTraceInfoTy *ATI = nullptr;
    {
      auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
      ATI = (*AllocationTraceMap)[TgtPtr];
    }

    std::string StackTrace;
    llvm::raw_string_ostream OS(StackTrace);
    llvm::sys::PrintStackTrace(OS);

    if (!ATI)
      ErrorReporter::reportDeallocationOfNonAllocatedPtr(TgtPtr, Kind, ATI,
                                                         StackTrace);

    // ATI is not null, thus we can lock it to inspect and modify it further.
    std::lock_guard<std::mutex> LG(ATI->Lock);
    if (!ATI->DeallocationTrace.empty())
      ErrorReporter::reportDeallocationOfDeallocatedPtr(TgtPtr, Kind, ATI,
                                                        StackTrace);

    if (ATI->Kind != Kind)
      ErrorReporter::reportDeallocationOfWrongPtrKind(TgtPtr, Kind, ATI,
                                                      StackTrace);

    ATI->DeallocationTrace = StackTrace;
  }

  int Res;
  switch (Kind) {
  case TARGET_ALLOC_DEFAULT:
  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
  case TARGET_ALLOC_DEVICE:
    if (MemoryManager) {
      Res = MemoryManager->free(TgtPtr);
      if (Res)
        return Plugin::error(
            ErrorCode::OUT_OF_RESOURCES,
            "failure to deallocate device pointer %p via memory manager",
            TgtPtr);
      break;
    }
    [[fallthrough]];
  case TARGET_ALLOC_HOST:
  case TARGET_ALLOC_SHARED:
    Res = free(TgtPtr, Kind);
    if (Res)
      return Plugin::error(
          ErrorCode::UNKNOWN,
          "failure to deallocate device pointer %p via device deallocator",
          TgtPtr);
  }

  // Unregister the deallocated pinned memory buffer if the type is host
  // memory.
  if (Kind == TARGET_ALLOC_HOST)
    if (auto Err = PinnedAllocs.unregisterHostBuffer(TgtPtr))
      return Err;

  return Plugin::success();
}
1496
1497Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
1498 int64_t Size, __tgt_async_info *AsyncInfo) {
1499 AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1500
1501 auto Err = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
1502 AsyncInfoWrapper.finalize(Err);
1503 return Err;
1504}

Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
                                    int64_t Size, __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);

  auto Err = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
                                    void *DstPtr, int64_t Size,
                                    __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);

  auto Err = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
                                    ptrdiff_t *ArgOffsets,
                                    KernelArgsTy &KernelArgs,
                                    __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(
      *this,
      Plugin.getRecordReplay().isRecordingOrReplaying() ? nullptr : AsyncInfo);

  GenericKernelTy &GenericKernel =
      *reinterpret_cast<GenericKernelTy *>(EntryPtr);

  {
    std::string StackTrace;
    if (OMPX_TrackNumKernelLaunches) {
      llvm::raw_string_ostream OS(StackTrace);
      llvm::sys::PrintStackTrace(OS);
    }

    auto KernelTraceInfoRecord = KernelLaunchTraces.getExclusiveAccessor();
    (*KernelTraceInfoRecord)
        .emplace(&GenericKernel, std::move(StackTrace), AsyncInfo);
  }

  auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
                                  AsyncInfoWrapper);
  // Finalize here to guarantee that subsequent record-replay actions stay in
  // sync with the device.
  AsyncInfoWrapper.finalize(Err);

  RecordReplayTy &RecordReplay = Plugin.getRecordReplay();
  if (RecordReplay.isRecordingOrReplaying() &&
      RecordReplay.isSaveOutputEnabled())
    RecordReplay.saveKernelOutputInfo(GenericKernel.getName());

  return Err;
}
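
// A sketch of driving launchKernel() directly; the KernelArgsTy field names
// (NumArgs, NumTeams, ThreadLimit) follow the declaration in
// Shared/APITypes.h, while the kernel handle and argument arrays are assumed
// to be prepared by the caller:
//
//   KernelArgsTy KernelArgs{};
//   KernelArgs.NumArgs = NumArgs;      // Number of kernel arguments.
//   KernelArgs.NumTeams[0] = 1;        // One team in dimension 0 ...
//   KernelArgs.ThreadLimit[0] = 64;    // ... of at most 64 threads.
//   if (auto Err = Device.launchKernel(KernelPtr, ArgPtrs, ArgOffsets,
//                                      KernelArgs, /*AsyncInfo=*/nullptr))
//     return Err; // No queue was provided, so the launch is synchronous.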

Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
  assert(AsyncInfoPtr && "Invalid async info");

  *AsyncInfoPtr = new __tgt_async_info();

  AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);

  auto Err = initAsyncInfoImpl(AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
  assert(DeviceInfo && "Invalid device info");

  return initDeviceInfoImpl(DeviceInfo);
}

Error GenericDeviceTy::printInfo() {
  InfoQueueTy InfoQueue;

  // Get the vendor-specific info entries describing the device properties.
  if (auto Err = obtainInfoImpl(InfoQueue))
    return Err;

  // Print all info entries.
  InfoQueue.print();

  return Plugin::success();
}

Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
  return createEventImpl(EventPtrStorage);
}

Error GenericDeviceTy::destroyEvent(void *EventPtr) {
  return destroyEventImpl(EventPtr);
}

Error GenericDeviceTy::recordEvent(void *EventPtr,
                                   __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);

  auto Err = recordEventImpl(EventPtr, AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);

  auto Err = waitEventImpl(EventPtr, AsyncInfoWrapper);
  AsyncInfoWrapper.finalize(Err);
  return Err;
}

Error GenericDeviceTy::syncEvent(void *EventPtr) {
  return syncEventImpl(EventPtr);
}

bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }

Error GenericPluginTy::init() {
  if (Initialized)
    return Plugin::success();

  auto NumDevicesOrErr = initImpl();
  if (!NumDevicesOrErr)
    return NumDevicesOrErr.takeError();
  Initialized = true;

  NumDevices = *NumDevicesOrErr;
  if (NumDevices == 0)
    return Plugin::success();

  assert(Devices.size() == 0 && "Plugin already initialized");
  Devices.resize(NumDevices, nullptr);

  GlobalHandler = createGlobalHandler();
  assert(GlobalHandler && "Invalid global handler");

  RPCServer = new RPCServerTy(*this);
  assert(RPCServer && "Invalid RPC server");

  RecordReplay = new RecordReplayTy();
  assert(RecordReplay && "Invalid RR interface");

  return Plugin::success();
}
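
// Expected lifecycle, sketched from a hypothetical embedder's perspective:
// init() once, init_device() per device on demand, deinit() at teardown.
// The Plugin reference is assumed to come from the plugin's entry point:
//
//   if (auto Err = Plugin.init())
//     return Err;
//   for (int32_t Id = 0; Id < Plugin.number_of_devices(); ++Id)
//     (void)Plugin.init_device(Id); // Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
//   // ... offloading work ...
//   if (auto Err = Plugin.deinit())
//     return Err;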

Error GenericPluginTy::deinit() {
  assert(Initialized && "Plugin was not initialized!");

  // Deinitialize all active devices.
  for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
    if (Devices[DeviceId]) {
      if (auto Err = deinitDevice(DeviceId))
        return Err;
    }
    assert(!Devices[DeviceId] && "Device was not deinitialized");
  }

  // There is no global handler if no device is available.
  if (GlobalHandler)
    delete GlobalHandler;

  if (RPCServer) {
    if (Error Err = RPCServer->shutDown())
      return Err;
    delete RPCServer;
  }

  if (RecordReplay)
    delete RecordReplay;

  // Perform the final plugin-specific deinitialization.
  if (Error Err = deinitImpl())
    return Err;
  Initialized = false;

  return Plugin::success();
}

Error GenericPluginTy::initDevice(int32_t DeviceId) {
  assert(!Devices[DeviceId] && "Device already initialized");

  // Create the device and save the reference.
  GenericDeviceTy *Device = createDevice(*this, DeviceId, NumDevices);
  assert(Device && "Invalid device");

  // Save the device reference into the list.
  Devices[DeviceId] = Device;

  // Initialize the device and its resources.
  return Device->init(*this);
}

Error GenericPluginTy::deinitDevice(int32_t DeviceId) {
  // The device may already be deinitialized.
  if (Devices[DeviceId] == nullptr)
    return Plugin::success();

  // Deinitialize the device and release its resources.
  if (auto Err = Devices[DeviceId]->deinit(*this))
    return Err;

  // Delete the device and invalidate its reference.
  delete Devices[DeviceId];
  Devices[DeviceId] = nullptr;

  return Plugin::success();
}

Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const {
  // First check if this image is a regular ELF file.
  if (!utils::elf::isELF(Image))
    return false;

  // Check if this image is an ELF with a matching machine value.
  auto MachineOrErr = utils::elf::checkMachine(Image, getMagicElfBits());
  if (!MachineOrErr)
    return MachineOrErr.takeError();

  return MachineOrErr;
}
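
// The "machine value" compared above is the ELF header's e_machine field.
// Each plugin reports its expected value via getMagicElfBits(), so an image
// built for another target is rejected early and cheaply. A sketch of an
// override, assuming the uint16_t signature and an AMDGPU-targeted plugin:
//
//   uint16_t MyPluginTy::getMagicElfBits() const {
//     return ELF::EM_AMDGPU; // e_machine value 224 in the ELF spec.
//   }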

Expected<bool> GenericPluginTy::checkBitcodeImage(StringRef Image) const {
  if (identify_magic(Image) != file_magic::bitcode)
    return false;

  LLVMContext Context;
  auto ModuleOrErr = getLazyBitcodeModule(MemoryBufferRef(Image, ""), Context,
                                          /*ShouldLazyLoadMetadata=*/true);
  if (!ModuleOrErr)
    return ModuleOrErr.takeError();
  Module &M = **ModuleOrErr;

  return M.getTargetTriple().getArch() == getTripleArch();
}

int32_t GenericPluginTy::is_initialized() const { return Initialized; }

int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) {
  StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
                   utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));

  auto HandleError = [&](Error Err) -> bool {
    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
    DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
    return false;
  };
  switch (identify_magic(Buffer)) {
  case file_magic::elf:
  case file_magic::elf_relocatable:
  case file_magic::elf_executable:
  case file_magic::elf_shared_object:
  case file_magic::elf_core: {
    auto MatchOrErr = checkELFImage(Buffer);
    if (Error Err = MatchOrErr.takeError())
      return HandleError(std::move(Err));
    return *MatchOrErr;
  }
  case file_magic::bitcode: {
    auto MatchOrErr = checkBitcodeImage(Buffer);
    if (Error Err = MatchOrErr.takeError())
      return HandleError(std::move(Err));
    return *MatchOrErr;
  }
  default:
    return false;
  }
}
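
// A sketch of how a host runtime might probe plugins with this hook before
// touching any device (the Plugins container is an assumption):
//
//   for (GenericPluginTy *P : Plugins)
//     if (P->is_plugin_compatible(Image)) {
//       // P can handle the image; defer the per-architecture check to
//       // is_device_compatible() once a concrete device is selected.
//     }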

int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId,
                                              __tgt_device_image *Image) {
  StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
                   utils::getPtrDiff(Image->ImageEnd, Image->ImageStart));

  auto HandleError = [&](Error Err) -> bool {
    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
    DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
    return false;
  };
  switch (identify_magic(Buffer)) {
  case file_magic::elf:
  case file_magic::elf_relocatable:
  case file_magic::elf_executable:
  case file_magic::elf_shared_object:
  case file_magic::elf_core: {
    auto MatchOrErr = checkELFImage(Buffer);
    if (Error Err = MatchOrErr.takeError())
      return HandleError(std::move(Err));
    if (!*MatchOrErr)
      return false;

    // Perform plugin-dependent checks for the specific architecture if needed.
    auto CompatibleOrErr = isELFCompatible(DeviceId, Buffer);
    if (Error Err = CompatibleOrErr.takeError())
      return HandleError(std::move(Err));
    return *CompatibleOrErr;
  }
  case file_magic::bitcode: {
    auto MatchOrErr = checkBitcodeImage(Buffer);
    if (Error Err = MatchOrErr.takeError())
      return HandleError(std::move(Err));
    return *MatchOrErr;
  }
  default:
    return false;
  }
}

int32_t GenericPluginTy::is_device_initialized(int32_t DeviceId) const {
  return isValidDeviceId(DeviceId) && Devices[DeviceId] != nullptr;
}

int32_t GenericPluginTy::init_device(int32_t DeviceId) {
  auto Err = initDevice(DeviceId);
  if (Err) {
    REPORT("Failure to initialize device %d: %s\n", DeviceId,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::number_of_devices() { return getNumDevices(); }

int32_t GenericPluginTy::is_data_exchangable(int32_t SrcDeviceId,
                                             int32_t DstDeviceId) {
  return isDataExchangable(SrcDeviceId, DstDeviceId);
}

int32_t GenericPluginTy::initialize_record_replay(int32_t DeviceId,
                                                  int64_t MemorySize,
                                                  void *VAddr, bool isRecord,
                                                  bool SaveOutput,
                                                  uint64_t &ReqPtrArgOffset) {
  GenericDeviceTy &Device = getDevice(DeviceId);
  RecordReplayTy::RRStatusTy Status =
      isRecord ? RecordReplayTy::RRStatusTy::RRRecording
               : RecordReplayTy::RRStatusTy::RRReplaying;

  if (auto Err = RecordReplay->init(&Device, MemorySize, VAddr, Status,
                                    SaveOutput, ReqPtrArgOffset)) {
    REPORT("WARNING: record-replay failed to initialize properly with %" PRId64
           " bytes (Error: %s)\n",
           MemorySize, toString(std::move(Err)).data());
    RecordReplay->setStatus(RecordReplayTy::RRStatusTy::RRDeactivated);

    if (!isRecord)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}
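
// A direct-call sketch with hypothetical values: record kernels into 1 GiB
// of device memory at an unconstrained address and save kernel outputs:
//
//   uint64_t ReqPtrArgOffset = 0;
//   if (Plugin.initialize_record_replay(/*DeviceId=*/0,
//                                       /*MemorySize=*/1LL << 30,
//                                       /*VAddr=*/nullptr, /*isRecord=*/true,
//                                       /*SaveOutput=*/true,
//                                       ReqPtrArgOffset) != OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;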

int32_t GenericPluginTy::load_binary(int32_t DeviceId,
                                     __tgt_device_image *TgtImage,
                                     __tgt_device_binary *Binary) {
  GenericDeviceTy &Device = getDevice(DeviceId);

  auto ImageOrErr = Device.loadBinary(*this, TgtImage);
  if (!ImageOrErr) {
    auto Err = ImageOrErr.takeError();
    REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
           DeviceId, toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  DeviceImageTy *Image = *ImageOrErr;
  assert(Image != nullptr && "Invalid Image");

  *Binary = __tgt_device_binary{reinterpret_cast<uint64_t>(Image)};

  return OFFLOAD_SUCCESS;
}

void *GenericPluginTy::data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
                                  int32_t Kind) {
  auto AllocOrErr = getDevice(DeviceId).dataAlloc(
      Size, HostPtr, static_cast<TargetAllocTy>(Kind));
  if (!AllocOrErr) {
    auto Err = AllocOrErr.takeError();
    REPORT("Failure to allocate device memory: %s\n",
           toString(std::move(Err)).data());
    return nullptr;
  }
  assert(*AllocOrErr && "Null pointer upon successful allocation");

  return *AllocOrErr;
}

int32_t GenericPluginTy::data_delete(int32_t DeviceId, void *TgtPtr,
                                     int32_t Kind) {
  auto Err =
      getDevice(DeviceId).dataDelete(TgtPtr, static_cast<TargetAllocTy>(Kind));
  if (Err) {
    REPORT("Failure to deallocate device pointer %p: %s\n", TgtPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
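
// Allocation round trip as a sketch (hypothetical caller; a device-kind
// buffer is used, so the pointer is only dereferenceable on the device):
//
//   void *DevPtr = Plugin.data_alloc(DeviceId, /*Size=*/4096,
//                                    /*HostPtr=*/nullptr,
//                                    TARGET_ALLOC_DEVICE);
//   if (!DevPtr)
//     return OFFLOAD_FAIL;
//   // ... populate and read back via data_submit()/data_retrieve() ...
//   if (Plugin.data_delete(DeviceId, DevPtr, TARGET_ALLOC_DEVICE) !=
//       OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;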

int32_t GenericPluginTy::data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
                                   void **LockedPtr) {
  auto LockedPtrOrErr = getDevice(DeviceId).dataLock(Ptr, Size);
  if (!LockedPtrOrErr) {
    auto Err = LockedPtrOrErr.takeError();
    REPORT("Failure to lock memory %p: %s\n", Ptr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  if (!(*LockedPtrOrErr)) {
    REPORT("Failure to lock memory %p: obtained a null locked pointer\n", Ptr);
    return OFFLOAD_FAIL;
  }
  *LockedPtr = *LockedPtrOrErr;

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::data_unlock(int32_t DeviceId, void *Ptr) {
  auto Err = getDevice(DeviceId).dataUnlock(Ptr);
  if (Err) {
    REPORT("Failure to unlock memory %p: %s\n", Ptr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
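
// Pinning sketch (hypothetical caller): lock a host buffer so asynchronous
// transfers can use it directly, then unlock with the original pointer:
//
//   void *Locked = nullptr;
//   if (Plugin.data_lock(DeviceId, HostBuf, Size, &Locked) != OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;
//   // ... transfers involving HostBuf may now avoid staging copies ...
//   if (Plugin.data_unlock(DeviceId, HostBuf) != OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;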

int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr,
                                            int64_t Size) {
  auto Err = getDevice(DeviceId).notifyDataMapped(HstPtr, Size);
  if (Err) {
    REPORT("Failure to notify data mapped %p: %s\n", HstPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr) {
  auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr);
  if (Err) {
    REPORT("Failure to notify data unmapped %p: %s\n", HstPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::data_submit(int32_t DeviceId, void *TgtPtr,
                                     void *HstPtr, int64_t Size) {
  return data_submit_async(DeviceId, TgtPtr, HstPtr, Size,
                           /*AsyncInfoPtr=*/nullptr);
}

int32_t GenericPluginTy::data_submit_async(int32_t DeviceId, void *TgtPtr,
                                           void *HstPtr, int64_t Size,
                                           __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to copy data from host to device. Pointers: host "
           "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
           DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::data_retrieve(int32_t DeviceId, void *HstPtr,
                                       void *TgtPtr, int64_t Size) {
  return data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size,
                             /*AsyncInfoPtr=*/nullptr);
}

int32_t GenericPluginTy::data_retrieve_async(int32_t DeviceId, void *HstPtr,
                                             void *TgtPtr, int64_t Size,
                                             __tgt_async_info *AsyncInfoPtr) {
  auto Err =
      getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr, Size, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to copy data from device to host. Pointers: host "
           "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
           DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::data_exchange(int32_t SrcDeviceId, void *SrcPtr,
                                       int32_t DstDeviceId, void *DstPtr,
                                       int64_t Size) {
  return data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr, Size,
                             /*AsyncInfoPtr=*/nullptr);
}

int32_t GenericPluginTy::data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
                                             int32_t DstDeviceId, void *DstPtr,
                                             int64_t Size,
                                             __tgt_async_info *AsyncInfo) {
  GenericDeviceTy &SrcDevice = getDevice(SrcDeviceId);
  GenericDeviceTy &DstDevice = getDevice(DstDeviceId);
  auto Err = SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo);
  if (Err) {
    REPORT("Failure to copy data from device (%d) to device (%d). Pointers: "
           "source = " DPxMOD ", destination = " DPxMOD ", size = %" PRId64
           ": %s\n",
           SrcDeviceId, DstDeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
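
// Device-to-device copy sketch (hypothetical caller); omitting the queue by
// calling the synchronous data_exchange() wrapper above blocks until done:
//
//   if (Plugin.data_exchange(SrcId, SrcDevPtr, DstId, DstDevPtr, Size) !=
//       OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL; // Details were already logged via REPORT.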

int32_t GenericPluginTy::launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
                                       void **TgtArgs, ptrdiff_t *TgtOffsets,
                                       KernelArgsTy *KernelArgs,
                                       __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets,
                                              *KernelArgs, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to run target region " DPxMOD " in device %d: %s\n",
           DPxPTR(TgtEntryPtr), DeviceId, toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::synchronize(int32_t DeviceId,
                                     __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).synchronize(AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to synchronize stream %p: %s\n", AsyncInfoPtr->Queue,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::query_async(int32_t DeviceId,
                                     __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).queryAsync(AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to query stream %p: %s\n", AsyncInfoPtr->Queue,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

void GenericPluginTy::print_device_info(int32_t DeviceId) {
  if (auto Err = getDevice(DeviceId).printInfo())
    REPORT("Failure to print device %d info: %s\n", DeviceId,
           toString(std::move(Err)).data());
}

int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) {
  auto Err = getDevice(DeviceId).createEvent(EventPtr);
  if (Err) {
    REPORT("Failure to create event: %s\n", toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::record_event(int32_t DeviceId, void *EventPtr,
                                      __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to record event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::wait_event(int32_t DeviceId, void *EventPtr,
                                    __tgt_async_info *AsyncInfoPtr) {
  auto Err = getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to wait event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::sync_event(int32_t DeviceId, void *EventPtr) {
  auto Err = getDevice(DeviceId).syncEvent(EventPtr);
  if (Err) {
    REPORT("Failure to synchronize event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::destroy_event(int32_t DeviceId, void *EventPtr) {
  auto Err = getDevice(DeviceId).destroyEvent(EventPtr);
  if (Err) {
    REPORT("Failure to destroy event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

void GenericPluginTy::set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

int32_t GenericPluginTy::init_async_info(int32_t DeviceId,
                                         __tgt_async_info **AsyncInfoPtr) {
  assert(AsyncInfoPtr && "Invalid async info");

  auto Err = getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to initialize async info at " DPxMOD " on device %d: %s\n",
           DPxPTR(*AsyncInfoPtr), DeviceId, toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::init_device_info(int32_t DeviceId,
                                          __tgt_device_info *DeviceInfo,
                                          const char **ErrStr) {
  *ErrStr = "";

  auto Err = getDevice(DeviceId).initDeviceInfo(DeviceInfo);
  if (Err) {
    REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n",
           DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::set_device_identifier(int32_t UserId,
                                               int32_t DeviceId) {
  UserDeviceIds[DeviceId] = UserId;

  return OFFLOAD_SUCCESS;
}

int32_t GenericPluginTy::use_auto_zero_copy(int32_t DeviceId) {
  return getDevice(DeviceId).useAutoZeroCopy();
}

int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size,
                                    const char *Name, void **DevicePtr) {
  assert(Binary.handle && "Invalid device binary handle");
  DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);

  GenericDeviceTy &Device = Image.getDevice();

  GlobalTy DeviceGlobal(Name, Size);
  GenericGlobalHandlerTy &GHandler = getGlobalHandler();
  if (auto Err =
          GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
    REPORT("Failure to look up global address: %s\n",
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  *DevicePtr = DeviceGlobal.getPtr();
  assert(*DevicePtr && "Invalid device global address");

  // Save the loaded globals if we are recording.
  RecordReplayTy &RecordReplay = Device.Plugin.getRecordReplay();
  if (RecordReplay.isRecording())
    RecordReplay.addEntry(Name, Size, *DevicePtr);

  return OFFLOAD_SUCCESS;
}
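
// Reading a device global through this hook, as a sketch (the "Counter"
// symbol name and its type are assumptions):
//
//   void *DevAddr = nullptr;
//   if (Plugin.get_global(Binary, sizeof(uint32_t), "Counter", &DevAddr) !=
//       OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;
//   // DevAddr is a device address; read the value back explicitly.
//   uint32_t Value = 0;
//   Plugin.data_retrieve(DeviceId, &Value, DevAddr, sizeof(Value));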

int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
                                      const char *Name, void **KernelPtr) {
  assert(Binary.handle && "Invalid device binary handle");
  DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);

  GenericDeviceTy &Device = Image.getDevice();

  auto KernelOrErr = Device.constructKernel(Name);
  if (Error Err = KernelOrErr.takeError()) {
    REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  GenericKernelTy &Kernel = *KernelOrErr;
  if (auto Err = Kernel.init(Device, Image)) {
    REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  // Note that this is not the kernel's device address.
  *KernelPtr = &Kernel;
  return OFFLOAD_SUCCESS;
}
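
// End-to-end sketch tying the hooks together (image load, kernel lookup,
// launch); the kernel name and the argument arrays are assumptions:
//
//   __tgt_device_binary Binary;
//   if (Plugin.load_binary(DeviceId, TgtImage, &Binary) != OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;
//   void *KernelPtr = nullptr;
//   if (Plugin.get_function(Binary, "my_kernel", &KernelPtr) !=
//       OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;
//   return Plugin.launch_kernel(DeviceId, KernelPtr, ArgPtrs, ArgOffsets,
//                               &KernelArgs, /*AsyncInfoPtr=*/nullptr);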