1 | //===- PluginInterface.cpp - Target independent plugin device interface ---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | //===----------------------------------------------------------------------===// |
10 | |
11 | #include "PluginInterface.h" |
12 | |
13 | #include "Shared/APITypes.h" |
14 | #include "Shared/Debug.h" |
15 | #include "Shared/Environment.h" |
16 | #include "Shared/PluginAPI.h" |
17 | |
18 | #include "GlobalHandler.h" |
19 | #include "JIT.h" |
20 | #include "Utils/ELF.h" |
21 | #include "omptarget.h" |
22 | |
23 | #ifdef OMPT_SUPPORT |
24 | #include "OpenMP/OMPT/Callback.h" |
25 | #include "omp-tools.h" |
26 | #endif |
27 | |
28 | #include "llvm/Frontend/OpenMP/OMPConstants.h" |
29 | #include "llvm/Support/Error.h" |
30 | #include "llvm/Support/JSON.h" |
31 | #include "llvm/Support/MathExtras.h" |
32 | #include "llvm/Support/MemoryBuffer.h" |
33 | |
34 | #include <cstdint> |
35 | #include <limits> |
36 | |
37 | using namespace llvm; |
38 | using namespace omp; |
39 | using namespace target; |
40 | using namespace plugin; |
41 | |
42 | GenericPluginTy *PluginTy::SpecificPlugin = nullptr; |
43 | |
44 | // TODO: Fix any thread safety issues for multi-threaded kernel recording. |
45 | struct RecordReplayTy { |
46 | |
47 | // Describes the state of the record replay mechanism. |
48 | enum RRStatusTy { RRDeactivated = 0, RRRecording, RRReplaying }; |
49 | |
50 | private: |
51 | // Memory pointers for recording, replaying memory. |
52 | void *MemoryStart = nullptr; |
53 | void *MemoryPtr = nullptr; |
54 | size_t MemorySize = 0; |
55 | size_t TotalSize = 0; |
56 | GenericDeviceTy *Device = nullptr; |
57 | std::mutex AllocationLock; |
58 | |
59 | RRStatusTy Status = RRDeactivated; |
60 | bool ReplaySaveOutput = false; |
61 | bool UsedVAMap = false; |
62 | uintptr_t MemoryOffset = 0; |
63 | |
64 | // A list of all globals mapped to the device. |
65 | struct GlobalEntry { |
66 | const char *Name; |
67 | uint64_t Size; |
68 | void *Addr; |
69 | }; |
70 | llvm::SmallVector<GlobalEntry> GlobalEntries{}; |
71 | |
72 | void *suggestAddress(uint64_t MaxMemoryAllocation) { |
73 | // Get a valid pointer address for this system |
74 | void *Addr = |
75 | Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); |
76 | Device->free(Addr); |
77 | // Align Address to MaxMemoryAllocation |
78 | Addr = (void *)alignPtr((Addr), MaxMemoryAllocation); |
79 | return Addr; |
80 | } |
81 | |
82 | Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) { |
83 | size_t ASize = MaxMemoryAllocation; |
84 | |
85 | if (!VAddr && isRecording()) |
86 | VAddr = suggestAddress(MaxMemoryAllocation); |
87 | |
88 | DP("Request %ld bytes allocated at %p\n" , MaxMemoryAllocation, VAddr); |
89 | |
90 | if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize)) |
91 | return Err; |
92 | |
93 | if (isReplaying() && VAddr != MemoryStart) { |
94 | return Plugin::error("Record-Replay cannot assign the" |
95 | "requested recorded address (%p, %p)" , |
96 | VAddr, MemoryStart); |
97 | } |
98 | |
99 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), |
100 | "Allocated %" PRIu64 " bytes at %p for replay.\n" , ASize, MemoryStart); |
101 | |
102 | MemoryPtr = MemoryStart; |
103 | MemorySize = 0; |
104 | TotalSize = ASize; |
105 | UsedVAMap = true; |
106 | return Plugin::success(); |
107 | } |
108 | |
109 | Error preAllocateHeuristic(uint64_t MaxMemoryAllocation, |
110 | uint64_t RequiredMemoryAllocation, void *VAddr) { |
111 | const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation; |
112 | constexpr size_t STEP = 1024 * 1024 * 1024ULL; |
113 | MemoryStart = nullptr; |
114 | for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) { |
115 | MemoryStart = |
116 | Device->allocate(TotalSize, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); |
117 | if (MemoryStart) |
118 | break; |
119 | } |
120 | if (!MemoryStart) |
121 | return Plugin::error("Allocating record/replay memory" ); |
122 | |
123 | if (VAddr && VAddr != MemoryStart) |
124 | MemoryOffset = uintptr_t(VAddr) - uintptr_t(MemoryStart); |
125 | |
126 | MemoryPtr = MemoryStart; |
127 | MemorySize = 0; |
128 | |
129 | // Check if we need adjustment. |
130 | if (MemoryOffset > 0 && |
131 | TotalSize >= RequiredMemoryAllocation + MemoryOffset) { |
132 | // If we are off but "before" the required address and with enough space, |
133 | // we just "allocate" the offset to match the required address. |
134 | MemoryPtr = (char *)MemoryPtr + MemoryOffset; |
135 | MemorySize += MemoryOffset; |
136 | MemoryOffset = 0; |
137 | assert(MemoryPtr == VAddr && "Expected offset adjustment to work" ); |
138 | } else if (MemoryOffset) { |
139 | // If we are off and in a situation we cannot just "waste" memory to force |
140 | // a match, we hope adjusting the arguments is sufficient. |
141 | REPORT( |
142 | "WARNING Failed to allocate replay memory at required location %p, " |
143 | "got %p, trying to offset argument pointers by %" PRIi64 "\n" , |
144 | VAddr, MemoryStart, MemoryOffset); |
145 | } |
146 | |
147 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), |
148 | "Allocated %" PRIu64 " bytes at %p for replay.\n" , TotalSize, |
149 | MemoryStart); |
150 | |
151 | return Plugin::success(); |
152 | } |
153 | |
154 | Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) { |
155 | if (Device->supportVAManagement()) { |
156 | auto Err = preAllocateVAMemory(MaxMemoryAllocation: DeviceMemorySize, VAddr: ReqVAddr); |
157 | if (Err) { |
158 | REPORT("WARNING VA mapping failed, fallback to heuristic: " |
159 | "(Error: %s)\n" , |
160 | toString(E: std::move(Err)).data()); |
161 | } |
162 | } |
163 | |
164 | uint64_t DevMemSize; |
165 | if (Device->getDeviceMemorySize(DevMemSize)) |
166 | return Plugin::error("Cannot determine Device Memory Size" ); |
167 | |
168 | return preAllocateHeuristic(MaxMemoryAllocation: DevMemSize, RequiredMemoryAllocation: DeviceMemorySize, VAddr: ReqVAddr); |
169 | } |
170 | |
171 | void dumpDeviceMemory(StringRef Filename) { |
172 | ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB = |
173 | WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize); |
174 | if (!DeviceMemoryMB) |
175 | report_fatal_error(reason: "Error creating MemoryBuffer for device memory" ); |
176 | |
177 | auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(), |
178 | MemoryStart, MemorySize, nullptr); |
179 | if (Err) |
180 | report_fatal_error(reason: "Error retrieving data for target pointer" ); |
181 | |
182 | StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize); |
183 | std::error_code EC; |
184 | raw_fd_ostream OS(Filename, EC); |
185 | if (EC) |
186 | report_fatal_error(reason: "Error dumping memory to file " + Filename + " :" + |
187 | EC.message()); |
188 | OS << DeviceMemory; |
189 | OS.close(); |
190 | } |
191 | |
192 | public: |
193 | bool isRecording() const { return Status == RRStatusTy::RRRecording; } |
194 | bool isReplaying() const { return Status == RRStatusTy::RRReplaying; } |
195 | bool isRecordingOrReplaying() const { |
196 | return (Status != RRStatusTy::RRDeactivated); |
197 | } |
198 | void setStatus(RRStatusTy Status) { this->Status = Status; } |
199 | bool isSaveOutputEnabled() const { return ReplaySaveOutput; } |
200 | void addEntry(const char *Name, uint64_t Size, void *Addr) { |
201 | GlobalEntries.emplace_back(GlobalEntry{Name, Size, Addr}); |
202 | } |
203 | |
204 | void saveImage(const char *Name, const DeviceImageTy &Image) { |
205 | SmallString<128> ImageName = {Name, ".image" }; |
206 | std::error_code EC; |
207 | raw_fd_ostream OS(ImageName, EC); |
208 | if (EC) |
209 | report_fatal_error(reason: "Error saving image : " + StringRef(EC.message())); |
210 | if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) { |
211 | size_t Size = |
212 | getPtrDiff(TgtImageBitcode->ImageEnd, TgtImageBitcode->ImageStart); |
213 | MemoryBufferRef MBR = MemoryBufferRef( |
214 | StringRef((const char *)TgtImageBitcode->ImageStart, Size), "" ); |
215 | OS << MBR.getBuffer(); |
216 | } else { |
217 | OS << Image.getMemoryBuffer().getBuffer(); |
218 | } |
219 | OS.close(); |
220 | } |
221 | |
222 | void dumpGlobals(StringRef Filename, DeviceImageTy &Image) { |
223 | int32_t Size = 0; |
224 | |
225 | for (auto &OffloadEntry : GlobalEntries) { |
226 | if (!OffloadEntry.Size) |
227 | continue; |
228 | // Get the total size of the string and entry including the null byte. |
229 | Size += std::strlen(OffloadEntry.Name) + 1 + sizeof(uint32_t) + |
230 | OffloadEntry.Size; |
231 | } |
232 | |
233 | ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB = |
234 | WritableMemoryBuffer::getNewUninitMemBuffer(Size); |
235 | if (!GlobalsMB) |
236 | report_fatal_error(reason: "Error creating MemoryBuffer for globals memory" ); |
237 | |
238 | void *BufferPtr = GlobalsMB.get()->getBufferStart(); |
239 | for (auto &OffloadEntry : GlobalEntries) { |
240 | if (!OffloadEntry.Size) |
241 | continue; |
242 | |
243 | int32_t NameLength = std::strlen(OffloadEntry.Name) + 1; |
244 | memcpy(BufferPtr, OffloadEntry.Name, NameLength); |
245 | BufferPtr = advanceVoidPtr(BufferPtr, NameLength); |
246 | |
247 | *((uint32_t *)(BufferPtr)) = OffloadEntry.Size; |
248 | BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t)); |
249 | |
250 | auto Err = Plugin::success(); |
251 | { |
252 | if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.Addr, |
253 | OffloadEntry.Size, nullptr)) |
254 | report_fatal_error("Error retrieving data for global" ); |
255 | } |
256 | if (Err) |
257 | report_fatal_error("Error retrieving data for global" ); |
258 | BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size); |
259 | } |
260 | assert(BufferPtr == GlobalsMB->get()->getBufferEnd() && |
261 | "Buffer over/under-filled." ); |
262 | assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) && |
263 | "Buffer size mismatch" ); |
264 | |
265 | StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size); |
266 | std::error_code EC; |
267 | raw_fd_ostream OS(Filename, EC); |
268 | OS << GlobalsMemory; |
269 | OS.close(); |
270 | } |
271 | |
272 | void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs, |
273 | uint64_t NumTeamsClause, uint32_t ThreadLimitClause, |
274 | uint64_t LoopTripCount) { |
275 | json::Object JsonKernelInfo; |
276 | JsonKernelInfo["Name" ] = Name; |
277 | JsonKernelInfo["NumArgs" ] = NumArgs; |
278 | JsonKernelInfo["NumTeamsClause" ] = NumTeamsClause; |
279 | JsonKernelInfo["ThreadLimitClause" ] = ThreadLimitClause; |
280 | JsonKernelInfo["LoopTripCount" ] = LoopTripCount; |
281 | JsonKernelInfo["DeviceMemorySize" ] = MemorySize; |
282 | JsonKernelInfo["DeviceId" ] = Device->getDeviceId(); |
283 | JsonKernelInfo["BumpAllocVAStart" ] = (intptr_t)MemoryStart; |
284 | |
285 | json::Array JsonArgPtrs; |
286 | for (int I = 0; I < NumArgs; ++I) |
287 | JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]); |
288 | JsonKernelInfo["ArgPtrs" ] = json::Value(std::move(JsonArgPtrs)); |
289 | |
290 | json::Array JsonArgOffsets; |
291 | for (int I = 0; I < NumArgs; ++I) |
292 | JsonArgOffsets.push_back(0); |
293 | JsonKernelInfo["ArgOffsets" ] = json::Value(std::move(JsonArgOffsets)); |
294 | |
295 | SmallString<128> JsonFilename = {Name, ".json" }; |
296 | std::error_code EC; |
297 | raw_fd_ostream JsonOS(JsonFilename.str(), EC); |
298 | if (EC) |
299 | report_fatal_error(reason: "Error saving kernel json file : " + |
300 | StringRef(EC.message())); |
301 | JsonOS << json::Value(std::move(JsonKernelInfo)); |
302 | JsonOS.close(); |
303 | } |
304 | |
305 | void saveKernelInput(const char *Name, DeviceImageTy &Image) { |
306 | SmallString<128> GlobalsFilename = {Name, ".globals" }; |
307 | dumpGlobals(GlobalsFilename, Image); |
308 | |
309 | SmallString<128> MemoryFilename = {Name, ".memory" }; |
310 | dumpDeviceMemory(Filename: MemoryFilename); |
311 | } |
312 | |
313 | void saveKernelOutputInfo(const char *Name) { |
314 | SmallString<128> OutputFilename = { |
315 | Name, (isRecording() ? ".original.output" : ".replay.output" )}; |
316 | dumpDeviceMemory(Filename: OutputFilename); |
317 | } |
318 | |
319 | void *alloc(uint64_t Size) { |
320 | assert(MemoryStart && "Expected memory has been pre-allocated" ); |
321 | void *Alloc = nullptr; |
322 | constexpr int Alignment = 16; |
323 | // Assumes alignment is a power of 2. |
324 | int64_t AlignedSize = (Size + (Alignment - 1)) & (~(Alignment - 1)); |
325 | std::lock_guard<std::mutex> LG(AllocationLock); |
326 | Alloc = MemoryPtr; |
327 | MemoryPtr = (char *)MemoryPtr + AlignedSize; |
328 | MemorySize += AlignedSize; |
329 | DP("Memory Allocator return " DPxMOD "\n" , DPxPTR(Alloc)); |
330 | return Alloc; |
331 | } |
332 | |
333 | Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr, |
334 | RRStatusTy Status, bool SaveOutput, uint64_t &ReqPtrArgOffset) { |
335 | this->Device = Device; |
336 | this->Status = Status; |
337 | this->ReplaySaveOutput = SaveOutput; |
338 | |
339 | if (auto Err = preallocateDeviceMemory(DeviceMemorySize: MemSize, ReqVAddr: VAddr)) |
340 | return Err; |
341 | |
342 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), |
343 | "Record Replay Initialized (%p)" |
344 | " as starting address, %lu Memory Size" |
345 | " and set on status %s\n" , |
346 | MemoryStart, TotalSize, |
347 | Status == RRStatusTy::RRRecording ? "Recording" : "Replaying" ); |
348 | |
349 | // Tell the user to offset pointer arguments as the memory allocation does |
350 | // not match. |
351 | ReqPtrArgOffset = MemoryOffset; |
352 | return Plugin::success(); |
353 | } |
354 | |
355 | void deinit() { |
356 | if (UsedVAMap) { |
357 | if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize)) |
358 | report_fatal_error(reason: "Error on releasing virtual memory space" ); |
359 | } else { |
360 | Device->free(MemoryStart); |
361 | } |
362 | } |
363 | }; |
364 | |
365 | static RecordReplayTy RecordReplay; |
366 | |
367 | // Extract the mapping of host function pointers to device function pointers |
368 | // from the entry table. Functions marked as 'indirect' in OpenMP will have |
369 | // offloading entries generated for them which map the host's function pointer |
370 | // to a global containing the corresponding function pointer on the device. |
371 | static Expected<std::pair<void *, uint64_t>> |
372 | setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device, |
373 | DeviceImageTy &Image) { |
374 | GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); |
375 | |
376 | llvm::ArrayRef<__tgt_offload_entry> Entries(Image.getTgtImage()->EntriesBegin, |
377 | Image.getTgtImage()->EntriesEnd); |
378 | llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable; |
379 | for (const auto &Entry : Entries) { |
380 | if (Entry.size == 0 || !(Entry.flags & OMP_DECLARE_TARGET_INDIRECT)) |
381 | continue; |
382 | |
383 | assert(Entry.size == sizeof(void *) && "Global not a function pointer?" ); |
384 | auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back(); |
385 | |
386 | GlobalTy DeviceGlobal(Entry.name, Entry.size); |
387 | if (auto Err = |
388 | Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) |
389 | return std::move(Err); |
390 | |
391 | HstPtr = Entry.addr; |
392 | if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(), |
393 | Entry.size, nullptr)) |
394 | return std::move(Err); |
395 | } |
396 | |
397 | // If we do not have any indirect globals we exit early. |
398 | if (IndirectCallTable.empty()) |
399 | return std::pair{nullptr, 0}; |
400 | |
401 | // Sort the array to allow for more efficient lookup of device pointers. |
402 | llvm::sort(IndirectCallTable, |
403 | [](const auto &x, const auto &y) { return x.first < y.first; }); |
404 | |
405 | uint64_t TableSize = |
406 | IndirectCallTable.size() * sizeof(std::pair<void *, void *>); |
407 | void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE); |
408 | if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(), |
409 | TableSize, nullptr)) |
410 | return std::move(Err); |
411 | return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size()); |
412 | } |
413 | |
414 | AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device, |
415 | __tgt_async_info *AsyncInfoPtr) |
416 | : Device(Device), |
417 | AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {} |
418 | |
419 | void AsyncInfoWrapperTy::finalize(Error &Err) { |
420 | assert(AsyncInfoPtr && "AsyncInfoWrapperTy already finalized" ); |
421 | |
422 | // If we used a local async info object we want synchronous behavior. In that |
423 | // case, and assuming the current status code is correct, we will synchronize |
424 | // explicitly when the object is deleted. Update the error with the result of |
425 | // the synchronize operation. |
426 | if (AsyncInfoPtr == &LocalAsyncInfo && LocalAsyncInfo.Queue && !Err) |
427 | Err = Device.synchronize(&LocalAsyncInfo); |
428 | |
429 | // Invalidate the wrapper object. |
430 | AsyncInfoPtr = nullptr; |
431 | } |
432 | |
433 | Error GenericKernelTy::init(GenericDeviceTy &GenericDevice, |
434 | DeviceImageTy &Image) { |
435 | |
436 | ImagePtr = &Image; |
437 | |
438 | // Retrieve kernel environment object for the kernel. |
439 | GlobalTy KernelEnv(std::string(Name) + "_kernel_environment" , |
440 | sizeof(KernelEnvironment), &KernelEnvironment); |
441 | GenericGlobalHandlerTy &GHandler = GenericDevice.Plugin.getGlobalHandler(); |
442 | if (auto Err = |
443 | GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) { |
444 | [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); |
445 | DP("Failed to read kernel environment for '%s': %s\n" |
446 | "Using default SPMD (2) execution mode\n" , |
447 | Name, ErrStr.data()); |
448 | assert(KernelEnvironment.Configuration.ReductionDataSize == 0 && |
449 | "Default initialization failed." ); |
450 | IsBareKernel = true; |
451 | } |
452 | |
453 | // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max; |
454 | MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0 |
455 | ? std::min(KernelEnvironment.Configuration.MaxThreads, |
456 | int32_t(GenericDevice.getThreadLimit())) |
457 | : GenericDevice.getThreadLimit(); |
458 | |
459 | // Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref; |
460 | PreferredNumThreads = |
461 | KernelEnvironment.Configuration.MinThreads > 0 |
462 | ? std::max(KernelEnvironment.Configuration.MinThreads, |
463 | int32_t(GenericDevice.getDefaultNumThreads())) |
464 | : GenericDevice.getDefaultNumThreads(); |
465 | |
466 | return initImpl(GenericDevice, Image); |
467 | } |
468 | |
469 | Expected<KernelLaunchEnvironmentTy *> |
470 | GenericKernelTy::getKernelLaunchEnvironment( |
471 | GenericDeviceTy &GenericDevice, uint32_t Version, |
472 | AsyncInfoWrapperTy &AsyncInfoWrapper) const { |
473 | // Ctor/Dtor have no arguments, replaying uses the original kernel launch |
474 | // environment. Older versions of the compiler do not generate a kernel |
475 | // launch environment. |
476 | if (isCtorOrDtor() || RecordReplay.isReplaying() || |
477 | Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) |
478 | return nullptr; |
479 | |
480 | if (!KernelEnvironment.Configuration.ReductionDataSize || |
481 | !KernelEnvironment.Configuration.ReductionBufferLength) |
482 | return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0); |
483 | |
484 | // TODO: Check if the kernel needs a launch environment. |
485 | auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy), |
486 | /*HostPtr=*/nullptr, |
487 | TargetAllocTy::TARGET_ALLOC_DEVICE); |
488 | if (!AllocOrErr) |
489 | return AllocOrErr.takeError(); |
490 | |
491 | // Remember to free the memory later. |
492 | AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr); |
493 | |
494 | /// Use the KLE in the __tgt_async_info to ensure a stable address for the |
495 | /// async data transfer. |
496 | auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment; |
497 | LocalKLE = KernelLaunchEnvironment; |
498 | { |
499 | auto AllocOrErr = GenericDevice.dataAlloc( |
500 | KernelEnvironment.Configuration.ReductionDataSize * |
501 | KernelEnvironment.Configuration.ReductionBufferLength, |
502 | /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE); |
503 | if (!AllocOrErr) |
504 | return AllocOrErr.takeError(); |
505 | LocalKLE.ReductionBuffer = *AllocOrErr; |
506 | // Remember to free the memory later. |
507 | AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr); |
508 | } |
509 | |
510 | INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(), |
511 | "Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD |
512 | ", Size=%" PRId64 ", Name=KernelLaunchEnv\n" , |
513 | DPxPTR(&LocalKLE), DPxPTR(*AllocOrErr), |
514 | sizeof(KernelLaunchEnvironmentTy)); |
515 | |
516 | auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE, |
517 | sizeof(KernelLaunchEnvironmentTy), |
518 | AsyncInfoWrapper); |
519 | if (Err) |
520 | return Err; |
521 | return static_cast<KernelLaunchEnvironmentTy *>(*AllocOrErr); |
522 | } |
523 | |
524 | Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice, |
525 | KernelArgsTy &KernelArgs, |
526 | uint32_t NumThreads, |
527 | uint64_t NumBlocks) const { |
528 | INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), |
529 | "Launching kernel %s with %" PRIu64 |
530 | " blocks and %d threads in %s mode\n" , |
531 | getName(), NumBlocks, NumThreads, getExecutionModeName()); |
532 | return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads, |
533 | NumBlocks); |
534 | } |
535 | |
536 | Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, |
537 | KernelArgsTy &KernelArgs, |
538 | uint32_t NumThreads, |
539 | uint64_t NumBlocks) const { |
540 | return Plugin::success(); |
541 | } |
542 | |
543 | Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, |
544 | ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, |
545 | AsyncInfoWrapperTy &AsyncInfoWrapper) const { |
546 | llvm::SmallVector<void *, 16> Args; |
547 | llvm::SmallVector<void *, 16> Ptrs; |
548 | |
549 | auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment( |
550 | GenericDevice, KernelArgs.Version, AsyncInfoWrapper); |
551 | if (!KernelLaunchEnvOrErr) |
552 | return KernelLaunchEnvOrErr.takeError(); |
553 | |
554 | void *KernelArgsPtr = |
555 | prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args, |
556 | Ptrs, *KernelLaunchEnvOrErr); |
557 | |
558 | uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit); |
559 | uint64_t NumBlocks = |
560 | getNumBlocks(GenericDevice, KernelArgs.NumTeams, KernelArgs.Tripcount, |
561 | NumThreads, KernelArgs.ThreadLimit[0] > 0); |
562 | |
563 | // Record the kernel description after we modified the argument count and num |
564 | // blocks/threads. |
565 | if (RecordReplay.isRecording()) { |
566 | RecordReplay.saveImage(getName(), getImage()); |
567 | RecordReplay.saveKernelInput(getName(), getImage()); |
568 | RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs, |
569 | NumBlocks, NumThreads, KernelArgs.Tripcount); |
570 | } |
571 | |
572 | if (auto Err = |
573 | printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks)) |
574 | return Err; |
575 | |
576 | return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs, |
577 | KernelArgsPtr, AsyncInfoWrapper); |
578 | } |
579 | |
580 | void *GenericKernelTy::prepareArgs( |
581 | GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets, |
582 | uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args, |
583 | llvm::SmallVectorImpl<void *> &Ptrs, |
584 | KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const { |
585 | if (isCtorOrDtor()) |
586 | return nullptr; |
587 | |
588 | uint32_t KLEOffset = !!KernelLaunchEnvironment; |
589 | NumArgs += KLEOffset; |
590 | |
591 | if (NumArgs == 0) |
592 | return nullptr; |
593 | |
594 | Args.resize(NumArgs); |
595 | Ptrs.resize(NumArgs); |
596 | |
597 | if (KernelLaunchEnvironment) { |
598 | Ptrs[0] = KernelLaunchEnvironment; |
599 | Args[0] = &Ptrs[0]; |
600 | } |
601 | |
602 | for (int I = KLEOffset; I < NumArgs; ++I) { |
603 | Ptrs[I] = |
604 | (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]); |
605 | Args[I] = &Ptrs[I]; |
606 | } |
607 | return &Args[0]; |
608 | } |
609 | |
610 | uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice, |
611 | uint32_t ThreadLimitClause[3]) const { |
612 | assert(ThreadLimitClause[1] == 0 && ThreadLimitClause[2] == 0 && |
613 | "Multi dimensional launch not supported yet." ); |
614 | |
615 | if (IsBareKernel && ThreadLimitClause[0] > 0) |
616 | return ThreadLimitClause[0]; |
617 | |
618 | if (ThreadLimitClause[0] > 0 && isGenericMode()) |
619 | ThreadLimitClause[0] += GenericDevice.getWarpSize(); |
620 | |
621 | return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0) |
622 | ? ThreadLimitClause[0] |
623 | : PreferredNumThreads); |
624 | } |
625 | |
626 | uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice, |
627 | uint32_t NumTeamsClause[3], |
628 | uint64_t LoopTripCount, |
629 | uint32_t &NumThreads, |
630 | bool IsNumThreadsFromUser) const { |
631 | assert(NumTeamsClause[1] == 0 && NumTeamsClause[2] == 0 && |
632 | "Multi dimensional launch not supported yet." ); |
633 | |
634 | if (IsBareKernel && NumTeamsClause[0] > 0) |
635 | return NumTeamsClause[0]; |
636 | |
637 | if (NumTeamsClause[0] > 0) { |
638 | // TODO: We need to honor any value and consequently allow more than the |
639 | // block limit. For this we might need to start multiple kernels or let the |
640 | // blocks start again until the requested number has been started. |
641 | return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit()); |
642 | } |
643 | |
644 | uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks(); |
645 | uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max(); |
646 | if (LoopTripCount > 0) { |
647 | if (isSPMDMode()) { |
648 | // We have a combined construct, i.e. `target teams distribute |
649 | // parallel for [simd]`. We launch so many teams so that each thread |
650 | // will execute one iteration of the loop; rounded up to the nearest |
651 | // integer. However, if that results in too few teams, we artificially |
652 | // reduce the thread count per team to increase the outer parallelism. |
653 | auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop(); |
654 | MinThreads = std::min(MinThreads, NumThreads); |
655 | |
656 | // Honor the thread_limit clause; only lower the number of threads. |
657 | [[maybe_unused]] auto OldNumThreads = NumThreads; |
658 | if (LoopTripCount >= DefaultNumBlocks * NumThreads || |
659 | IsNumThreadsFromUser) { |
660 | // Enough parallelism for teams and threads. |
661 | TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1; |
662 | assert(IsNumThreadsFromUser || |
663 | TripCountNumBlocks >= DefaultNumBlocks && |
664 | "Expected sufficient outer parallelism." ); |
665 | } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) { |
666 | // Enough parallelism for teams, limit threads. |
667 | |
668 | // This case is hard; for now, we force "full warps": |
669 | // First, compute a thread count assuming DefaultNumBlocks. |
670 | auto NumThreadsDefaultBlocks = |
671 | (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks; |
672 | // Now get a power of two that is larger or equal. |
673 | auto NumThreadsDefaultBlocksP2 = |
674 | llvm::PowerOf2Ceil(NumThreadsDefaultBlocks); |
675 | // Do not increase a thread limit given be the user. |
676 | NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2)); |
677 | assert(NumThreads >= MinThreads && |
678 | "Expected sufficient inner parallelism." ); |
679 | TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1; |
680 | } else { |
681 | // Not enough parallelism for teams and threads, limit both. |
682 | NumThreads = std::min(NumThreads, MinThreads); |
683 | TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1; |
684 | } |
685 | |
686 | assert(NumThreads * TripCountNumBlocks >= LoopTripCount && |
687 | "Expected sufficient parallelism" ); |
688 | assert(OldNumThreads >= NumThreads && |
689 | "Number of threads cannot be increased!" ); |
690 | } else { |
691 | assert((isGenericMode() || isGenericSPMDMode()) && |
692 | "Unexpected execution mode!" ); |
693 | // If we reach this point, then we have a non-combined construct, i.e. |
694 | // `teams distribute` with a nested `parallel for` and each team is |
695 | // assigned one iteration of the `distribute` loop. E.g.: |
696 | // |
697 | // #pragma omp target teams distribute |
698 | // for(...loop_tripcount...) { |
699 | // #pragma omp parallel for |
700 | // for(...) {} |
701 | // } |
702 | // |
703 | // Threads within a team will execute the iterations of the `parallel` |
704 | // loop. |
705 | TripCountNumBlocks = LoopTripCount; |
706 | } |
707 | } |
708 | // If the loops are long running we rather reuse blocks than spawn too many. |
709 | uint32_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks); |
710 | return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit()); |
711 | } |
712 | |
713 | GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, |
714 | int32_t NumDevices, |
715 | const llvm::omp::GV &OMPGridValues) |
716 | : Plugin(Plugin), MemoryManager(nullptr), OMP_TeamLimit("OMP_TEAM_LIMIT" ), |
717 | OMP_NumTeams("OMP_NUM_TEAMS" ), |
718 | OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT" ), |
719 | OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG" ), |
720 | OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE" ), |
721 | // Do not initialize the following two envars since they depend on the |
722 | // device initialization. These cannot be consulted until the device is |
723 | // initialized correctly. We intialize them in GenericDeviceTy::init(). |
724 | OMPX_TargetStackSize(), OMPX_TargetHeapSize(), |
725 | // By default, the initial number of streams and events is 1. |
726 | OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS" , 1), |
727 | OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS" , 1), |
728 | DeviceId(DeviceId), GridValues(OMPGridValues), |
729 | PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(), |
730 | PinnedAllocs(*this), RPCServer(nullptr) { |
731 | #ifdef OMPT_SUPPORT |
732 | OmptInitialized.store(false); |
733 | // Bind the callbacks to this device's member functions |
734 | #define bindOmptCallback(Name, Type, Code) \ |
735 | if (ompt::Initialized && ompt::lookupCallbackByCode) { \ |
736 | ompt::lookupCallbackByCode((ompt_callbacks_t)(Code), \ |
737 | ((ompt_callback_t *)&(Name##_fn))); \ |
738 | DP("OMPT: class bound %s=%p\n", #Name, ((void *)(uint64_t)Name##_fn)); \ |
739 | } |
740 | |
741 | FOREACH_OMPT_DEVICE_EVENT(bindOmptCallback); |
742 | #undef bindOmptCallback |
743 | |
744 | #endif |
745 | } |
746 | |
747 | Error GenericDeviceTy::init(GenericPluginTy &Plugin) { |
748 | if (auto Err = initImpl(Plugin)) |
749 | return Err; |
750 | |
751 | #ifdef OMPT_SUPPORT |
752 | if (ompt::Initialized) { |
753 | bool ExpectedStatus = false; |
754 | if (OmptInitialized.compare_exchange_strong(ExpectedStatus, true)) |
755 | performOmptCallback(device_initialize, /*device_num=*/DeviceId + |
756 | Plugin.getDeviceIdStartIndex(), |
757 | /*type=*/getComputeUnitKind().c_str(), |
758 | /*device=*/reinterpret_cast<ompt_device_t *>(this), |
759 | /*lookup=*/ompt::lookupCallbackByName, |
760 | /*documentation=*/nullptr); |
761 | } |
762 | #endif |
763 | |
764 | // Read and reinitialize the envars that depend on the device initialization. |
765 | // Notice these two envars may change the stack size and heap size of the |
766 | // device, so they need the device properly initialized. |
767 | auto StackSizeEnvarOrErr = UInt64Envar::create( |
768 | "LIBOMPTARGET_STACK_SIZE" , |
769 | [this](uint64_t &V) -> Error { return getDeviceStackSize(V); }, |
770 | [this](uint64_t V) -> Error { return setDeviceStackSize(V); }); |
771 | if (!StackSizeEnvarOrErr) |
772 | return StackSizeEnvarOrErr.takeError(); |
773 | OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr); |
774 | |
775 | auto HeapSizeEnvarOrErr = UInt64Envar::create( |
776 | "LIBOMPTARGET_HEAP_SIZE" , |
777 | [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); }, |
778 | [this](uint64_t V) -> Error { return setDeviceHeapSize(V); }); |
779 | if (!HeapSizeEnvarOrErr) |
780 | return HeapSizeEnvarOrErr.takeError(); |
781 | OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr); |
782 | |
783 | // Update the maximum number of teams and threads after the device |
784 | // initialization sets the corresponding hardware limit. |
785 | if (OMP_NumTeams > 0) |
786 | GridValues.GV_Max_Teams = |
787 | std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams)); |
788 | |
789 | if (OMP_TeamsThreadLimit > 0) |
790 | GridValues.GV_Max_WG_Size = |
791 | std::min(GridValues.GV_Max_WG_Size, uint32_t(OMP_TeamsThreadLimit)); |
792 | |
793 | // Enable the memory manager if required. |
794 | auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv(); |
795 | if (EnableMM) |
796 | MemoryManager = new MemoryManagerTy(*this, ThresholdMM); |
797 | |
798 | return Plugin::success(); |
799 | } |
800 | |
801 | Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { |
802 | for (DeviceImageTy *Image : LoadedImages) |
803 | if (auto Err = callGlobalDestructors(Plugin, *Image)) |
804 | return Err; |
805 | |
806 | if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) { |
807 | GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); |
808 | for (auto *Image : LoadedImages) { |
809 | DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0}; |
810 | GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker" , |
811 | sizeof(DeviceMemoryPoolTrackingTy), |
812 | &ImageDeviceMemoryPoolTracking); |
813 | if (auto Err = |
814 | GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) { |
815 | consumeError(std::move(Err)); |
816 | continue; |
817 | } |
818 | DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking); |
819 | } |
820 | |
821 | // TODO: Write this by default into a file. |
822 | printf("\n\n|-----------------------\n" |
823 | "| Device memory tracker:\n" |
824 | "|-----------------------\n" |
825 | "| #Allocations: %lu\n" |
826 | "| Byes allocated: %lu\n" |
827 | "| Minimal allocation: %lu\n" |
828 | "| Maximal allocation: %lu\n" |
829 | "|-----------------------\n\n\n" , |
830 | DeviceMemoryPoolTracking.NumAllocations, |
831 | DeviceMemoryPoolTracking.AllocationTotal, |
832 | DeviceMemoryPoolTracking.AllocationMin, |
833 | DeviceMemoryPoolTracking.AllocationMax); |
834 | } |
835 | |
836 | // Delete the memory manager before deinitializing the device. Otherwise, |
837 | // we may delete device allocations after the device is deinitialized. |
838 | if (MemoryManager) |
839 | delete MemoryManager; |
840 | MemoryManager = nullptr; |
841 | |
842 | if (RecordReplay.isRecordingOrReplaying()) |
843 | RecordReplay.deinit(); |
844 | |
845 | if (RPCServer) |
846 | if (auto Err = RPCServer->deinitDevice(*this)) |
847 | return Err; |
848 | |
849 | #ifdef OMPT_SUPPORT |
850 | if (ompt::Initialized) { |
851 | bool ExpectedStatus = true; |
852 | if (OmptInitialized.compare_exchange_strong(ExpectedStatus, false)) |
853 | performOmptCallback(device_finalize, |
854 | /*device_num=*/DeviceId + |
855 | Plugin.getDeviceIdStartIndex()); |
856 | } |
857 | #endif |
858 | |
859 | return deinitImpl(); |
860 | } |
861 | Expected<DeviceImageTy *> |
862 | GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, |
863 | const __tgt_device_image *InputTgtImage) { |
864 | assert(InputTgtImage && "Expected non-null target image" ); |
865 | DP("Load data from image " DPxMOD "\n" , DPxPTR(InputTgtImage->ImageStart)); |
866 | |
867 | auto PostJITImageOrErr = Plugin.getJIT().process(*InputTgtImage, *this); |
868 | if (!PostJITImageOrErr) { |
869 | auto Err = PostJITImageOrErr.takeError(); |
870 | REPORT("Failure to jit IR image %p on device %d: %s\n" , InputTgtImage, |
871 | DeviceId, toString(std::move(Err)).data()); |
872 | return nullptr; |
873 | } |
874 | |
875 | // Load the binary and allocate the image object. Use the next available id |
876 | // for the image id, which is the number of previously loaded images. |
877 | auto ImageOrErr = |
878 | loadBinaryImpl(PostJITImageOrErr.get(), LoadedImages.size()); |
879 | if (!ImageOrErr) |
880 | return ImageOrErr.takeError(); |
881 | |
882 | DeviceImageTy *Image = *ImageOrErr; |
883 | assert(Image != nullptr && "Invalid image" ); |
884 | if (InputTgtImage != PostJITImageOrErr.get()) |
885 | Image->setTgtImageBitcode(InputTgtImage); |
886 | |
887 | // Add the image to list. |
888 | LoadedImages.push_back(Image); |
889 | |
890 | // Setup the device environment if needed. |
891 | if (auto Err = setupDeviceEnvironment(Plugin, *Image)) |
892 | return std::move(Err); |
893 | |
894 | // Setup the global device memory pool if needed. |
895 | if (!RecordReplay.isReplaying() && shouldSetupDeviceMemoryPool()) { |
896 | uint64_t HeapSize; |
897 | auto SizeOrErr = getDeviceHeapSize(HeapSize); |
898 | if (SizeOrErr) { |
899 | REPORT("No global device memory pool due to error: %s\n" , |
900 | toString(std::move(SizeOrErr)).data()); |
901 | } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize)) |
902 | return std::move(Err); |
903 | } |
904 | |
905 | if (auto Err = setupRPCServer(Plugin, *Image)) |
906 | return std::move(Err); |
907 | |
908 | #ifdef OMPT_SUPPORT |
909 | if (ompt::Initialized) { |
910 | size_t Bytes = |
911 | getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); |
912 | performOmptCallback( |
913 | device_load, /*device_num=*/DeviceId + Plugin.getDeviceIdStartIndex(), |
914 | /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr, |
915 | /*ImgSize=*/Bytes, /*HostAddr=*/InputTgtImage->ImageStart, |
916 | /*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0); |
917 | } |
918 | #endif |
919 | |
920 | // Call any global constructors present on the device. |
921 | if (auto Err = callGlobalConstructors(Plugin, *Image)) |
922 | return std::move(Err); |
923 | |
924 | // Return the pointer to the table of entries. |
925 | return Image; |
926 | } |
927 | |
928 | Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin, |
929 | DeviceImageTy &Image) { |
930 | // There are some plugins that do not need this step. |
931 | if (!shouldSetupDeviceEnvironment()) |
932 | return Plugin::success(); |
933 | |
934 | // Obtain a table mapping host function pointers to device function pointers. |
935 | auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image); |
936 | if (!CallTablePairOrErr) |
937 | return CallTablePairOrErr.takeError(); |
938 | |
939 | DeviceEnvironmentTy DeviceEnvironment; |
940 | DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind; |
941 | DeviceEnvironment.NumDevices = Plugin.getNumDevices(); |
942 | // TODO: The device ID used here is not the real device ID used by OpenMP. |
943 | DeviceEnvironment.DeviceNum = DeviceId; |
944 | DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize; |
945 | DeviceEnvironment.ClockFrequency = getClockFrequency(); |
946 | DeviceEnvironment.IndirectCallTable = |
947 | reinterpret_cast<uintptr_t>(CallTablePairOrErr->first); |
948 | DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second; |
949 | DeviceEnvironment.HardwareParallelism = getHardwareParallelism(); |
950 | |
951 | // Create the metainfo of the device environment global. |
952 | GlobalTy DevEnvGlobal("__omp_rtl_device_environment" , |
953 | sizeof(DeviceEnvironmentTy), &DeviceEnvironment); |
954 | |
955 | // Write device environment values to the device. |
956 | GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); |
957 | if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) { |
958 | DP("Missing symbol %s, continue execution anyway.\n" , |
959 | DevEnvGlobal.getName().data()); |
960 | consumeError(std::move(Err)); |
961 | } |
962 | return Plugin::success(); |
963 | } |
964 | |
965 | Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin, |
966 | DeviceImageTy &Image, |
967 | uint64_t PoolSize) { |
968 | // Free the old pool, if any. |
969 | if (DeviceMemoryPool.Ptr) { |
970 | if (auto Err = dataDelete(DeviceMemoryPool.Ptr, |
971 | TargetAllocTy::TARGET_ALLOC_DEVICE)) |
972 | return Err; |
973 | } |
974 | |
975 | DeviceMemoryPool.Size = PoolSize; |
976 | auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr, |
977 | TargetAllocTy::TARGET_ALLOC_DEVICE); |
978 | if (AllocOrErr) { |
979 | DeviceMemoryPool.Ptr = *AllocOrErr; |
980 | } else { |
981 | auto Err = AllocOrErr.takeError(); |
982 | REPORT("Failure to allocate device memory for global memory pool: %s\n" , |
983 | toString(std::move(Err)).data()); |
984 | DeviceMemoryPool.Ptr = nullptr; |
985 | DeviceMemoryPool.Size = 0; |
986 | } |
987 | |
988 | // Create the metainfo of the device environment global. |
989 | GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); |
990 | if (!GHandler.isSymbolInImage(*this, Image, |
991 | "__omp_rtl_device_memory_pool_tracker" )) { |
992 | DP("Skip the memory pool as there is no tracker symbol in the image." ); |
993 | return Error::success(); |
994 | } |
995 | |
996 | GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker" , |
997 | sizeof(DeviceMemoryPoolTrackingTy), |
998 | &DeviceMemoryPoolTracking); |
999 | if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal)) |
1000 | return Err; |
1001 | |
1002 | // Create the metainfo of the device environment global. |
1003 | GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool" , |
1004 | sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool); |
1005 | |
1006 | // Write device environment values to the device. |
1007 | return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal); |
1008 | } |
1009 | |
1010 | Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin, |
1011 | DeviceImageTy &Image) { |
1012 | // The plugin either does not need an RPC server or it is unavailible. |
1013 | if (!shouldSetupRPCServer()) |
1014 | return Plugin::success(); |
1015 | |
1016 | // Check if this device needs to run an RPC server. |
1017 | RPCServerTy &Server = Plugin.getRPCServer(); |
1018 | auto UsingOrErr = |
1019 | Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image); |
1020 | if (!UsingOrErr) |
1021 | return UsingOrErr.takeError(); |
1022 | |
1023 | if (!UsingOrErr.get()) |
1024 | return Plugin::success(); |
1025 | |
1026 | if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image)) |
1027 | return Err; |
1028 | |
1029 | RPCServer = &Server; |
1030 | DP("Running an RPC server on device %d\n" , getDeviceId()); |
1031 | return Plugin::success(); |
1032 | } |
1033 | |
1034 | Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr, |
1035 | size_t Size, bool ExternallyLocked) { |
1036 | // Insert the new entry into the map. |
1037 | auto Res = Allocs.insert({HstPtr, DevAccessiblePtr, Size, ExternallyLocked}); |
1038 | if (!Res.second) |
1039 | return Plugin::error("Cannot insert locked buffer entry" ); |
1040 | |
1041 | // Check whether the next entry overlaps with the inserted entry. |
1042 | auto It = std::next(Res.first); |
1043 | if (It == Allocs.end()) |
1044 | return Plugin::success(); |
1045 | |
1046 | const EntryTy *NextEntry = &(*It); |
1047 | if (intersects(NextEntry->HstPtr, NextEntry->Size, HstPtr, Size)) |
1048 | return Plugin::error("Partial overlapping not allowed in locked buffers" ); |
1049 | |
1050 | return Plugin::success(); |
1051 | } |
1052 | |
1053 | Error PinnedAllocationMapTy::eraseEntry(const EntryTy &Entry) { |
1054 | // Erase the existing entry. Notice this requires an additional map lookup, |
1055 | // but this should not be a performance issue. Using iterators would make |
1056 | // the code more difficult to read. |
1057 | size_t Erased = Allocs.erase({Entry.HstPtr}); |
1058 | if (!Erased) |
1059 | return Plugin::error("Cannot erase locked buffer entry" ); |
1060 | return Plugin::success(); |
1061 | } |
1062 | |
1063 | Error PinnedAllocationMapTy::registerEntryUse(const EntryTy &Entry, |
1064 | void *HstPtr, size_t Size) { |
1065 | if (!contains(Entry.HstPtr, Entry.Size, HstPtr, Size)) |
1066 | return Plugin::error("Partial overlapping not allowed in locked buffers" ); |
1067 | |
1068 | ++Entry.References; |
1069 | return Plugin::success(); |
1070 | } |
1071 | |
1072 | Expected<bool> PinnedAllocationMapTy::unregisterEntryUse(const EntryTy &Entry) { |
1073 | if (Entry.References == 0) |
1074 | return Plugin::error("Invalid number of references" ); |
1075 | |
1076 | // Return whether this was the last user. |
1077 | return (--Entry.References == 0); |
1078 | } |
1079 | |
1080 | Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr, |
1081 | void *DevAccessiblePtr, |
1082 | size_t Size) { |
1083 | assert(HstPtr && "Invalid pointer" ); |
1084 | assert(DevAccessiblePtr && "Invalid pointer" ); |
1085 | assert(Size && "Invalid size" ); |
1086 | |
1087 | std::lock_guard<std::shared_mutex> Lock(Mutex); |
1088 | |
1089 | // No pinned allocation should intersect. |
1090 | const EntryTy *Entry = findIntersecting(HstPtr); |
1091 | if (Entry) |
1092 | return Plugin::error("Cannot insert entry due to an existing one" ); |
1093 | |
1094 | // Now insert the new entry. |
1095 | return insertEntry(HstPtr, DevAccessiblePtr, Size); |
1096 | } |
1097 | |
1098 | Error PinnedAllocationMapTy::unregisterHostBuffer(void *HstPtr) { |
1099 | assert(HstPtr && "Invalid pointer" ); |
1100 | |
1101 | std::lock_guard<std::shared_mutex> Lock(Mutex); |
1102 | |
1103 | const EntryTy *Entry = findIntersecting(HstPtr); |
1104 | if (!Entry) |
1105 | return Plugin::error("Cannot find locked buffer" ); |
1106 | |
1107 | // The address in the entry should be the same we are unregistering. |
1108 | if (Entry->HstPtr != HstPtr) |
1109 | return Plugin::error("Unexpected host pointer in locked buffer entry" ); |
1110 | |
1111 | // Unregister from the entry. |
1112 | auto LastUseOrErr = unregisterEntryUse(*Entry); |
1113 | if (!LastUseOrErr) |
1114 | return LastUseOrErr.takeError(); |
1115 | |
1116 | // There should be no other references to the pinned allocation. |
1117 | if (!(*LastUseOrErr)) |
1118 | return Plugin::error("The locked buffer is still being used" ); |
1119 | |
1120 | // Erase the entry from the map. |
1121 | return eraseEntry(*Entry); |
1122 | } |
1123 | |
1124 | Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr, |
1125 | size_t Size) { |
1126 | assert(HstPtr && "Invalid pointer" ); |
1127 | assert(Size && "Invalid size" ); |
1128 | |
1129 | std::lock_guard<std::shared_mutex> Lock(Mutex); |
1130 | |
1131 | const EntryTy *Entry = findIntersecting(HstPtr); |
1132 | |
1133 | if (Entry) { |
1134 | // An already registered intersecting buffer was found. Register a new use. |
1135 | if (auto Err = registerEntryUse(*Entry, HstPtr, Size)) |
1136 | return std::move(Err); |
1137 | |
1138 | // Return the device accessible pointer with the correct offset. |
1139 | return advanceVoidPtr(Entry->DevAccessiblePtr, |
1140 | getPtrDiff(HstPtr, Entry->HstPtr)); |
1141 | } |
1142 | |
1143 | // No intersecting registered allocation found in the map. First, lock the |
1144 | // host buffer and retrieve the device accessible pointer. |
1145 | auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size); |
1146 | if (!DevAccessiblePtrOrErr) |
1147 | return DevAccessiblePtrOrErr.takeError(); |
1148 | |
1149 | // Now insert the new entry into the map. |
1150 | if (auto Err = insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size)) |
1151 | return std::move(Err); |
1152 | |
1153 | // Return the device accessible pointer. |
1154 | return *DevAccessiblePtrOrErr; |
1155 | } |
1156 | |
1157 | Error PinnedAllocationMapTy::unlockHostBuffer(void *HstPtr) { |
1158 | assert(HstPtr && "Invalid pointer" ); |
1159 | |
1160 | std::lock_guard<std::shared_mutex> Lock(Mutex); |
1161 | |
1162 | const EntryTy *Entry = findIntersecting(HstPtr); |
1163 | if (!Entry) |
1164 | return Plugin::error("Cannot find locked buffer" ); |
1165 | |
1166 | // Unregister from the locked buffer. No need to do anything if there are |
1167 | // others using the allocation. |
1168 | auto LastUseOrErr = unregisterEntryUse(*Entry); |
1169 | if (!LastUseOrErr) |
1170 | return LastUseOrErr.takeError(); |
1171 | |
1172 | // No need to do anything if there are others using the allocation. |
1173 | if (!(*LastUseOrErr)) |
1174 | return Plugin::success(); |
1175 | |
1176 | // This was the last user of the allocation. Unlock the original locked buffer |
1177 | // if it was locked by the plugin. Do not unlock it if it was locked by an |
1178 | // external entity. Unlock the buffer using the host pointer of the entry. |
1179 | if (!Entry->ExternallyLocked) |
1180 | if (auto Err = Device.dataUnlockImpl(Entry->HstPtr)) |
1181 | return Err; |
1182 | |
1183 | // Erase the entry from the map. |
1184 | return eraseEntry(*Entry); |
1185 | } |
1186 | |
1187 | Error PinnedAllocationMapTy::lockMappedHostBuffer(void *HstPtr, size_t Size) { |
1188 | assert(HstPtr && "Invalid pointer" ); |
1189 | assert(Size && "Invalid size" ); |
1190 | |
1191 | std::lock_guard<std::shared_mutex> Lock(Mutex); |
1192 | |
1193 | // If previously registered, just register a new user on the entry. |
1194 | const EntryTy *Entry = findIntersecting(HstPtr); |
1195 | if (Entry) |
1196 | return registerEntryUse(*Entry, HstPtr, Size); |
1197 | |
1198 | size_t BaseSize; |
1199 | void *BaseHstPtr, *BaseDevAccessiblePtr; |
1200 | |
1201 | // Check if it was externally pinned by a vendor-specific API. |
1202 | auto IsPinnedOrErr = Device.isPinnedPtrImpl(HstPtr, BaseHstPtr, |
1203 | BaseDevAccessiblePtr, BaseSize); |
1204 | if (!IsPinnedOrErr) |
1205 | return IsPinnedOrErr.takeError(); |
1206 | |
1207 | // If pinned, just insert the entry representing the whole pinned buffer. |
1208 | if (*IsPinnedOrErr) |
1209 | return insertEntry(BaseHstPtr, BaseDevAccessiblePtr, BaseSize, |
1210 | /*Externallylocked=*/true); |
1211 | |
1212 | // Not externally pinned. Do nothing if locking of mapped buffers is disabled. |
1213 | if (!LockMappedBuffers) |
1214 | return Plugin::success(); |
1215 | |
1216 | // Otherwise, lock the buffer and insert the new entry. |
1217 | auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size); |
1218 | if (!DevAccessiblePtrOrErr) { |
1219 | // Errors may be tolerated. |
1220 | if (!IgnoreLockMappedFailures) |
1221 | return DevAccessiblePtrOrErr.takeError(); |
1222 | |
1223 | consumeError(DevAccessiblePtrOrErr.takeError()); |
1224 | return Plugin::success(); |
1225 | } |
1226 | |
1227 | return insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size); |
1228 | } |
1229 | |
1230 | Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) { |
1231 | assert(HstPtr && "Invalid pointer" ); |
1232 | |
1233 | std::lock_guard<std::shared_mutex> Lock(Mutex); |
1234 | |
1235 | // Check whether there is any intersecting entry. |
1236 | const EntryTy *Entry = findIntersecting(HstPtr); |
1237 | |
1238 | // No entry but automatic locking of mapped buffers is disabled, so |
1239 | // nothing to do. |
1240 | if (!Entry && !LockMappedBuffers) |
1241 | return Plugin::success(); |
1242 | |
1243 | // No entry, automatic locking is enabled, but the locking may have failed, so |
1244 | // do nothing. |
1245 | if (!Entry && IgnoreLockMappedFailures) |
1246 | return Plugin::success(); |
1247 | |
1248 | // No entry, but the automatic locking is enabled, so this is an error. |
1249 | if (!Entry) |
1250 | return Plugin::error("Locked buffer not found" ); |
1251 | |
1252 | // There is entry, so unregister a user and check whether it was the last one. |
1253 | auto LastUseOrErr = unregisterEntryUse(*Entry); |
1254 | if (!LastUseOrErr) |
1255 | return LastUseOrErr.takeError(); |
1256 | |
1257 | // If it is not the last one, there is nothing to do. |
1258 | if (!(*LastUseOrErr)) |
1259 | return Plugin::success(); |
1260 | |
1261 | // Otherwise, if it was the last and the buffer was locked by the plugin, |
1262 | // unlock it. |
1263 | if (!Entry->ExternallyLocked) |
1264 | if (auto Err = Device.dataUnlockImpl(Entry->HstPtr)) |
1265 | return Err; |
1266 | |
1267 | // Finally erase the entry from the map. |
1268 | return eraseEntry(*Entry); |
1269 | } |
1270 | |
1271 | Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) { |
1272 | if (!AsyncInfo || !AsyncInfo->Queue) |
1273 | return Plugin::error("Invalid async info queue" ); |
1274 | |
1275 | if (auto Err = synchronizeImpl(*AsyncInfo)) |
1276 | return Err; |
1277 | |
1278 | for (auto *Ptr : AsyncInfo->AssociatedAllocations) |
1279 | if (auto Err = dataDelete(Ptr, TargetAllocTy::TARGET_ALLOC_DEVICE)) |
1280 | return Err; |
1281 | AsyncInfo->AssociatedAllocations.clear(); |
1282 | |
1283 | return Plugin::success(); |
1284 | } |
1285 | |
1286 | Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) { |
1287 | if (!AsyncInfo || !AsyncInfo->Queue) |
1288 | return Plugin::error("Invalid async info queue" ); |
1289 | |
1290 | return queryAsyncImpl(*AsyncInfo); |
1291 | } |
1292 | |
1293 | Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) { |
1294 | return Plugin::error("Device does not suppport VA Management" ); |
1295 | } |
1296 | |
1297 | Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) { |
1298 | return Plugin::error("Device does not suppport VA Management" ); |
1299 | } |
1300 | |
1301 | Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) { |
1302 | return Plugin::error( |
1303 | "Mising getDeviceMemorySize impelmentation (required by RR-heuristic" ); |
1304 | } |
1305 | |
1306 | Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, |
1307 | TargetAllocTy Kind) { |
1308 | void *Alloc = nullptr; |
1309 | |
1310 | if (RecordReplay.isRecordingOrReplaying()) |
1311 | return RecordReplay.alloc(Size); |
1312 | |
1313 | switch (Kind) { |
1314 | case TARGET_ALLOC_DEFAULT: |
1315 | case TARGET_ALLOC_DEVICE_NON_BLOCKING: |
1316 | case TARGET_ALLOC_DEVICE: |
1317 | if (MemoryManager) { |
1318 | Alloc = MemoryManager->allocate(Size, HostPtr); |
1319 | if (!Alloc) |
1320 | return Plugin::error("Failed to allocate from memory manager" ); |
1321 | break; |
1322 | } |
1323 | [[fallthrough]]; |
1324 | case TARGET_ALLOC_HOST: |
1325 | case TARGET_ALLOC_SHARED: |
1326 | Alloc = allocate(Size, HostPtr, Kind); |
1327 | if (!Alloc) |
1328 | return Plugin::error("Failed to allocate from device allocator" ); |
1329 | } |
1330 | |
1331 | // Report error if the memory manager or the device allocator did not return |
1332 | // any memory buffer. |
1333 | if (!Alloc) |
1334 | return Plugin::error("Invalid target data allocation kind or requested " |
1335 | "allocator not implemented yet" ); |
1336 | |
1337 | // Register allocated buffer as pinned memory if the type is host memory. |
1338 | if (Kind == TARGET_ALLOC_HOST) |
1339 | if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size)) |
1340 | return std::move(Err); |
1341 | |
1342 | return Alloc; |
1343 | } |
1344 | |
1345 | Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) { |
1346 | // Free is a noop when recording or replaying. |
1347 | if (RecordReplay.isRecordingOrReplaying()) |
1348 | return Plugin::success(); |
1349 | |
1350 | int Res; |
1351 | if (MemoryManager) |
1352 | Res = MemoryManager->free(TgtPtr); |
1353 | else |
1354 | Res = free(TgtPtr, Kind); |
1355 | |
1356 | if (Res) |
1357 | return Plugin::error("Failure to deallocate device pointer %p" , TgtPtr); |
1358 | |
1359 | // Unregister deallocated pinned memory buffer if the type is host memory. |
1360 | if (Kind == TARGET_ALLOC_HOST) |
1361 | if (auto Err = PinnedAllocs.unregisterHostBuffer(TgtPtr)) |
1362 | return Err; |
1363 | |
1364 | return Plugin::success(); |
1365 | } |
1366 | |
1367 | Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr, |
1368 | int64_t Size, __tgt_async_info *AsyncInfo) { |
1369 | AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); |
1370 | |
1371 | auto Err = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper); |
1372 | AsyncInfoWrapper.finalize(Err); |
1373 | return Err; |
1374 | } |
1375 | |
1376 | Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr, |
1377 | int64_t Size, __tgt_async_info *AsyncInfo) { |
1378 | AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); |
1379 | |
1380 | auto Err = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper); |
1381 | AsyncInfoWrapper.finalize(Err); |
1382 | return Err; |
1383 | } |
1384 | |
1385 | Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, |
1386 | void *DstPtr, int64_t Size, |
1387 | __tgt_async_info *AsyncInfo) { |
1388 | AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); |
1389 | |
1390 | auto Err = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper); |
1391 | AsyncInfoWrapper.finalize(Err); |
1392 | return Err; |
1393 | } |
1394 | |
1395 | Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, |
1396 | ptrdiff_t *ArgOffsets, |
1397 | KernelArgsTy &KernelArgs, |
1398 | __tgt_async_info *AsyncInfo) { |
1399 | AsyncInfoWrapperTy AsyncInfoWrapper( |
1400 | *this, RecordReplay.isRecordingOrReplaying() ? nullptr : AsyncInfo); |
1401 | |
1402 | GenericKernelTy &GenericKernel = |
1403 | *reinterpret_cast<GenericKernelTy *>(EntryPtr); |
1404 | |
1405 | auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs, |
1406 | AsyncInfoWrapper); |
1407 | |
1408 | // 'finalize' here to guarantee next record-replay actions are in-sync |
1409 | AsyncInfoWrapper.finalize(Err); |
1410 | |
1411 | if (RecordReplay.isRecordingOrReplaying() && |
1412 | RecordReplay.isSaveOutputEnabled()) |
1413 | RecordReplay.saveKernelOutputInfo(GenericKernel.getName()); |
1414 | |
1415 | return Err; |
1416 | } |
1417 | |
1418 | Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) { |
1419 | assert(AsyncInfoPtr && "Invalid async info" ); |
1420 | |
1421 | *AsyncInfoPtr = new __tgt_async_info(); |
1422 | |
1423 | AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr); |
1424 | |
1425 | auto Err = initAsyncInfoImpl(AsyncInfoWrapper); |
1426 | AsyncInfoWrapper.finalize(Err); |
1427 | return Err; |
1428 | } |
1429 | |
1430 | Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) { |
1431 | assert(DeviceInfo && "Invalid device info" ); |
1432 | |
1433 | return initDeviceInfoImpl(DeviceInfo); |
1434 | } |
1435 | |
1436 | Error GenericDeviceTy::printInfo() { |
1437 | InfoQueueTy InfoQueue; |
1438 | |
1439 | // Get the vendor-specific info entries describing the device properties. |
1440 | if (auto Err = obtainInfoImpl(InfoQueue)) |
1441 | return Err; |
1442 | |
1443 | // Print all info entries. |
1444 | InfoQueue.print(); |
1445 | |
1446 | return Plugin::success(); |
1447 | } |
1448 | |
1449 | Error GenericDeviceTy::createEvent(void **EventPtrStorage) { |
1450 | return createEventImpl(EventPtrStorage); |
1451 | } |
1452 | |
1453 | Error GenericDeviceTy::destroyEvent(void *EventPtr) { |
1454 | return destroyEventImpl(EventPtr); |
1455 | } |
1456 | |
1457 | Error GenericDeviceTy::recordEvent(void *EventPtr, |
1458 | __tgt_async_info *AsyncInfo) { |
1459 | AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); |
1460 | |
1461 | auto Err = recordEventImpl(EventPtr, AsyncInfoWrapper); |
1462 | AsyncInfoWrapper.finalize(Err); |
1463 | return Err; |
1464 | } |
1465 | |
1466 | Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { |
1467 | AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); |
1468 | |
1469 | auto Err = waitEventImpl(EventPtr, AsyncInfoWrapper); |
1470 | AsyncInfoWrapper.finalize(Err); |
1471 | return Err; |
1472 | } |
1473 | |
1474 | Error GenericDeviceTy::syncEvent(void *EventPtr) { |
1475 | return syncEventImpl(EventPtr); |
1476 | } |
1477 | |
1478 | bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); } |
1479 | |
1480 | Error GenericPluginTy::init() { |
1481 | auto NumDevicesOrErr = initImpl(); |
1482 | if (!NumDevicesOrErr) |
1483 | return NumDevicesOrErr.takeError(); |
1484 | |
1485 | NumDevices = *NumDevicesOrErr; |
1486 | if (NumDevices == 0) |
1487 | return Plugin::success(); |
1488 | |
1489 | assert(Devices.size() == 0 && "Plugin already initialized" ); |
1490 | Devices.resize(NumDevices, nullptr); |
1491 | |
1492 | GlobalHandler = createGlobalHandler(); |
1493 | assert(GlobalHandler && "Invalid global handler" ); |
1494 | |
1495 | RPCServer = new RPCServerTy(*this); |
1496 | assert(RPCServer && "Invalid RPC server" ); |
1497 | |
1498 | return Plugin::success(); |
1499 | } |
1500 | |
1501 | Error GenericPluginTy::deinit() { |
1502 | // Deinitialize all active devices. |
1503 | for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) { |
1504 | if (Devices[DeviceId]) { |
1505 | if (auto Err = deinitDevice(DeviceId)) |
1506 | return Err; |
1507 | } |
1508 | assert(!Devices[DeviceId] && "Device was not deinitialized" ); |
1509 | } |
1510 | |
1511 | // There is no global handler if no device is available. |
1512 | if (GlobalHandler) |
1513 | delete GlobalHandler; |
1514 | |
1515 | if (RPCServer) |
1516 | delete RPCServer; |
1517 | |
1518 | // Perform last deinitializations on the plugin. |
1519 | return deinitImpl(); |
1520 | } |
1521 | |
1522 | Error GenericPluginTy::initDevice(int32_t DeviceId) { |
1523 | assert(!Devices[DeviceId] && "Device already initialized" ); |
1524 | |
1525 | // Create the device and save the reference. |
1526 | GenericDeviceTy *Device = createDevice(*this, DeviceId, NumDevices); |
1527 | assert(Device && "Invalid device" ); |
1528 | |
1529 | // Save the device reference into the list. |
1530 | Devices[DeviceId] = Device; |
1531 | |
1532 | // Initialize the device and its resources. |
1533 | return Device->init(*this); |
1534 | } |
1535 | |
1536 | Error GenericPluginTy::deinitDevice(int32_t DeviceId) { |
1537 | // The device may be already deinitialized. |
1538 | if (Devices[DeviceId] == nullptr) |
1539 | return Plugin::success(); |
1540 | |
1541 | // Deinitialize the device and release its resources. |
1542 | if (auto Err = Devices[DeviceId]->deinit(*this)) |
1543 | return Err; |
1544 | |
1545 | // Delete the device and invalidate its reference. |
1546 | delete Devices[DeviceId]; |
1547 | Devices[DeviceId] = nullptr; |
1548 | |
1549 | return Plugin::success(); |
1550 | } |
1551 | |
1552 | Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const { |
1553 | // First check if this image is a regular ELF file. |
1554 | if (!utils::elf::isELF(Image)) |
1555 | return false; |
1556 | |
1557 | // Check if this image is an ELF with a matching machine value. |
1558 | auto MachineOrErr = utils::elf::checkMachine(Image, getMagicElfBits()); |
1559 | if (!MachineOrErr) |
1560 | return MachineOrErr.takeError(); |
1561 | |
1562 | if (!*MachineOrErr) |
1563 | return false; |
1564 | |
1565 | // Perform plugin-dependent checks for the specific architecture if needed. |
1566 | return isELFCompatible(Image); |
1567 | } |
1568 | |
1569 | int32_t GenericPluginTy::is_valid_binary(__tgt_device_image *Image) { |
1570 | StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart), |
1571 | target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); |
1572 | |
1573 | auto HandleError = [&](Error Err) -> bool { |
1574 | [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); |
1575 | DP("Failure to check validity of image %p: %s" , Image, ErrStr.c_str()); |
1576 | return false; |
1577 | }; |
1578 | switch (identify_magic(Buffer)) { |
1579 | case file_magic::elf: |
1580 | case file_magic::elf_relocatable: |
1581 | case file_magic::elf_executable: |
1582 | case file_magic::elf_shared_object: |
1583 | case file_magic::elf_core: { |
1584 | auto MatchOrErr = checkELFImage(Buffer); |
1585 | if (Error Err = MatchOrErr.takeError()) |
1586 | return HandleError(std::move(Err)); |
1587 | return *MatchOrErr; |
1588 | } |
1589 | case file_magic::bitcode: { |
1590 | auto MatchOrErr = getJIT().checkBitcodeImage(Buffer); |
1591 | if (Error Err = MatchOrErr.takeError()) |
1592 | return HandleError(std::move(Err)); |
1593 | return *MatchOrErr; |
1594 | } |
1595 | default: |
1596 | return false; |
1597 | } |
1598 | } |
1599 | |
1600 | int32_t GenericPluginTy::init_device(int32_t DeviceId) { |
1601 | auto Err = initDevice(DeviceId); |
1602 | if (Err) { |
1603 | REPORT("Failure to initialize device %d: %s\n" , DeviceId, |
1604 | toString(std::move(Err)).data()); |
1605 | return OFFLOAD_FAIL; |
1606 | } |
1607 | |
1608 | return OFFLOAD_SUCCESS; |
1609 | } |
1610 | |
1611 | int32_t GenericPluginTy::number_of_devices() { return getNumDevices(); } |
1612 | |
1613 | int64_t GenericPluginTy::init_requires(int64_t RequiresFlags) { |
1614 | setRequiresFlag(RequiresFlags); |
1615 | return OFFLOAD_SUCCESS; |
1616 | } |
1617 | |
1618 | int32_t GenericPluginTy::is_data_exchangable(int32_t SrcDeviceId, |
1619 | int32_t DstDeviceId) { |
1620 | return isDataExchangable(SrcDeviceId, DstDeviceId); |
1621 | } |
1622 | |
1623 | int32_t GenericPluginTy::initialize_record_replay(int32_t DeviceId, |
1624 | int64_t MemorySize, |
1625 | void *VAddr, bool isRecord, |
1626 | bool SaveOutput, |
1627 | uint64_t &ReqPtrArgOffset) { |
1628 | GenericDeviceTy &Device = getDevice(DeviceId); |
1629 | RecordReplayTy::RRStatusTy Status = |
1630 | isRecord ? RecordReplayTy::RRStatusTy::RRRecording |
1631 | : RecordReplayTy::RRStatusTy::RRReplaying; |
1632 | |
1633 | if (auto Err = RecordReplay.init(&Device, MemorySize, VAddr, Status, |
1634 | SaveOutput, ReqPtrArgOffset)) { |
1635 | REPORT("WARNING RR did not intialize RR-properly with %lu bytes" |
1636 | "(Error: %s)\n" , |
1637 | MemorySize, toString(std::move(Err)).data()); |
1638 | RecordReplay.setStatus(RecordReplayTy::RRStatusTy::RRDeactivated); |
1639 | |
1640 | if (!isRecord) { |
1641 | return OFFLOAD_FAIL; |
1642 | } |
1643 | } |
1644 | return OFFLOAD_SUCCESS; |
1645 | } |
1646 | |
1647 | int32_t GenericPluginTy::load_binary(int32_t DeviceId, |
1648 | __tgt_device_image *TgtImage, |
1649 | __tgt_device_binary *Binary) { |
1650 | GenericDeviceTy &Device = getDevice(DeviceId); |
1651 | |
1652 | auto ImageOrErr = Device.loadBinary(*this, TgtImage); |
1653 | if (!ImageOrErr) { |
1654 | auto Err = ImageOrErr.takeError(); |
1655 | REPORT("Failure to load binary image %p on device %d: %s\n" , TgtImage, |
1656 | DeviceId, toString(std::move(Err)).data()); |
1657 | return OFFLOAD_FAIL; |
1658 | } |
1659 | |
1660 | DeviceImageTy *Image = *ImageOrErr; |
1661 | assert(Image != nullptr && "Invalid Image" ); |
1662 | |
1663 | *Binary = __tgt_device_binary{reinterpret_cast<uint64_t>(Image)}; |
1664 | |
1665 | return OFFLOAD_SUCCESS; |
1666 | } |
1667 | |
1668 | void *GenericPluginTy::data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, |
1669 | int32_t Kind) { |
1670 | auto AllocOrErr = |
1671 | getDevice(DeviceId).dataAlloc(Size, HostPtr, (TargetAllocTy)Kind); |
1672 | if (!AllocOrErr) { |
1673 | auto Err = AllocOrErr.takeError(); |
1674 | REPORT("Failure to allocate device memory: %s\n" , |
1675 | toString(std::move(Err)).data()); |
1676 | return nullptr; |
1677 | } |
1678 | assert(*AllocOrErr && "Null pointer upon successful allocation" ); |
1679 | |
1680 | return *AllocOrErr; |
1681 | } |
1682 | |
1683 | int32_t GenericPluginTy::data_delete(int32_t DeviceId, void *TgtPtr, |
1684 | int32_t Kind) { |
1685 | auto Err = |
1686 | getDevice(DeviceId).dataDelete(TgtPtr, static_cast<TargetAllocTy>(Kind)); |
1687 | if (Err) { |
1688 | REPORT("Failure to deallocate device pointer %p: %s\n" , TgtPtr, |
1689 | toString(std::move(Err)).data()); |
1690 | return OFFLOAD_FAIL; |
1691 | } |
1692 | |
1693 | return OFFLOAD_SUCCESS; |
1694 | } |
1695 | |
1696 | int32_t GenericPluginTy::data_lock(int32_t DeviceId, void *Ptr, int64_t Size, |
1697 | void **LockedPtr) { |
1698 | auto LockedPtrOrErr = getDevice(DeviceId).dataLock(Ptr, Size); |
1699 | if (!LockedPtrOrErr) { |
1700 | auto Err = LockedPtrOrErr.takeError(); |
1701 | REPORT("Failure to lock memory %p: %s\n" , Ptr, |
1702 | toString(std::move(Err)).data()); |
1703 | return OFFLOAD_FAIL; |
1704 | } |
1705 | |
1706 | if (!(*LockedPtrOrErr)) { |
1707 | REPORT("Failure to lock memory %p: obtained a null locked pointer\n" , Ptr); |
1708 | return OFFLOAD_FAIL; |
1709 | } |
1710 | *LockedPtr = *LockedPtrOrErr; |
1711 | |
1712 | return OFFLOAD_SUCCESS; |
1713 | } |
1714 | |
1715 | int32_t GenericPluginTy::data_unlock(int32_t DeviceId, void *Ptr) { |
1716 | auto Err = getDevice(DeviceId).dataUnlock(Ptr); |
1717 | if (Err) { |
1718 | REPORT("Failure to unlock memory %p: %s\n" , Ptr, |
1719 | toString(std::move(Err)).data()); |
1720 | return OFFLOAD_FAIL; |
1721 | } |
1722 | |
1723 | return OFFLOAD_SUCCESS; |
1724 | } |
1725 | |
1726 | int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr, |
1727 | int64_t Size) { |
1728 | auto Err = getDevice(DeviceId).notifyDataMapped(HstPtr, Size); |
1729 | if (Err) { |
1730 | REPORT("Failure to notify data mapped %p: %s\n" , HstPtr, |
1731 | toString(std::move(Err)).data()); |
1732 | return OFFLOAD_FAIL; |
1733 | } |
1734 | |
1735 | return OFFLOAD_SUCCESS; |
1736 | } |
1737 | |
1738 | int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr) { |
1739 | auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr); |
1740 | if (Err) { |
1741 | REPORT("Failure to notify data unmapped %p: %s\n" , HstPtr, |
1742 | toString(std::move(Err)).data()); |
1743 | return OFFLOAD_FAIL; |
1744 | } |
1745 | |
1746 | return OFFLOAD_SUCCESS; |
1747 | } |
1748 | |
1749 | int32_t GenericPluginTy::data_submit(int32_t DeviceId, void *TgtPtr, |
1750 | void *HstPtr, int64_t Size) { |
1751 | return data_submit_async(DeviceId, TgtPtr, HstPtr, Size, |
1752 | /*AsyncInfoPtr=*/nullptr); |
1753 | } |
1754 | |
1755 | int32_t GenericPluginTy::data_submit_async(int32_t DeviceId, void *TgtPtr, |
1756 | void *HstPtr, int64_t Size, |
1757 | __tgt_async_info *AsyncInfoPtr) { |
1758 | auto Err = getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size, AsyncInfoPtr); |
1759 | if (Err) { |
1760 | REPORT("Failure to copy data from host to device. Pointers: host " |
1761 | "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n" , |
1762 | DPxPTR(HstPtr), DPxPTR(TgtPtr), Size, |
1763 | toString(std::move(Err)).data()); |
1764 | return OFFLOAD_FAIL; |
1765 | } |
1766 | |
1767 | return OFFLOAD_SUCCESS; |
1768 | } |
1769 | |
1770 | int32_t GenericPluginTy::data_retrieve(int32_t DeviceId, void *HstPtr, |
1771 | void *TgtPtr, int64_t Size) { |
1772 | return data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, |
1773 | /*AsyncInfoPtr=*/nullptr); |
1774 | } |
1775 | |
1776 | int32_t GenericPluginTy::data_retrieve_async(int32_t DeviceId, void *HstPtr, |
1777 | void *TgtPtr, int64_t Size, |
1778 | __tgt_async_info *AsyncInfoPtr) { |
1779 | auto Err = |
1780 | getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr, Size, AsyncInfoPtr); |
1781 | if (Err) { |
1782 | REPORT("Faliure to copy data from device to host. Pointers: host " |
1783 | "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n" , |
1784 | DPxPTR(HstPtr), DPxPTR(TgtPtr), Size, |
1785 | toString(std::move(Err)).data()); |
1786 | return OFFLOAD_FAIL; |
1787 | } |
1788 | |
1789 | return OFFLOAD_SUCCESS; |
1790 | } |
1791 | |
1792 | int32_t GenericPluginTy::data_exchange(int32_t SrcDeviceId, void *SrcPtr, |
1793 | int32_t DstDeviceId, void *DstPtr, |
1794 | int64_t Size) { |
1795 | return data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr, Size, |
1796 | /*AsyncInfoPtr=*/nullptr); |
1797 | } |
1798 | |
1799 | int32_t GenericPluginTy::data_exchange_async(int32_t SrcDeviceId, void *SrcPtr, |
1800 | int DstDeviceId, void *DstPtr, |
1801 | int64_t Size, |
1802 | __tgt_async_info *AsyncInfo) { |
1803 | GenericDeviceTy &SrcDevice = getDevice(SrcDeviceId); |
1804 | GenericDeviceTy &DstDevice = getDevice(DstDeviceId); |
1805 | auto Err = SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo); |
1806 | if (Err) { |
1807 | REPORT("Failure to copy data from device (%d) to device (%d). Pointers: " |
1808 | "host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n" , |
1809 | SrcDeviceId, DstDeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size, |
1810 | toString(std::move(Err)).data()); |
1811 | return OFFLOAD_FAIL; |
1812 | } |
1813 | |
1814 | return OFFLOAD_SUCCESS; |
1815 | } |
1816 | |
1817 | int32_t GenericPluginTy::launch_kernel(int32_t DeviceId, void *TgtEntryPtr, |
1818 | void **TgtArgs, ptrdiff_t *TgtOffsets, |
1819 | KernelArgsTy *KernelArgs, |
1820 | __tgt_async_info *AsyncInfoPtr) { |
1821 | auto Err = getDevice(DeviceId).launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, |
1822 | *KernelArgs, AsyncInfoPtr); |
1823 | if (Err) { |
1824 | REPORT("Failure to run target region " DPxMOD " in device %d: %s\n" , |
1825 | DPxPTR(TgtEntryPtr), DeviceId, toString(std::move(Err)).data()); |
1826 | return OFFLOAD_FAIL; |
1827 | } |
1828 | |
1829 | return OFFLOAD_SUCCESS; |
1830 | } |
1831 | |
1832 | int32_t GenericPluginTy::synchronize(int32_t DeviceId, |
1833 | __tgt_async_info *AsyncInfoPtr) { |
1834 | auto Err = getDevice(DeviceId).synchronize(AsyncInfoPtr); |
1835 | if (Err) { |
1836 | REPORT("Failure to synchronize stream %p: %s\n" , AsyncInfoPtr->Queue, |
1837 | toString(std::move(Err)).data()); |
1838 | return OFFLOAD_FAIL; |
1839 | } |
1840 | |
1841 | return OFFLOAD_SUCCESS; |
1842 | } |
1843 | |
1844 | int32_t GenericPluginTy::query_async(int32_t DeviceId, |
1845 | __tgt_async_info *AsyncInfoPtr) { |
1846 | auto Err = getDevice(DeviceId).queryAsync(AsyncInfoPtr); |
1847 | if (Err) { |
1848 | REPORT("Failure to query stream %p: %s\n" , AsyncInfoPtr->Queue, |
1849 | toString(std::move(Err)).data()); |
1850 | return OFFLOAD_FAIL; |
1851 | } |
1852 | |
1853 | return OFFLOAD_SUCCESS; |
1854 | } |
1855 | |
1856 | void GenericPluginTy::print_device_info(int32_t DeviceId) { |
1857 | if (auto Err = getDevice(DeviceId).printInfo()) |
1858 | REPORT("Failure to print device %d info: %s\n" , DeviceId, |
1859 | toString(std::move(Err)).data()); |
1860 | } |
1861 | |
1862 | int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) { |
1863 | auto Err = getDevice(DeviceId).createEvent(EventPtr); |
1864 | if (Err) { |
1865 | REPORT("Failure to create event: %s\n" , toString(std::move(Err)).data()); |
1866 | return OFFLOAD_FAIL; |
1867 | } |
1868 | |
1869 | return OFFLOAD_SUCCESS; |
1870 | } |
1871 | |
1872 | int32_t GenericPluginTy::record_event(int32_t DeviceId, void *EventPtr, |
1873 | __tgt_async_info *AsyncInfoPtr) { |
1874 | auto Err = getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr); |
1875 | if (Err) { |
1876 | REPORT("Failure to record event %p: %s\n" , EventPtr, |
1877 | toString(std::move(Err)).data()); |
1878 | return OFFLOAD_FAIL; |
1879 | } |
1880 | |
1881 | return OFFLOAD_SUCCESS; |
1882 | } |
1883 | |
1884 | int32_t GenericPluginTy::wait_event(int32_t DeviceId, void *EventPtr, |
1885 | __tgt_async_info *AsyncInfoPtr) { |
1886 | auto Err = getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr); |
1887 | if (Err) { |
1888 | REPORT("Failure to wait event %p: %s\n" , EventPtr, |
1889 | toString(std::move(Err)).data()); |
1890 | return OFFLOAD_FAIL; |
1891 | } |
1892 | |
1893 | return OFFLOAD_SUCCESS; |
1894 | } |
1895 | |
1896 | int32_t GenericPluginTy::sync_event(int32_t DeviceId, void *EventPtr) { |
1897 | auto Err = getDevice(DeviceId).syncEvent(EventPtr); |
1898 | if (Err) { |
1899 | REPORT("Failure to synchronize event %p: %s\n" , EventPtr, |
1900 | toString(std::move(Err)).data()); |
1901 | return OFFLOAD_FAIL; |
1902 | } |
1903 | |
1904 | return OFFLOAD_SUCCESS; |
1905 | } |
1906 | |
1907 | int32_t GenericPluginTy::destroy_event(int32_t DeviceId, void *EventPtr) { |
1908 | auto Err = getDevice(DeviceId).destroyEvent(EventPtr); |
1909 | if (Err) { |
1910 | REPORT("Failure to destroy event %p: %s\n" , EventPtr, |
1911 | toString(std::move(Err)).data()); |
1912 | return OFFLOAD_FAIL; |
1913 | } |
1914 | |
1915 | return OFFLOAD_SUCCESS; |
1916 | } |
1917 | |
1918 | void GenericPluginTy::set_info_flag(uint32_t NewInfoLevel) { |
1919 | std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); |
1920 | InfoLevel.store(NewInfoLevel); |
1921 | } |
1922 | |
1923 | int32_t GenericPluginTy::init_async_info(int32_t DeviceId, |
1924 | __tgt_async_info **AsyncInfoPtr) { |
1925 | assert(AsyncInfoPtr && "Invalid async info" ); |
1926 | |
1927 | auto Err = getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr); |
1928 | if (Err) { |
1929 | REPORT("Failure to initialize async info at " DPxMOD " on device %d: %s\n" , |
1930 | DPxPTR(*AsyncInfoPtr), DeviceId, toString(std::move(Err)).data()); |
1931 | return OFFLOAD_FAIL; |
1932 | } |
1933 | |
1934 | return OFFLOAD_SUCCESS; |
1935 | } |
1936 | |
1937 | int32_t GenericPluginTy::init_device_info(int32_t DeviceId, |
1938 | __tgt_device_info *DeviceInfo, |
1939 | const char **ErrStr) { |
1940 | *ErrStr = "" ; |
1941 | |
1942 | auto Err = getDevice(DeviceId).initDeviceInfo(DeviceInfo); |
1943 | if (Err) { |
1944 | REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n" , |
1945 | DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data()); |
1946 | return OFFLOAD_FAIL; |
1947 | } |
1948 | |
1949 | return OFFLOAD_SUCCESS; |
1950 | } |
1951 | |
1952 | int32_t GenericPluginTy::set_device_offset(int32_t DeviceIdOffset) { |
1953 | setDeviceIdStartIndex(DeviceIdOffset); |
1954 | |
1955 | return OFFLOAD_SUCCESS; |
1956 | } |
1957 | |
1958 | int32_t GenericPluginTy::use_auto_zero_copy(int32_t DeviceId) { |
1959 | // Automatic zero-copy only applies to programs that did |
1960 | // not request unified_shared_memory and are deployed on an |
1961 | // APU with XNACK enabled. |
1962 | if (getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) |
1963 | return false; |
1964 | return getDevice(DeviceId).useAutoZeroCopy(); |
1965 | } |
1966 | |
1967 | int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size, |
1968 | const char *Name, void **DevicePtr) { |
1969 | assert(Binary.handle && "Invalid device binary handle" ); |
1970 | DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle); |
1971 | |
1972 | GenericDeviceTy &Device = Image.getDevice(); |
1973 | |
1974 | GlobalTy DeviceGlobal(Name, Size); |
1975 | GenericGlobalHandlerTy &GHandler = getGlobalHandler(); |
1976 | if (auto Err = |
1977 | GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) { |
1978 | REPORT("Failure to look up global address: %s\n" , |
1979 | toString(std::move(Err)).data()); |
1980 | return OFFLOAD_FAIL; |
1981 | } |
1982 | |
1983 | *DevicePtr = DeviceGlobal.getPtr(); |
1984 | assert(DevicePtr && "Invalid device global's address" ); |
1985 | |
1986 | // Save the loaded globals if we are recording. |
1987 | if (RecordReplay.isRecording()) |
1988 | RecordReplay.addEntry(Name, Size, *DevicePtr); |
1989 | |
1990 | return OFFLOAD_SUCCESS; |
1991 | } |
1992 | |
1993 | int32_t GenericPluginTy::get_function(__tgt_device_binary Binary, |
1994 | const char *Name, void **KernelPtr) { |
1995 | assert(Binary.handle && "Invalid device binary handle" ); |
1996 | DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle); |
1997 | |
1998 | GenericDeviceTy &Device = Image.getDevice(); |
1999 | |
2000 | auto KernelOrErr = Device.constructKernel(Name); |
2001 | if (Error Err = KernelOrErr.takeError()) { |
2002 | REPORT("Failure to look up kernel: %s\n" , toString(std::move(Err)).data()); |
2003 | return OFFLOAD_FAIL; |
2004 | } |
2005 | |
2006 | GenericKernelTy &Kernel = *KernelOrErr; |
2007 | if (auto Err = Kernel.init(Device, Image)) { |
2008 | REPORT("Failure to init kernel: %s\n" , toString(std::move(Err)).data()); |
2009 | return OFFLOAD_FAIL; |
2010 | } |
2011 | |
2012 | // Note that this is not the kernel's device address. |
2013 | *KernelPtr = &Kernel; |
2014 | return OFFLOAD_SUCCESS; |
2015 | } |
2016 | |
2017 | bool llvm::omp::target::plugin::libomptargetSupportsRPC() { |
2018 | #ifdef LIBOMPTARGET_RPC_SUPPORT |
2019 | return true; |
2020 | #else |
2021 | return false; |
2022 | #endif |
2023 | } |
2024 | |
2025 | /// Exposed library API function, basically wrappers around the GenericDeviceTy |
2026 | /// functionality with the same name. All non-async functions are redirected |
2027 | /// to the async versions right away with a NULL AsyncInfoPtr. |
2028 | #ifdef __cplusplus |
2029 | extern "C" { |
2030 | #endif |
2031 | |
2032 | int32_t __tgt_rtl_init_plugin() { |
2033 | auto Err = PluginTy::initIfNeeded(); |
2034 | if (Err) { |
2035 | [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); |
2036 | DP("Failed to init plugin: %s" , ErrStr.c_str()); |
2037 | return OFFLOAD_FAIL; |
2038 | } |
2039 | |
2040 | return OFFLOAD_SUCCESS; |
2041 | } |
2042 | |
2043 | int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { |
2044 | if (!PluginTy::isActive()) |
2045 | return false; |
2046 | |
2047 | return PluginTy::get().is_valid_binary(Image); |
2048 | } |
2049 | |
2050 | int32_t __tgt_rtl_init_device(int32_t DeviceId) { |
2051 | return PluginTy::get().init_device(DeviceId); |
2052 | } |
2053 | |
2054 | int32_t __tgt_rtl_number_of_devices() { |
2055 | return PluginTy::get().number_of_devices(); |
2056 | } |
2057 | |
2058 | int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { |
2059 | return PluginTy::get().init_requires(RequiresFlags); |
2060 | } |
2061 | |
2062 | int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId, |
2063 | int32_t DstDeviceId) { |
2064 | return PluginTy::get().is_data_exchangable(SrcDeviceId, DstDeviceId); |
2065 | } |
2066 | |
2067 | int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize, |
2068 | void *VAddr, bool isRecord, |
2069 | bool SaveOutput, |
2070 | uint64_t &ReqPtrArgOffset) { |
2071 | return PluginTy::get().initialize_record_replay( |
2072 | DeviceId, MemorySize, VAddr, isRecord, SaveOutput, ReqPtrArgOffset); |
2073 | } |
2074 | |
2075 | int32_t __tgt_rtl_load_binary(int32_t DeviceId, __tgt_device_image *TgtImage, |
2076 | __tgt_device_binary *Binary) { |
2077 | return PluginTy::get().load_binary(DeviceId, TgtImage, Binary); |
2078 | } |
2079 | |
2080 | void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, |
2081 | int32_t Kind) { |
2082 | return PluginTy::get().data_alloc(DeviceId, Size, HostPtr, Kind); |
2083 | } |
2084 | |
2085 | int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) { |
2086 | return PluginTy::get().data_delete(DeviceId, TgtPtr, Kind); |
2087 | } |
2088 | |
2089 | int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *Ptr, int64_t Size, |
2090 | void **LockedPtr) { |
2091 | return PluginTy::get().data_lock(DeviceId, Ptr, Size, LockedPtr); |
2092 | } |
2093 | |
2094 | int32_t __tgt_rtl_data_unlock(int32_t DeviceId, void *Ptr) { |
2095 | return PluginTy::get().data_unlock(DeviceId, Ptr); |
2096 | } |
2097 | |
2098 | int32_t __tgt_rtl_data_notify_mapped(int32_t DeviceId, void *HstPtr, |
2099 | int64_t Size) { |
2100 | return PluginTy::get().data_notify_mapped(DeviceId, HstPtr, Size); |
2101 | } |
2102 | |
2103 | int32_t __tgt_rtl_data_notify_unmapped(int32_t DeviceId, void *HstPtr) { |
2104 | return PluginTy::get().data_notify_unmapped(DeviceId, HstPtr); |
2105 | } |
2106 | |
2107 | int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, |
2108 | int64_t Size) { |
2109 | return PluginTy::get().data_submit(DeviceId, TgtPtr, HstPtr, Size); |
2110 | } |
2111 | |
2112 | int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr, |
2113 | void *HstPtr, int64_t Size, |
2114 | __tgt_async_info *AsyncInfoPtr) { |
2115 | return PluginTy::get().data_submit_async(DeviceId, TgtPtr, HstPtr, Size, |
2116 | AsyncInfoPtr); |
2117 | } |
2118 | |
2119 | int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, |
2120 | int64_t Size) { |
2121 | return PluginTy::get().data_retrieve(DeviceId, HstPtr, TgtPtr, Size); |
2122 | } |
2123 | |
2124 | int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr, |
2125 | void *TgtPtr, int64_t Size, |
2126 | __tgt_async_info *AsyncInfoPtr) { |
2127 | return PluginTy::get().data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, |
2128 | AsyncInfoPtr); |
2129 | } |
2130 | |
2131 | int32_t __tgt_rtl_data_exchange(int32_t SrcDeviceId, void *SrcPtr, |
2132 | int32_t DstDeviceId, void *DstPtr, |
2133 | int64_t Size) { |
2134 | return PluginTy::get().data_exchange(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr, |
2135 | Size); |
2136 | } |
2137 | |
2138 | int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr, |
2139 | int DstDeviceId, void *DstPtr, |
2140 | int64_t Size, |
2141 | __tgt_async_info *AsyncInfo) { |
2142 | return PluginTy::get().data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, |
2143 | DstPtr, Size, AsyncInfo); |
2144 | } |
2145 | |
2146 | int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr, |
2147 | void **TgtArgs, ptrdiff_t *TgtOffsets, |
2148 | KernelArgsTy *KernelArgs, |
2149 | __tgt_async_info *AsyncInfoPtr) { |
2150 | return PluginTy::get().launch_kernel(DeviceId, TgtEntryPtr, TgtArgs, |
2151 | TgtOffsets, KernelArgs, AsyncInfoPtr); |
2152 | } |
2153 | |
2154 | int32_t __tgt_rtl_synchronize(int32_t DeviceId, |
2155 | __tgt_async_info *AsyncInfoPtr) { |
2156 | return PluginTy::get().synchronize(DeviceId, AsyncInfoPtr); |
2157 | } |
2158 | |
2159 | int32_t __tgt_rtl_query_async(int32_t DeviceId, |
2160 | __tgt_async_info *AsyncInfoPtr) { |
2161 | return PluginTy::get().query_async(DeviceId, AsyncInfoPtr); |
2162 | } |
2163 | |
2164 | void __tgt_rtl_print_device_info(int32_t DeviceId) { |
2165 | PluginTy::get().print_device_info(DeviceId); |
2166 | } |
2167 | |
2168 | int32_t __tgt_rtl_create_event(int32_t DeviceId, void **EventPtr) { |
2169 | return PluginTy::get().create_event(DeviceId, EventPtr); |
2170 | } |
2171 | |
2172 | int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr, |
2173 | __tgt_async_info *AsyncInfoPtr) { |
2174 | return PluginTy::get().record_event(DeviceId, EventPtr, AsyncInfoPtr); |
2175 | } |
2176 | |
2177 | int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr, |
2178 | __tgt_async_info *AsyncInfoPtr) { |
2179 | return PluginTy::get().wait_event(DeviceId, EventPtr, AsyncInfoPtr); |
2180 | } |
2181 | |
2182 | int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) { |
2183 | return PluginTy::get().sync_event(DeviceId, EventPtr); |
2184 | } |
2185 | |
2186 | int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) { |
2187 | return PluginTy::get().destroy_event(DeviceId, EventPtr); |
2188 | } |
2189 | |
2190 | void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { |
2191 | return PluginTy::get().set_info_flag(NewInfoLevel); |
2192 | } |
2193 | |
2194 | int32_t __tgt_rtl_init_async_info(int32_t DeviceId, |
2195 | __tgt_async_info **AsyncInfoPtr) { |
2196 | return PluginTy::get().init_async_info(DeviceId, AsyncInfoPtr); |
2197 | } |
2198 | |
2199 | int32_t __tgt_rtl_init_device_info(int32_t DeviceId, |
2200 | __tgt_device_info *DeviceInfo, |
2201 | const char **ErrStr) { |
2202 | return PluginTy::get().init_device_info(DeviceId, DeviceInfo, ErrStr); |
2203 | } |
2204 | |
2205 | int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) { |
2206 | return PluginTy::get().set_device_offset(DeviceIdOffset); |
2207 | } |
2208 | |
2209 | int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) { |
2210 | return PluginTy::get().use_auto_zero_copy(DeviceId); |
2211 | } |
2212 | |
2213 | int32_t __tgt_rtl_get_global(__tgt_device_binary Binary, uint64_t Size, |
2214 | const char *Name, void **DevicePtr) { |
2215 | return PluginTy::get().get_global(Binary, Size, Name, DevicePtr); |
2216 | } |
2217 | |
2218 | int32_t __tgt_rtl_get_function(__tgt_device_binary Binary, const char *Name, |
2219 | void **KernelPtr) { |
2220 | return PluginTy::get().get_function(Binary, Name, KernelPtr); |
2221 | } |
2222 | |
2223 | #ifdef __cplusplus |
2224 | } |
2225 | #endif |
2226 | |