//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

namespace llvm {
void initializeCycleInfoWrapperPassPass(PassRegistry &);
} // namespace llvm

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
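// Illustrative use (hypothetical invocation): passing
// -amdgpu-kernarg-preload-count=2 to llc or opt asks this pass to mark the
// first two compatible kernel arguments for SGPR preloading.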
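// Each AMDGPU_ATTRIBUTE entry in AMDGPUAttributes.def is expanded three times
// below: once into a bit position, once into a bit mask, and once into the
// name table. For illustration, an entry of the form
//   AMDGPU_ATTRIBUTE(QUEUE_PTR, "amdgpu-no-queue-ptr")
// would yield QUEUE_PTR_POS, QUEUE_PTR = 1 << QUEUE_PTR_POS, and
// {QUEUE_PTR, "amdgpu-no-queue-ptr"} respectively.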
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
    ImplicitAttrs[] = {
  #include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile-time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to
  // access queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
                                                    : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID has been supported since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
                                                      : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

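// Casts from the local or private address space to flat need the aperture
// base addresses; without aperture registers those are read via the queue
// pointer (see checkForQueuePtr below).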
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument to be passed
/// regardless of the function's contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

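  // Per-constant facts we track: whether the constant refers to an LDS (DS)
  // global, and whether it contains an addrspacecast from local or private.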
  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  /// Get the code object version.
  unsigned getCodeObjectVersion() const { return CodeObjectVersion; }

  /// Get the effective value of "amdgpu-waves-per-eu" for the function,
  /// accounting for its interaction with the value passed for
  /// "amdgpu-flat-work-group-size".
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(F, FlatWorkGroupSize);
  }

  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(const Function &F,
                         std::pair<unsigned, unsigned> WavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the assumed value is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD)
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than walking all the
    // instructions ourselves, so try it first.

    // The queue pointer is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, there is nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr intrinsic that is used
    // to retrieve the hostcall pointer. The implicit arg for hostcall is
    // unused only if every use of the implicitarg_ptr is a load that clearly
    // does not retrieve any byte of the hostcall pointer. We check this by
    // tracing all the uses of the initial call to the implicitarg_ptr
    // intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA)
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class from which the different size-range attributes are derived.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl>
  ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min,
                                         unsigned Max) {
    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
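    // Render the assumed range as "min,max", e.g. "1,256".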
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
    return emitAttributeIfNotDefault(A, Min, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  bool isValidState() const override {
    return !Assumed.isEmptySet() && IntegerRangeState::isValidState();
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    if (const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
            *this, IRPosition::function(*F), DepClassTy::REQUIRED)) {

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getWavesPerEU(
          *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
               AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});

      ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
      intersectKnown(Range);
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
      if (!CallerInfo || !AssumedGroupSize)
        return false;

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
          *Caller,
          {CallerInfo->getAssumed().getLower().getZExtValue(),
           CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
          {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
           AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
      ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
      IntegerRangeState CallerRangeState(CallerRange);
      Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Max = InfoCache.getMaxWavesPerEU(*F);
    return emitAttributeIfNotDefault(A, 1, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  for (unsigned I = 0;
       I < F.arg_size() &&
       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
       ++I) {
    Argument &Arg = *F.getArg(I);
    // Check for incompatible attributes.
    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
      break;

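    // "inreg" is the hint the backend uses to preload this kernel argument
    // into user SGPRs.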
    Arg.addAttr(Attribute::InReg);
  }
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isIntrinsic())
      Functions.insert(&F);
  }

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
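  // Restrict the Attributor to the abstract attributes this pass actually
  // queries; everything else stays disabled to keep the fixpoint iteration
  // cheap.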
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
       &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});

  AttributorConfig AC(CGUpdater);
  AC.Allowed = &Allowed;
  AC.IsModulePass = true;
  AC.DefaultInitializeLiveInternals = false;
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  for (Function &F : M) {
    if (!F.isIntrinsic()) {
      A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
      A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
      CallingConv::ID CC = F.getCallingConv();
      if (!AMDGPU::isEntryFunctionCC(CC)) {
        A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
      } else if (CC == CallingConv::AMDGPU_KERNEL) {
        addPreloadKernArgHint(F, TM);
      }
    }
  }

  ChangeStatus Change = A.run();
  return Change == ChangeStatus::CHANGED;
}

class AMDGPUAttributorLegacy : public ModulePass {
public:
  AMDGPUAttributorLegacy() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    AnalysisGetter AG(this);
    return runImpl(M, AG, *TM);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CycleInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  AnalysisGetter AG(FAM);

  // TODO: Probably preserves CFG
  return runImpl(M, AG, TM) ? PreservedAnalyses::none()
                            : PreservedAnalyses::all();
}

char AMDGPUAttributorLegacy::ID = 0;

Pass *llvm::createAMDGPUAttributorLegacyPass() {
  return new AMDGPUAttributorLegacy();
}
INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                    false, false)
