//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

namespace llvm {
void initializeCycleInfoWrapperPassPass(PassRegistry &);
} // namespace llvm

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
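// Illustrative use (hypothetical invocation): passing
// -amdgpu-kernarg-preload-count=2 to llc or opt asks this pass to mark the
// first two compatible kernel arguments for SGPR preloading.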
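// Each AMDGPU_ATTRIBUTE entry in AMDGPUAttributes.def is expanded three times
// below: once into a bit position, once into a bit mask, and once into the
// name table. For illustration, an entry of the form
//   AMDGPU_ATTRIBUTE(QUEUE_PTR, "amdgpu-no-queue-ptr")
// would yield QUEUE_PTR_POS, QUEUE_PTR = 1 << QUEUE_PTR_POS, and
// {QUEUE_PTR, "amdgpu-no-queue-ptr"} respectively.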
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
    ImplicitAttrs[] = {
  #include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile-time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to
  // access queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
                                                    : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID has been supported since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
                                                      : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

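// Casts from the local or private address space to flat need the aperture
// base addresses; without aperture registers those are read via the queue
// pointer (see checkForQueuePtr below).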
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument to be passed
/// regardless of the function's contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

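  // Per-constant facts we track: whether the constant refers to an LDS (DS)
  // global, and whether it contains an addrspacecast from local or private.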
  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  /// Get the code object version.
  unsigned getCodeObjectVersion() const { return CodeObjectVersion; }

  /// Get the effective value of "amdgpu-waves-per-eu" for the function,
  /// accounting for its interaction with the value passed for
  /// "amdgpu-flat-work-group-size".
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(F, FlatWorkGroupSize);
  }

  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(const Function &F,
                         std::pair<unsigned, unsigned> WavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the assumed value is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD)
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than walking all the
    // instructions ourselves, so try it first.

    // The queue pointer is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, there is nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr intrinsic that is used
    // to retrieve the hostcall pointer. The implicit arg for hostcall is
    // unused only if every use of the implicitarg_ptr is a load that clearly
    // does not retrieve any byte of the hostcall pointer. We check this by
    // tracing all the uses of the initial call to the implicitarg_ptr
    // intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA)
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class from which the different size-range attributes are derived.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl>
  ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min,
                                         unsigned Max) {
    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
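    // Render the assumed range as "min,max", e.g. "1,256".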
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
    return emitAttributeIfNotDefault(A, Min, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  bool isValidState() const override {
    return !Assumed.isEmptySet() && IntegerRangeState::isValidState();
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    if (const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
            *this, IRPosition::function(*F), DepClassTy::REQUIRED)) {

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getWavesPerEU(
          *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
               AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});

      ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
      intersectKnown(Range);
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
      if (!CallerInfo || !AssumedGroupSize)
        return false;

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
          *Caller,
          {CallerInfo->getAssumed().getLower().getZExtValue(),
           CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
          {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
           AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
      ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
      IntegerRangeState CallerRangeState(CallerRange);
      Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Max = InfoCache.getMaxWavesPerEU(*F);
    return emitAttributeIfNotDefault(A, 1, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  for (unsigned I = 0;
       I < F.arg_size() &&
       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
       ++I) {
    Argument &Arg = *F.getArg(I);
    // Check for incompatible attributes.
    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
      break;

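    // "inreg" is the hint the backend uses to preload this kernel argument
    // into user SGPRs.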
    Arg.addAttr(Attribute::InReg);
  }
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isIntrinsic())
      Functions.insert(&F);
  }

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
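  // Restrict the Attributor to the abstract attributes this pass actually
  // queries; everything else stays disabled to keep the fixpoint iteration
  // cheap.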
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
       &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});

  AttributorConfig AC(CGUpdater);
  AC.Allowed = &Allowed;
  AC.IsModulePass = true;
  AC.DefaultInitializeLiveInternals = false;
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  for (Function &F : M) {
    if (!F.isIntrinsic()) {
      A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
      A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
      CallingConv::ID CC = F.getCallingConv();
      if (!AMDGPU::isEntryFunctionCC(CC)) {
        A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
      } else if (CC == CallingConv::AMDGPU_KERNEL) {
        addPreloadKernArgHint(F, TM);
      }
    }
  }

  ChangeStatus Change = A.run();
  return Change == ChangeStatus::CHANGED;
}

class AMDGPUAttributorLegacy : public ModulePass {
public:
  AMDGPUAttributorLegacy() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    AnalysisGetter AG(this);
    return runImpl(M, AG, *TM);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CycleInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  AnalysisGetter AG(FAM);

  // TODO: Probably preserves CFG
  return runImpl(M, AG, TM) ? PreservedAnalyses::none()
                            : PreservedAnalyses::all();
}

char AMDGPUAttributorLegacy::ID = 0;

Pass *llvm::createAMDGPUAttributorLegacyPass() {
  return new AMDGPUAttributorLegacy();
}
INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                    false, false)
