//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead
// of time if we don't know the true stack size. Assume a smaller number if
// this is only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

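// Return the callee of a call pseudo. An immediate callee operand (always 0,
// per the assert) means the target is unknown, i.e. an indirect call; a
// GlobalAlias is looked through to the aliased function.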
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

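// Return true if Reg is used in any way other than as an implicit operand of
// a FLAT instruction; only such uses make the flat scratch setup necessary.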
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

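// The total SGPR count is the explicitly used SGPRs plus the extra SGPRs
// reserved for VCC, flat scratch, and XNACK (when XNACK is on or unknown).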
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

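// Visit the call graph in post order so callees are analyzed before their
// callers; each caller can then fold the cumulative usage of its callees into
// its own resource counts.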
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
      AssumedStackSizeForExternalCall = 0;
  }

  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

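// Compute the register, stack, and flat scratch usage of a single machine
// function. If the function makes no calls, the register counts come straight
// from MachineRegisterInfo; otherwise every operand of every instruction is
// scanned and the usage of known callees is merged in.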
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM,
    uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

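  // Start at -1 so a function that uses no registers of a given kind reports
  // a count of 0 after the +1 conversion at the end.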
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
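        // Filter out special registers first: these either don't occupy
        // addressable SGPR/VGPR slots (EXEC, M0, SRC_* and friends) or are
        // accounted for separately (VCC, FLAT_SCR).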
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

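        // Classify the register: is it an SGPR, VGPR, or AGPR, and how many
        // consecutive 32-bit registers does it span?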
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
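        // Wide registers occupy consecutive hardware slots, so the highest
        // index touched is the base hardware index plus the width minus one.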
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

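  // Indices are zero-based, so the counts are the highest used index plus
  // one. The largest callee frame sits on top of our own stack usage.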
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}