SIFrameLowering.cpp source code [llvm/lib/Target/AMDGPU/SIFrameLowering.cpp]

1	//===----------------------- SIFrameLowering.cpp --------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//==-----------------------------------------------------------------------===//
8
9	#include "SIFrameLowering.h"
10	#include "AMDGPU.h"
11	#include "GCNSubtarget.h"
12	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13	#include "SIMachineFunctionInfo.h"
14	#include "llvm/CodeGen/LiveRegUnits.h"
15	#include "llvm/CodeGen/MachineFrameInfo.h"
16	#include "llvm/CodeGen/RegisterScavenging.h"
17	#include "llvm/Target/TargetMachine.h"
18
19	using namespace llvm;
20
21	#define DEBUG_TYPE "frame-info"
22
23	static cl::opt<bool> EnableSpillVGPRToAGPR(
24	"amdgpu-spill-vgpr-to-agpr",
25	cl::desc ("Enable spilling VGPRs to AGPRs"),
26	cl::ReallyHidden,
27	cl::init(Val: true));
28
29	// Find a register matching \p RC from \p LiveUnits which is unused and
30	// available throughout the function. On failure, returns AMDGPU::NoRegister.
31	// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
32	// MCRegisters. This should reduce the number of iterations and avoid redundant
33	// checking.
34	static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
35	const LiveRegUnits &LiveUnits,
36	const TargetRegisterClass &RC) {
37	for (MCRegister Reg : RC) {
38	if (!MRI.isPhysRegUsed(PhysReg: Reg) && LiveUnits.available(Reg) &&
39	!MRI.isReserved(PhysReg: Reg))
40	return Reg;
41	}
42	return MCRegister ();
43	}
44
45	// Find a scratch register that we can use in the prologue. We avoid using
46	// callee-save registers since they may appear to be free when this is called
47	// from canUseAsPrologue (during shrink wrapping), but then no longer be free
48	// when this is called from emitPrologue.
49	static MCRegister findScratchNonCalleeSaveRegister(
50	MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
51	const TargetRegisterClass &RC, bool Unused = false) {
52	// Mark callee saved registers as used so we will not choose them.
53	const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
54	for (unsigned i = `0`; CSRegs[i]; ++i)
55	LiveUnits.addReg(Reg: CSRegs[i]);
56
57	// We are looking for a register that can be used throughout the entire
58	// function, so any use is unacceptable.
59	if (Unused)
60	return findUnusedRegister(MRI, LiveUnits, RC);
61
62	for (MCRegister Reg : RC) {
63	if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg))
64	return Reg;
65	}
66
67	return MCRegister ();
68	}
69
70	/// Query target location for spilling SGPRs
71	/// \p IncludeScratchCopy : Also look for free scratch SGPRs
72	static void getVGPRSpillLaneOrTempRegister(
73	MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
74	const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
75	bool IncludeScratchCopy = true) {
76	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
77	MachineFrameInfo &FrameInfo = MF.getFrameInfo();
78
79	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
80	const SIRegisterInfo *TRI = ST.getRegisterInfo();
81	unsigned Size = TRI->getSpillSize(RC);
82	Align Alignment = TRI->getSpillAlign(RC);
83
84	// We need to save and restore the given SGPR.
85
86	Register ScratchSGPR;
87	// 1: Try to save the given register into an unused scratch SGPR. The
88	// LiveUnits should have all the callee saved registers marked as used. For
89	// certain cases we skip copy to scratch SGPR.
90	if (IncludeScratchCopy)
91	ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);
92
93	if (!ScratchSGPR) {
94	int FI = FrameInfo.CreateStackObject(Size, Alignment, isSpillSlot: true, Alloca: nullptr,
95	ID: TargetStackID::SGPRSpill);
96
97	if (TRI->spillSGPRToVGPR() &&
98	MFI->allocateSGPRSpillToVGPRLane(MF, FI, /SpillToPhysVGPRLane=/true,
99	/IsPrologEpilog=/true)) {
100	// 2: There's no free lane to spill, and no free register to save the
101	// SGPR, so we're forced to take another VGPR to use for the spill.
102	MFI->addToPrologEpilogSGPRSpills(
103	Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo (
104	SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
105
106	LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
107	dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
108	<< printReg(Spill.VGPR, TRI) << `':'` << Spill.Lane
109	<< `'\n'`;);
110	} else {
111	// Remove dead <FI> index
112	MF.getFrameInfo().RemoveStackObject(ObjectIdx: FI);
113	// 3: If all else fails, spill the register to memory.
114	FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
115	MFI->addToPrologEpilogSGPRSpills(
116	Reg: SGPR,
117	SI: PrologEpilogSGPRSaveRestoreInfo (SGPRSaveKind::SPILL_TO_MEM, FI));
118	LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
119	<< printReg(SGPR, TRI) << `'\n'`);
120	}
121	} else {
122	MFI->addToPrologEpilogSGPRSpills(
123	Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo (
124	SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
125	LiveUnits.addReg(Reg: ScratchSGPR);
126	LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
127	<< printReg(ScratchSGPR, TRI) << `'\n'`);
128	}
129	}
130
131	// We need to specially emit stack operations here because a different frame
132	// register is used than in the rest of the function, as getFrameRegister would
133	// use.
134	static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
135	const SIMachineFunctionInfo &FuncInfo,
136	LiveRegUnits &LiveUnits, MachineFunction &MF,
137	MachineBasicBlock &MBB,
138	MachineBasicBlock::iterator I, const DebugLoc &DL,
139	Register SpillReg, int FI, Register FrameReg,
140	int64_t DwordOff = `0`) {
141	unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
142	: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
143
144	MachineFrameInfo &FrameInfo = MF.getFrameInfo();
145	MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
146	MachineMemOperand *MMO = MF.getMachineMemOperand(
147	PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
148	BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
149	LiveUnits.addReg(Reg: SpillReg);
150	bool IsKill = !MBB.isLiveIn(Reg: SpillReg);
151	TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: IsKill, ScratchOffsetReg: FrameReg,
152	InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
153	if (IsKill)
154	LiveUnits.removeReg(Reg: SpillReg);
155	}
156
157	static void buildEpilogRestore(const GCNSubtarget &ST,
158	const SIRegisterInfo &TRI,
159	const SIMachineFunctionInfo &FuncInfo,
160	LiveRegUnits &LiveUnits, MachineFunction &MF,
161	MachineBasicBlock &MBB,
162	MachineBasicBlock::iterator I,
163	const DebugLoc &DL, Register SpillReg, int FI,
164	Register FrameReg, int64_t DwordOff = `0`) {
165	unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
166	: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
167
168	MachineFrameInfo &FrameInfo = MF.getFrameInfo();
169	MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
170	MachineMemOperand *MMO = MF.getMachineMemOperand(
171	PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
172	BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
173	TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: false, ScratchOffsetReg: FrameReg,
174	InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
175	}
176
177	static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
178	const DebugLoc &DL, const SIInstrInfo *TII,
179	Register TargetReg) {
180	MachineFunction *MF = MBB.getParent();
181	const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
182	const SIRegisterInfo *TRI = &TII->getRegisterInfo();
183	const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
184	Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
185	Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);
186
187	if (MFI->getGITPtrHigh() != `0xffffffff`) {
188	BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetHi)
189	.addImm(Val: MFI->getGITPtrHigh())
190	.addReg(RegNo: TargetReg, flags: RegState::ImplicitDefine);
191	} else {
192	const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
193	BuildMI(BB&: MBB, I, MIMD: DL, MCID: GetPC64, DestReg: TargetReg);
194	}
195	Register GitPtrLo = MFI->getGITPtrLoReg(MF: *MF);
196	MF->getRegInfo().addLiveIn(Reg: GitPtrLo);
197	MBB.addLiveIn(PhysReg: GitPtrLo);
198	BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetLo)
199	.addReg(RegNo: GitPtrLo);
200	}
201
202	static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
203	const SIMachineFunctionInfo *FuncInfo,
204	MachineFunction &MF, MachineBasicBlock &MBB,
205	MachineBasicBlock::iterator MBBI, bool IsProlog) {
206	if (LiveUnits.empty()) {
207	LiveUnits.init(TRI);
208	if (IsProlog) {
209	LiveUnits.addLiveIns(MBB);
210	} else {
211	// In epilog.
212	LiveUnits.addLiveOuts(MBB);
213	LiveUnits.stepBackward(MI: *MBBI);
214	}
215	}
216	}
217
218	namespace llvm {
219
220	// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
221	// BP, etc. These spills are delayed until the current function's frame is
222	// finalized. For a given register, the builder uses the
223	// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
224	class PrologEpilogSGPRSpillBuilder {
225	MachineBasicBlock::iterator MI;
226	MachineBasicBlock &MBB;
227	MachineFunction &MF;
228	const GCNSubtarget &ST;
229	MachineFrameInfo &MFI;
230	SIMachineFunctionInfo *FuncInfo;
231	const SIInstrInfo *TII;
232	const SIRegisterInfo &TRI;
233	Register SuperReg;
234	const PrologEpilogSGPRSaveRestoreInfo SI;
235	LiveRegUnits &LiveUnits;
236	const DebugLoc &DL;
237	Register FrameReg;
238	ArrayRef<int16_t> SplitParts;
239	unsigned NumSubRegs;
240	unsigned EltSize = `4`;
241
242	void saveToMemory(const int FI) const {
243	MachineRegisterInfo &MRI = MF.getRegInfo();
244	assert(!MFI.isDeadObjectIndex(FI));
245
246	initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /IsProlog/ true);
247
248	MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
249	MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
250	if (!TmpVGPR)
251	report_fatal_error(reason: "failed to find free scratch register");
252
253	for (unsigned I = `0`, DwordOff = `0`; I < NumSubRegs; ++I) {
254	Register SubReg = NumSubRegs == `1`
255	? SuperReg
256	: Register(TRI.getSubReg(SuperReg, SplitParts [I]));
257	BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
258	.addReg(SubReg);
259
260	buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, SpillReg: TmpVGPR,
261	FI, FrameReg, DwordOff);
262	DwordOff += `4`;
263	}
264	}
265
266	void saveToVGPRLane(const int FI) const {
267	assert(!MFI.isDeadObjectIndex(FI));
268
269	assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
270	ArrayRef<SIRegisterInfo::SpilledReg> Spill =
271	FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
272	assert(Spill.size() == NumSubRegs);
273
274	for (unsigned I = `0`; I < NumSubRegs; ++I) {
275	Register SubReg = NumSubRegs == `1`
276	? SuperReg
277	: Register(TRI.getSubReg(SuperReg, SplitParts [I]));
278	BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
279	Spill [I].VGPR)
280	.addReg(SubReg)
281	.addImm(Spill [I].Lane)
282	.addReg(Spill [I].VGPR, RegState::Undef);
283	}
284	}
285
286	void copyToScratchSGPR(Register DstReg) const {
287	BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
288	.addReg(SuperReg)
289	.setMIFlag(MachineInstr::FrameSetup);
290	}
291
292	void restoreFromMemory(const int FI) {
293	MachineRegisterInfo &MRI = MF.getRegInfo();
294
295	initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /IsProlog/ false);
296	MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
297	MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
298	if (!TmpVGPR)
299	report_fatal_error(reason: "failed to find free scratch register");
300
301	for (unsigned I = `0`, DwordOff = `0`; I < NumSubRegs; ++I) {
302	Register SubReg = NumSubRegs == `1`
303	? SuperReg
304	: Register(TRI.getSubReg(SuperReg, SplitParts [I]));
305
306	buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL,
307	SpillReg: TmpVGPR, FI, FrameReg, DwordOff);
308	BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
309	.addReg(TmpVGPR, RegState::Kill);
310	DwordOff += `4`;
311	}
312	}
313
314	void restoreFromVGPRLane(const int FI) {
315	assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
316	ArrayRef<SIRegisterInfo::SpilledReg> Spill =
317	FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
318	assert(Spill.size() == NumSubRegs);
319
320	for (unsigned I = `0`; I < NumSubRegs; ++I) {
321	Register SubReg = NumSubRegs == `1`
322	? SuperReg
323	: Register(TRI.getSubReg(SuperReg, SplitParts [I]));
324	BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
325	.addReg(Spill [I].VGPR)
326	.addImm(Spill [I].Lane);
327	}
328	}
329
330	void copyFromScratchSGPR(Register SrcReg) const {
331	BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
332	.addReg(SrcReg)
333	.setMIFlag(MachineInstr::FrameDestroy);
334	}
335
336	public:
337	PrologEpilogSGPRSpillBuilder(Register Reg,
338	const PrologEpilogSGPRSaveRestoreInfo SI,
339	MachineBasicBlock &MBB,
340	MachineBasicBlock::iterator MI,
341	const DebugLoc &DL, const SIInstrInfo *TII,
342	const SIRegisterInfo &TRI,
343	LiveRegUnits &LiveUnits, Register FrameReg)
344	: MI (MI), MBB(MBB), MF(*MBB.getParent()),
345	ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
346	FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
347	SuperReg (Reg), SI (SI), LiveUnits(LiveUnits), DL(DL),
348	FrameReg (FrameReg) {
349	const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
350	SplitParts = TRI.getRegSplitParts(RC, EltSize);
351	NumSubRegs = SplitParts.empty() ? `1` : SplitParts.size();
352
353	assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
354	}
355
356	void save() {
357	switch (SI.getKind()) {
358	case SGPRSaveKind::SPILL_TO_MEM:
359	return saveToMemory(FI: SI.getIndex());
360	case SGPRSaveKind::SPILL_TO_VGPR_LANE:
361	return saveToVGPRLane(FI: SI.getIndex());
362	case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
363	return copyToScratchSGPR(DstReg: SI.getReg());
364	}
365	}
366
367	void restore() {
368	switch (SI.getKind()) {
369	case SGPRSaveKind::SPILL_TO_MEM:
370	return restoreFromMemory(FI: SI.getIndex());
371	case SGPRSaveKind::SPILL_TO_VGPR_LANE:
372	return restoreFromVGPRLane(FI: SI.getIndex());
373	case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
374	return copyFromScratchSGPR(SrcReg: SI.getReg());
375	}
376	}
377	};
378
379	} // namespace llvm
380
381	// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
382	void SIFrameLowering::emitEntryFunctionFlatScratchInit(
383	MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
384	const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
385	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
386	const SIInstrInfo *TII = ST.getInstrInfo();
387	const SIRegisterInfo *TRI = &TII->getRegisterInfo();
388	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
389
390	// We don't need this if we only have spills since there is no user facing
391	// scratch.
392
393	// TODO: If we know we don't have flat instructions earlier, we can omit
394	// this from the input registers.
395	//
396	// TODO: We only need to know if we access scratch space through a flat
397	// pointer. Because we only detect if flat instructions are used at all,
398	// this will be used more often than necessary on VI.
399
400	Register FlatScrInitLo;
401	Register FlatScrInitHi;
402
403	if (ST.isAmdPalOS()) {
404	// Extract the scratch offset from the descriptor in the GIT
405	LiveRegUnits LiveUnits;
406	LiveUnits.init(*TRI);
407	LiveUnits.addLiveIns(MBB);
408
409	// Find unused reg to load flat scratch init into
410	MachineRegisterInfo &MRI = MF.getRegInfo();
411	Register FlatScrInit = AMDGPU::NoRegister;
412	ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
413	unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + `1`) / `2`;
414	AllSGPR64s = AllSGPR64s.slice(
415	N: std::min(a: static_cast<unsigned>(AllSGPR64s.size()), b: NumPreloaded));
416	Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
417	for (MCPhysReg Reg : AllSGPR64s) {
418	if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg) &&
419	MRI.isAllocatable(PhysReg: Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
420	FlatScrInit = Reg;
421	break;
422	}
423	}
424	assert(FlatScrInit && "Failed to find free register for scratch init");
425
426	FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
427	FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
428
429	buildGitPtr(MBB, I, DL, TII, TargetReg: FlatScrInit);
430
431	// We now have the GIT ptr - now get the scratch descriptor from the entry
432	// at offset 0 (or offset 16 for a compute shader).
433	MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
434	const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
435	auto *MMO = MF.getMachineMemOperand(
436	PtrInfo,
437	F: MachineMemOperand::MOLoad \| MachineMemOperand::MOInvariant \|
438	MachineMemOperand::MODereferenceable,
439	Size: `8`, BaseAlignment: Align (`4`));
440	unsigned Offset =
441	MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? `16` : `0`;
442	const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
443	unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
444	BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: FlatScrInit)
445	.addReg(RegNo: FlatScrInit)
446	.addImm(Val: EncodedOffset) // offset
447	.addImm(Val: `0`) // cpol
448	.addMemOperand(MMO);
449
450	// Mask the offset in [47:0] of the descriptor
451	const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
452	auto And = BuildMI(BB&: MBB, I, MIMD: DL, MCID: SAndB32, DestReg: FlatScrInitHi)
453	.addReg(RegNo: FlatScrInitHi)
454	.addImm(Val: `0xffff`);
455	And ->getOperand(i: `3`).setIsDead(); // Mark SCC as dead.
456	} else {
457	Register FlatScratchInitReg =
458	MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
459	assert(FlatScratchInitReg);
460
461	MachineRegisterInfo &MRI = MF.getRegInfo();
462	MRI.addLiveIn(Reg: FlatScratchInitReg);
463	MBB.addLiveIn(PhysReg: FlatScratchInitReg);
464
465	FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
466	FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
467	}
468
469	// Do a 64-bit pointer add.
470	if (ST.flatScratchIsPointer()) {
471	if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
472	BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
473	.addReg(FlatScrInitLo)
474	.addReg(ScratchWaveOffsetReg);
475	auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
476	FlatScrInitHi)
477	.addReg(FlatScrInitHi)
478	.addImm(`0`);
479	Addc->getOperand(`3`).setIsDead(); // Mark SCC as dead.
480
481	using namespace AMDGPU::Hwreg;
482	BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
483	.addReg(FlatScrInitLo)
484	.addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, `0`, `32`)));
485	BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
486	.addReg(FlatScrInitHi)
487	.addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, `0`, `32`)));
488	return;
489	}
490
491	// For GFX9.
492	BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
493	.addReg(FlatScrInitLo)
494	.addReg(ScratchWaveOffsetReg);
495	auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
496	AMDGPU::FLAT_SCR_HI)
497	.addReg(FlatScrInitHi)
498	.addImm(`0`);
499	Addc->getOperand(`3`).setIsDead(); // Mark SCC as dead.
500
501	return;
502	}
503
504	assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
505
506	// Copy the size in bytes.
507	BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
508	.addReg(FlatScrInitHi, RegState::Kill);
509
510	// Add wave offset in bytes to private base offset.
511	// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
512	BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
513	.addReg(FlatScrInitLo)
514	.addReg(ScratchWaveOffsetReg);
515
516	// Convert offset to 256-byte units.
517	auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
518	AMDGPU::FLAT_SCR_HI)
519	.addReg(FlatScrInitLo, RegState::Kill)
520	.addImm(`8`);
521	LShr->getOperand(`3`).setIsDead(); // Mark SCC as dead.
522	}
523
524	// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
525	// memory. They should have been removed by now.
526	static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
527	for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
528	I != E; ++I) {
529	if (!MFI.isDeadObjectIndex(ObjectIdx: I))
530	return false;
531	}
532
533	return true;
534	}
535
536	// Shift down registers reserved for the scratch RSRC.
537	Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
538	MachineFunction &MF) const {
539
540	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
541	const SIInstrInfo *TII = ST.getInstrInfo();
542	const SIRegisterInfo *TRI = &TII->getRegisterInfo();
543	MachineRegisterInfo &MRI = MF.getRegInfo();
544	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
545
546	assert(MFI->isEntryFunction());
547
548	Register ScratchRsrcReg = MFI->getScratchRSrcReg();
549
550	if (!ScratchRsrcReg \|\| (!MRI.isPhysRegUsed(PhysReg: ScratchRsrcReg) &&
551	allStackObjectsAreDead(MFI: MF.getFrameInfo())))
552	return Register ();
553
554	if (ST.hasSGPRInitBug() \|\|
555	ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
556	return ScratchRsrcReg;
557
558	// We reserved the last registers for this. Shift it down to the end of those
559	// which were actually used.
560	//
561	// FIXME: It might be safer to use a pseudoregister before replacement.
562
563	// FIXME: We should be able to eliminate unused input registers. We only
564	// cannot do this for the resources required for scratch access. For now we
565	// skip over user SGPRs and may leave unused holes.
566
567	unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + `3`) / `4`;
568	ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
569	AllSGPR128s = AllSGPR128s.slice(N: std::min(a: static_cast<unsigned>(AllSGPR128s.size()), b: NumPreloaded));
570
571	// Skip the last N reserved elements because they should have already been
572	// reserved for VCC etc.
573	Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
574	for (MCPhysReg Reg : AllSGPR128s) {
575	// Pick the first unallocated one. Make sure we don't clobber the other
576	// reserved input we needed. Also for PAL, make sure we don't clobber
577	// the GIT pointer passed in SGPR0 or SGPR8.
578	if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
579	(!GITPtrLoReg \|\| !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
580	MRI.replaceRegWith(FromReg: ScratchRsrcReg, ToReg: Reg);
581	MFI->setScratchRSrcReg(Reg);
582	return Reg;
583	}
584	}
585
586	return ScratchRsrcReg;
587	}
588
589	static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
590	return ST.enableFlatScratch() ? `1` : ST.getWavefrontSize();
591	}
592
593	void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
594	MachineBasicBlock &MBB) const {
595	assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
596
597	// FIXME: If we only have SGPR spills, we won't actually be using scratch
598	// memory since these spill to VGPRs. We should be cleaning up these unused
599	// SGPR spill frame indices somewhere.
600
601	// FIXME: We still have implicit uses on SGPR spill instructions in case they
602	// need to spill to vector memory. It's likely that will not happen, but at
603	// this point it appears we need the setup. This part of the prolog should be
604	// emitted after frame indices are eliminated.
605
606	// FIXME: Remove all of the isPhysRegUsed checks
607
608	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
609	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
610	const SIInstrInfo *TII = ST.getInstrInfo();
611	const SIRegisterInfo *TRI = &TII->getRegisterInfo();
612	MachineRegisterInfo &MRI = MF.getRegInfo();
613	const Function &F = MF.getFunction();
614	MachineFrameInfo &FrameInfo = MF.getFrameInfo();
615
616	assert(MFI->isEntryFunction());
617
618	Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
619	Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
620
621	// We need to do the replacement of the private segment buffer register even
622	// if there are no stack objects. There could be stores to undef or a
623	// constant without an associated object.
624	//
625	// This will return `Register()` in cases where there are no actual
626	// uses of the SRSRC.
627	Register ScratchRsrcReg;
628	if (!ST.enableFlatScratch())
629	ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
630
631	// Make the selected register live throughout the function.
632	if (ScratchRsrcReg) {
633	for (MachineBasicBlock &OtherBB : MF) {
634	if (&OtherBB != &MBB) {
635	OtherBB.addLiveIn(PhysReg: ScratchRsrcReg);
636	}
637	}
638	}
639
640	// Now that we have fixed the reserved SRSRC we need to locate the
641	// (potentially) preloaded SRSRC.
642	Register PreloadedScratchRsrcReg;
643	if (ST.isAmdHsaOrMesa(F)) {
644	PreloadedScratchRsrcReg =
645	MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
646	if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
647	// We added live-ins during argument lowering, but since they were not
648	// used they were deleted. We're adding the uses now, so add them back.
649	MRI.addLiveIn(Reg: PreloadedScratchRsrcReg);
650	MBB.addLiveIn(PhysReg: PreloadedScratchRsrcReg);
651	}
652	}
653
654	// Debug location must be unknown since the first debug location is used to
655	// determine the end of the prologue.
656	DebugLoc DL;
657	MachineBasicBlock::iterator I = MBB.begin();
658
659	// We found the SRSRC first because it needs four registers and has an
660	// alignment requirement. If the SRSRC that we found is clobbering with
661	// the scratch wave offset, which may be in a fixed SGPR or a free SGPR
662	// chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
663	// wave offset to a free SGPR.
664	Register ScratchWaveOffsetReg;
665	if (PreloadedScratchWaveOffsetReg &&
666	TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
667	ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
668	unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
669	AllSGPRs = AllSGPRs.slice(
670	N: std::min(a: static_cast<unsigned>(AllSGPRs.size()), b: NumPreloaded));
671	Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
672	for (MCPhysReg Reg : AllSGPRs) {
673	if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
674	!TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
675	ScratchWaveOffsetReg = Reg;
676	BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
677	.addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
678	break;
679	}
680	}
681	} else {
682	ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
683	}
684	assert(ScratchWaveOffsetReg \|\| !PreloadedScratchWaveOffsetReg);
685
686	if (requiresStackPointerReference(MF)) {
687	Register SPReg = MFI->getStackPtrOffsetReg();
688	assert(SPReg != AMDGPU::SP_REG);
689	BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
690	.addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
691	}
692
693	if (hasFP(MF)) {
694	Register FPReg = MFI->getFrameOffsetReg();
695	assert(FPReg != AMDGPU::FP_REG);
696	BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(`0`);
697	}
698
699	bool NeedsFlatScratchInit =
700	MFI->getUserSGPRInfo().hasFlatScratchInit() &&
701	(MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) \|\| FrameInfo.hasCalls() \|\|
702	(!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
703
704	if ((NeedsFlatScratchInit \|\| ScratchRsrcReg) &&
705	PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
706	MRI.addLiveIn(Reg: PreloadedScratchWaveOffsetReg);
707	MBB.addLiveIn(PhysReg: PreloadedScratchWaveOffsetReg);
708	}
709
710	if (NeedsFlatScratchInit) {
711	emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
712	}
713
714	if (ScratchRsrcReg) {
715	emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
716	PreloadedPrivateBufferReg: PreloadedScratchRsrcReg,
717	ScratchRsrcReg, ScratchWaveOffsetReg);
718	}
719	}
720
721	// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
722	void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
723	MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
724	const DebugLoc &DL, Register PreloadedScratchRsrcReg,
725	Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
726
727	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
728	const SIInstrInfo *TII = ST.getInstrInfo();
729	const SIRegisterInfo *TRI = &TII->getRegisterInfo();
730	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
731	const Function &Fn = MF.getFunction();
732
733	if (ST.isAmdPalOS()) {
734	// The pointer to the GIT is formed from the offset passed in and either
735	// the amdgpu-git-ptr-high function attribute or the top part of the PC
736	Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
737	Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
738
739	buildGitPtr(MBB, I, DL, TII, TargetReg: Rsrc01);
740
741	// We now have the GIT ptr - now get the scratch descriptor from the entry
742	// at offset 0 (or offset 16 for a compute shader).
743	MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
744	const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
745	auto MMO = MF.getMachineMemOperand(PtrInfo,
746	F: MachineMemOperand::MOLoad \|
747	MachineMemOperand::MOInvariant \|
748	MachineMemOperand::MODereferenceable,
749	Size: `16`, BaseAlignment: Align (`4`));
750	unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? `16` : `0`;
751	const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
752	unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
753	BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX4, DestReg: ScratchRsrcReg)
754	.addReg(RegNo: Rsrc01)
755	.addImm(Val: EncodedOffset) // offset
756	.addImm(Val: `0`) // cpol
757	.addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine)
758	.addMemOperand(MMO);
759
760	// The driver will always set the SRD for wave 64 (bits 118:117 of
761	// descriptor / bits 22:21 of third sub-reg will be 0b11)
762	// If the shader is actually wave32 we have to modify the const_index_stride
763	// field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
764	// reason the driver does this is that there can be cases where it presents
765	// 2 shaders with different wave size (e.g. VsFs).
766	// TODO: convert to using SCRATCH instructions or multiple SRD buffers
767	if (ST.isWave32()) {
768	const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
769	BuildMI(BB&: MBB, I, MIMD: DL, MCID: SBitsetB32, DestReg: Rsrc03)
770	.addImm(Val: `21`)
771	.addReg(RegNo: Rsrc03);
772	}
773	} else if (ST.isMesaGfxShader(F: Fn) \|\| !PreloadedScratchRsrcReg) {
774	assert(!ST.isAmdHsaOrMesa(Fn));
775	const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
776
777	Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
778	Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
779
780	// Use relocations to get the pointer, and setup the other bits manually.
781	uint64_t Rsrc23 = TII->getScratchRsrcWords23();
782
783	if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
784	Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
785
786	if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
787	const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
788
789	BuildMI(BB&: MBB, I, MIMD: DL, MCID: Mov64, DestReg: Rsrc01)
790	.addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
791	.addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
792	} else {
793	const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
794
795	MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
796	auto MMO = MF.getMachineMemOperand(
797	PtrInfo,
798	F: MachineMemOperand::MOLoad \| MachineMemOperand::MOInvariant \|
799	MachineMemOperand::MODereferenceable,
800	Size: `8`, BaseAlignment: Align (`4`));
801	BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: Rsrc01)
802	.addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
803	.addImm(Val: `0`) // offset
804	.addImm(Val: `0`) // cpol
805	.addMemOperand(MMO)
806	.addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
807
808	MF.getRegInfo().addLiveIn(Reg: MFI->getImplicitBufferPtrUserSGPR());
809	MBB.addLiveIn(PhysReg: MFI->getImplicitBufferPtrUserSGPR());
810	}
811	} else {
812	Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
813	Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
814
815	BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc0)
816	.addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD0")
817	.addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
818
819	BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc1)
820	.addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD1")
821	.addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
822	}
823
824	BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc2)
825	.addImm(Val: Rsrc23 & `0xffffffff`)
826	.addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
827
828	BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc3)
829	.addImm(Val: Rsrc23 >> `32`)
830	.addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
831	} else if (ST.isAmdHsaOrMesa(Fn)) {
832	assert(PreloadedScratchRsrcReg);
833
834	if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
835	BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
836	.addReg(PreloadedScratchRsrcReg, RegState::Kill);
837	}
838	}
839
840	// Add the scratch wave offset into the scratch RSRC.
841	//
842	// We only want to update the first 48 bits, which is the base address
843	// pointer, without touching the adjacent 16 bits of flags. We know this add
844	// cannot carry-out from bit 47, otherwise the scratch allocation would be
845	// impossible to fit in the 48-bit global address space.
846	//
847	// TODO: Evaluate if it is better to just construct an SRD using the flat
848	// scratch init and some constants rather than update the one we are passed.
849	Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
850	Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
851
852	// We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
853	// the kernel body via inreg arguments.
854	BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
855	.addReg(ScratchRsrcSub0)
856	.addReg(ScratchWaveOffsetReg)
857	.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
858	auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
859	.addReg(ScratchRsrcSub1)
860	.addImm(`0`)
861	.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
862	Addc->getOperand(`3`).setIsDead(); // Mark SCC as dead.
863	}
864
865	bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
866	switch (ID) {
867	case TargetStackID::Default:
868	case TargetStackID::NoAlloc:
869	case TargetStackID::SGPRSpill:
870	return true;
871	case TargetStackID::ScalableVector:
872	case TargetStackID::WasmLocal:
873	return false;
874	}
875	llvm_unreachable("Invalid TargetStackID::Value");
876	}
877
878	// Activate only the inactive lanes when \p EnableInactiveLanes is true.
879	// Otherwise, activate all lanes. It returns the saved exec.
880	static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
881	MachineFunction &MF,
882	MachineBasicBlock &MBB,
883	MachineBasicBlock::iterator MBBI,
884	const DebugLoc &DL, bool IsProlog,
885	bool EnableInactiveLanes) {
886	Register ScratchExecCopy;
887	MachineRegisterInfo &MRI = MF.getRegInfo();
888	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
889	const SIInstrInfo *TII = ST.getInstrInfo();
890	const SIRegisterInfo &TRI = TII->getRegisterInfo();
891	SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
892
893	initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
894
895	ScratchExecCopy = findScratchNonCalleeSaveRegister(
896	MRI, LiveUnits, RC: *TRI.getWaveMaskRegClass());
897	if (!ScratchExecCopy)
898	report_fatal_error(reason: "failed to find free scratch register");
899
900	LiveUnits.addReg(Reg: ScratchExecCopy);
901
902	const unsigned SaveExecOpc =
903	ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
904	: AMDGPU::S_OR_SAVEEXEC_B32)
905	: (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
906	: AMDGPU::S_OR_SAVEEXEC_B64);
907	auto SaveExec =
908	BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-`1`);
909	SaveExec->getOperand(`3`).setIsDead(); // Mark SCC as dead.
910
911	return ScratchExecCopy;
912	}
913
914	void SIFrameLowering::emitCSRSpillStores(
915	MachineFunction &MF, MachineBasicBlock &MBB,
916	MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
917	Register FrameReg, Register FramePtrRegScratchCopy) const {
918	SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
919	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
920	const SIInstrInfo *TII = ST.getInstrInfo();
921	const SIRegisterInfo &TRI = TII->getRegisterInfo();
922
923	// Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
924	// registers. However, save all lanes of callee-saved VGPRs. Due to this, we
925	// might end up flipping the EXEC bits twice.
926	Register ScratchExecCopy;
927	SmallVector<std::pair<Register, int>, `2`> WWMCalleeSavedRegs, WWMScratchRegs;
928	FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
929	if (!WWMScratchRegs.empty())
930	ScratchExecCopy =
931	buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
932	/IsProlog/ true, /EnableInactiveLanes/ true);
933
934	auto StoreWWMRegisters =
935	[&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
936	for (const auto &Reg : WWMRegs) {
937	Register VGPR = Reg.first;
938	int FI = Reg.second;
939	buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
940	SpillReg: VGPR, FI, FrameReg);
941	}
942	};
943
944	StoreWWMRegisters (WWMScratchRegs);
945	if (!WWMCalleeSavedRegs.empty()) {
946	if (ScratchExecCopy) {
947	unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
948	BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-`1`);
949	} else {
950	ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
951	/IsProlog/ true,
952	/EnableInactiveLanes/ false);
953	}
954	}
955
956	StoreWWMRegisters (WWMCalleeSavedRegs);
957	if (ScratchExecCopy) {
958	// FIXME: Split block and make terminator.
959	unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
960	BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
961	.addReg(ScratchExecCopy, RegState::Kill);
962	LiveUnits.addReg(Reg: ScratchExecCopy);
963	}
964
965	Register FramePtrReg = FuncInfo->getFrameOffsetReg();
966
967	for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
968	// Special handle FP spill:
969	// Skip if FP is saved to a scratch SGPR, the save has already been emitted.
970	// Otherwise, FP has been moved to a temporary register and spill it
971	// instead.
972	Register Reg =
973	Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
974	if (!Reg)
975	continue;
976
977	PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
978	LiveUnits, FrameReg);
979	SB.save();
980	}
981
982	// If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
983	// such scratch registers live throughout the function.
984	SmallVector<Register, `1`> ScratchSGPRs;
985	FuncInfo->getAllScratchSGPRCopyDstRegs(Regs&: ScratchSGPRs);
986	if (!ScratchSGPRs.empty()) {
987	for (MachineBasicBlock &MBB : MF) {
988	for (MCPhysReg Reg : ScratchSGPRs)
989	MBB.addLiveIn(PhysReg: Reg);
990
991	MBB.sortUniqueLiveIns();
992	}
993	if (!LiveUnits.empty()) {
994	for (MCPhysReg Reg : ScratchSGPRs)
995	LiveUnits.addReg(Reg);
996	}
997	}
998	}
999
1000	void SIFrameLowering::emitCSRSpillRestores(
1001	MachineFunction &MF, MachineBasicBlock &MBB,
1002	MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
1003	Register FrameReg, Register FramePtrRegScratchCopy) const {
1004	const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1005	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1006	const SIInstrInfo *TII = ST.getInstrInfo();
1007	const SIRegisterInfo &TRI = TII->getRegisterInfo();
1008	Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1009
1010	for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
1011	// Special handle FP restore:
1012	// Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore
1013	// the FP value to a temporary register. The frame pointer should be
1014	// overwritten only at the end when all other spills are restored from
1015	// current frame.
1016	Register Reg =
1017	Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
1018	if (!Reg)
1019	continue;
1020
1021	PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1022	LiveUnits, FrameReg);
1023	SB.restore();
1024	}
1025
1026	// Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
1027	// scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
1028	// this, we might end up flipping the EXEC bits twice.
1029	Register ScratchExecCopy;
1030	SmallVector<std::pair<Register, int>, `2`> WWMCalleeSavedRegs, WWMScratchRegs;
1031	FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
1032	if (!WWMScratchRegs.empty())
1033	ScratchExecCopy =
1034	buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1035	/IsProlog/ false, /EnableInactiveLanes/ true);
1036
1037	auto RestoreWWMRegisters =
1038	[&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
1039	for (const auto &Reg : WWMRegs) {
1040	Register VGPR = Reg.first;
1041	int FI = Reg.second;
1042	buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
1043	SpillReg: VGPR, FI, FrameReg);
1044	}
1045	};
1046
1047	RestoreWWMRegisters (WWMScratchRegs);
1048	if (!WWMCalleeSavedRegs.empty()) {
1049	if (ScratchExecCopy) {
1050	unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1051	BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-`1`);
1052	} else {
1053	ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1054	/IsProlog/ false,
1055	/EnableInactiveLanes/ false);
1056	}
1057	}
1058
1059	RestoreWWMRegisters (WWMCalleeSavedRegs);
1060	if (ScratchExecCopy) {
1061	// FIXME: Split block and make terminator.
1062	unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1063	BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
1064	.addReg(ScratchExecCopy, RegState::Kill);
1065	}
1066	}
1067
1068	void SIFrameLowering::emitPrologue(MachineFunction &MF,
1069	MachineBasicBlock &MBB) const {
1070	SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1071	if (FuncInfo->isEntryFunction()) {
1072	emitEntryFunctionPrologue(MF, MBB);
1073	return;
1074	}
1075
1076	MachineFrameInfo &MFI = MF.getFrameInfo();
1077	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1078	const SIInstrInfo *TII = ST.getInstrInfo();
1079	const SIRegisterInfo &TRI = TII->getRegisterInfo();
1080	MachineRegisterInfo &MRI = MF.getRegInfo();
1081
1082	Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1083	Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1084	Register BasePtrReg =
1085	TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register ();
1086	LiveRegUnits LiveUnits;
1087
1088	MachineBasicBlock::iterator MBBI = MBB.begin();
1089	// DebugLoc must be unknown since the first instruction with DebugLoc is used
1090	// to determine the end of the prologue.
1091	DebugLoc DL;
1092
1093	if (FuncInfo->isChainFunction()) {
1094	// Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
1095	// are free to set one up if they need it.
1096	bool UseSP = requiresStackPointerReference(MF);
1097	if (UseSP) {
1098	assert(StackPtrReg != AMDGPU::SP_REG);
1099
1100	BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
1101	.addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
1102	}
1103	}
1104
1105	bool HasFP = false;
1106	bool HasBP = false;
1107	uint32_t NumBytes = MFI.getStackSize();
1108	uint32_t RoundedSize = NumBytes;
1109
1110	if (TRI.hasStackRealignment(MF))
1111	HasFP = true;
1112
1113	Register FramePtrRegScratchCopy;
1114	if (!HasFP && !hasFP(MF)) {
1115	// Emit the CSR spill stores with SP base register.
1116	emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
1117	FrameReg: FuncInfo->isChainFunction() ? Register () : StackPtrReg,
1118	FramePtrRegScratchCopy);
1119	} else {
1120	// CSR spill stores will use FP as base register.
1121	Register SGPRForFPSaveRestoreCopy =
1122	FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1123
1124	initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /IsProlog/ true);
1125	if (SGPRForFPSaveRestoreCopy) {
1126	// Copy FP to the scratch register now and emit the CFI entry. It avoids
1127	// the extra FP copy needed in the other two cases when FP is spilled to
1128	// memory or to a VGPR lane.
1129	PrologEpilogSGPRSpillBuilder SB(
1130	FramePtrReg,
1131	FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(Reg: FramePtrReg), MBB, MBBI,
1132	DL, TII, TRI, LiveUnits, FramePtrReg);
1133	SB.save();
1134	LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
1135	} else {
1136	// Copy FP into a new scratch register so that its previous value can be
1137	// spilled after setting up the new frame.
1138	FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1139	MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1140	if (!FramePtrRegScratchCopy)
1141	report_fatal_error(reason: "failed to find free scratch register");
1142
1143	LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
1144	BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
1145	.addReg(FramePtrReg);
1146	}
1147	}
1148
1149	if (HasFP) {
1150	const unsigned Alignment = MFI.getMaxAlign().value();
1151
1152	RoundedSize += Alignment;
1153	if (LiveUnits.empty()) {
1154	LiveUnits.init(TRI);
1155	LiveUnits.addLiveIns(MBB);
1156	}
1157
1158	// s_add_i32 s33, s32, NumBytes
1159	// s_and_b32 s33, s33, 0b111...0000
1160	BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
1161	.addReg(StackPtrReg)
1162	.addImm((Alignment - `1`) * getScratchScaleFactor(ST))
1163	.setMIFlag(MachineInstr::FrameSetup);
1164	auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
1165	.addReg(FramePtrReg, RegState::Kill)
1166	.addImm(-Alignment * getScratchScaleFactor(ST))
1167	.setMIFlag(MachineInstr::FrameSetup);
1168	And->getOperand(`3`).setIsDead(); // Mark SCC as dead.
1169	FuncInfo->setIsStackRealigned(true);
1170	} else if ((HasFP = hasFP(MF))) {
1171	BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1172	.addReg(StackPtrReg)
1173	.setMIFlag(MachineInstr::FrameSetup);
1174	}
1175
1176	// If FP is used, emit the CSR spills with FP base register.
1177	if (HasFP) {
1178	emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
1179	FramePtrRegScratchCopy);
1180	if (FramePtrRegScratchCopy)
1181	LiveUnits.removeReg(Reg: FramePtrRegScratchCopy);
1182	}
1183
1184	// If we need a base pointer, set it up here. It's whatever the value of
1185	// the stack pointer is at this point. Any variable size objects will be
1186	// allocated after this, so we can still use the base pointer to reference
1187	// the incoming arguments.
1188	if ((HasBP = TRI.hasBasePointer(MF))) {
1189	BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
1190	.addReg(StackPtrReg)
1191	.setMIFlag(MachineInstr::FrameSetup);
1192	}
1193
1194	if (HasFP && RoundedSize != `0`) {
1195	auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1196	.addReg(StackPtrReg)
1197	.addImm(RoundedSize * getScratchScaleFactor(ST))
1198	.setMIFlag(MachineInstr::FrameSetup);
1199	Add->getOperand(`3`).setIsDead(); // Mark SCC as dead.
1200	}
1201
1202	bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);
1203	(void)FPSaved;
1204	assert((!HasFP \|\| FPSaved) &&
1205	"Needed to save FP but didn't save it anywhere");
1206
1207	// If we allow spilling to AGPRs we may have saved FP but then spill
1208	// everything into AGPRs instead of the stack.
1209	assert((HasFP \|\| !FPSaved \|\| EnableSpillVGPRToAGPR) &&
1210	"Saved FP but didn't need it");
1211
1212	bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: BasePtrReg);
1213	(void)BPSaved;
1214	assert((!HasBP \|\| BPSaved) &&
1215	"Needed to save BP but didn't save it anywhere");
1216
1217	assert((HasBP \|\| !BPSaved) && "Saved BP but didn't need it");
1218	}
1219
1220	void SIFrameLowering::emitEpilogue(MachineFunction &MF,
1221	MachineBasicBlock &MBB) const {
1222	const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1223	if (FuncInfo->isEntryFunction())
1224	return;
1225
1226	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1227	const SIInstrInfo *TII = ST.getInstrInfo();
1228	const SIRegisterInfo &TRI = TII->getRegisterInfo();
1229	MachineRegisterInfo &MRI = MF.getRegInfo();
1230	LiveRegUnits LiveUnits;
1231	// Get the insert location for the epilogue. If there were no terminators in
1232	// the block, get the last instruction.
1233	MachineBasicBlock::iterator MBBI = MBB.end();
1234	DebugLoc DL;
1235	if (!MBB.empty()) {
1236	MBBI = MBB.getLastNonDebugInstr();
1237	if (MBBI != MBB.end())
1238	DL = MBBI ->getDebugLoc();
1239
1240	MBBI = MBB.getFirstTerminator();
1241	}
1242
1243	const MachineFrameInfo &MFI = MF.getFrameInfo();
1244	uint32_t NumBytes = MFI.getStackSize();
1245	uint32_t RoundedSize = FuncInfo->isStackRealigned()
1246	? NumBytes + MFI.getMaxAlign().value()
1247	: NumBytes;
1248	const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1249	Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1250	bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);
1251
1252	Register FramePtrRegScratchCopy;
1253	Register SGPRForFPSaveRestoreCopy =
1254	FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1255	if (FPSaved) {
1256	// CSR spill restores should use FP as base register. If
1257	// SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
1258	// into a new scratch register and copy to FP later when other registers are
1259	// restored from the current stack frame.
1260	initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /IsProlog/ false);
1261	if (SGPRForFPSaveRestoreCopy) {
1262	LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
1263	} else {
1264	FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1265	MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1266	if (!FramePtrRegScratchCopy)
1267	report_fatal_error(reason: "failed to find free scratch register");
1268
1269	LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
1270	}
1271
1272	emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
1273	FramePtrRegScratchCopy);
1274	}
1275
1276	if (RoundedSize != `0` && hasFP(MF)) {
1277	auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1278	.addReg(StackPtrReg)
1279	.addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
1280	.setMIFlag(MachineInstr::FrameDestroy);
1281	Add->getOperand(`3`).setIsDead(); // Mark SCC as dead.
1282	}
1283
1284	if (FPSaved) {
1285	// Insert the copy to restore FP.
1286	Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
1287	: FramePtrRegScratchCopy;
1288	MachineInstrBuilder MIB =
1289	BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1290	.addReg(SrcReg);
1291	if (SGPRForFPSaveRestoreCopy)
1292	MIB.setMIFlag(MachineInstr::FrameDestroy);
1293	} else {
1294	// Insert the CSR spill restores with SP as the base register.
1295	emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: StackPtrReg,
1296	FramePtrRegScratchCopy);
1297	}
1298	}
1299
1300	#ifndef NDEBUG
1301	static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1302	const MachineFrameInfo &MFI = MF.getFrameInfo();
1303	const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1304	for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1305	I != E; ++I) {
1306	if (!MFI.isDeadObjectIndex(ObjectIdx: I) &&
1307	MFI.getStackID(ObjectIdx: I) == TargetStackID::SGPRSpill &&
1308	!FuncInfo->checkIndexInPrologEpilogSGPRSpills(FI: I)) {
1309	return false;
1310	}
1311	}
1312
1313	return true;
1314	}
1315	#endif
1316
1317	StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1318	int FI,
1319	Register &FrameReg) const {
1320	const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1321
1322	FrameReg = RI->getFrameRegister(MF);
1323	return StackOffset::getFixed(Fixed: MF.getFrameInfo().getObjectOffset(ObjectIdx: FI));
1324	}
1325
1326	void SIFrameLowering::processFunctionBeforeFrameFinalized(
1327	MachineFunction &MF,
1328	RegScavenger RS) const* {
1329	MachineFrameInfo &MFI = MF.getFrameInfo();
1330
1331	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1332	const SIInstrInfo *TII = ST.getInstrInfo();
1333	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1334	MachineRegisterInfo &MRI = MF.getRegInfo();
1335	SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1336
1337	// Allocate spill slots for WWM reserved VGPRs.
1338	// For chain functions, we only need to do this if we have calls to
1339	// llvm.amdgcn.cs.chain.
1340	bool IsChainWithoutCalls =
1341	FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
1342	if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
1343	for (Register Reg : FuncInfo->getWWMReservedRegs()) {
1344	const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1345	FuncInfo->allocateWWMSpill(MF, VGPR: Reg, Size: TRI->getSpillSize(*RC),
1346	Alignment: TRI->getSpillAlign(*RC));
1347	}
1348	}
1349
1350	const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1351	&& EnableSpillVGPRToAGPR;
1352
1353	if (SpillVGPRToAGPR) {
1354	// To track the spill frame indices handled in this pass.
1355	BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1356	BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
1357
1358	bool SeenDbgInstr = false;
1359
1360	for (MachineBasicBlock &MBB : MF) {
1361	for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) {
1362	int FrameIndex;
1363	if (MI.isDebugInstr())
1364	SeenDbgInstr = true;
1365
1366	if (TII->isVGPRSpill(MI)) {
1367	// Try to eliminate stack used by VGPR spills before frame
1368	// finalization.
1369	unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1370	AMDGPU::OpName::vaddr);
1371	int FI = MI.getOperand(i: FIOp).getIndex();
1372	Register VReg =
1373	TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
1374	if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1375	isAGPRtoVGPR: TRI->isAGPR(MRI, Reg: VReg))) {
1376	assert(RS != nullptr);
1377	RS->enterBasicBlockEnd(MBB);
1378	RS->backward(I: std::next(x: MI.getIterator()));
1379	TRI->eliminateFrameIndex(MI, SPAdj: `0`, FIOperandNum: FIOp, RS);
1380	SpillFIs.set(FI);
1381	continue;
1382	}
1383	} else if (TII->isStoreToStackSlot(MI, FrameIndex) \|\|
1384	TII->isLoadFromStackSlot(MI, FrameIndex))
1385	if (!MFI.isFixedObjectIndex(ObjectIdx: FrameIndex))
1386	NonVGPRSpillFIs.set(FrameIndex);
1387	}
1388	}
1389
1390	// Stack slot coloring may assign different objects to the same stack slot.
1391	// If not, then the VGPR to AGPR spill slot is dead.
1392	for (unsigned FI : SpillFIs.set_bits())
1393	if (!NonVGPRSpillFIs.test(Idx: FI))
1394	FuncInfo->setVGPRToAGPRSpillDead(FI);
1395
1396	for (MachineBasicBlock &MBB : MF) {
1397	for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1398	MBB.addLiveIn(PhysReg: Reg);
1399
1400	for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1401	MBB.addLiveIn(PhysReg: Reg);
1402
1403	MBB.sortUniqueLiveIns();
1404
1405	if (!SpillFIs.empty() && SeenDbgInstr) {
1406	// FIXME: The dead frame indices are replaced with a null register from
1407	// the debug value instructions. We should instead, update it with the
1408	// correct register value. But not sure the register value alone is
1409	for (MachineInstr &MI : MBB) {
1410	if (MI.isDebugValue() && MI.getOperand(i: `0`).isFI() &&
1411	!MFI.isFixedObjectIndex(ObjectIdx: MI.getOperand(i: `0`).getIndex()) &&
1412	SpillFIs [MI.getOperand(i: `0`).getIndex()]) {
1413	MI.getOperand(i: `0`).ChangeToRegister(Reg: Register (), isDef: false /isDef/);
1414	}
1415	}
1416	}
1417	}
1418	}
1419
1420	// At this point we've already allocated all spilled SGPRs to VGPRs if we
1421	// can. Any remaining SGPR spills will go to memory, so move them back to the
1422	// default stack.
1423	bool HaveSGPRToVMemSpill =
1424	FuncInfo->removeDeadFrameIndices(MFI, /ResetSGPRSpillStackIDs/ true);
1425	assert(allSGPRSpillsAreDead(MF) &&
1426	"SGPR spill should have been removed in SILowerSGPRSpills");
1427
1428	// FIXME: The other checks should be redundant with allStackObjectsAreDead,
1429	// but currently hasNonSpillStackObjects is set only from source
1430	// allocas. Stack temps produced from legalization are not counted currently.
1431	if (!allStackObjectsAreDead(MFI)) {
1432	assert(RS && "RegScavenger required if spilling");
1433
1434	// Add an emergency spill slot
1435	RS->addScavengingFrameIndex(FI: FuncInfo->getScavengeFI(MFI, TRI: *TRI));
1436
1437	// If we are spilling SGPRs to memory with a large frame, we may need a
1438	// second VGPR emergency frame index.
1439	if (HaveSGPRToVMemSpill &&
1440	allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1441	RS->addScavengingFrameIndex(FI: MFI.CreateStackObject(Size: `4`, Alignment: Align (`4`), isSpillSlot: false));
1442	}
1443	}
1444	}
1445
1446	void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1447	MachineFunction &MF, RegScavenger RS) const* {
1448	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1449	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1450	MachineRegisterInfo &MRI = MF.getRegInfo();
1451	SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1452
1453	if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1454	// On gfx908, we had initially reserved highest available VGPR for AGPR
1455	// copy. Now since we are done with RA, check if there exist an unused VGPR
1456	// which is lower than the eariler reserved VGPR before RA. If one exist,
1457	// use it for AGPR copy instead of one reserved before RA.
1458	Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1459	Register UnusedLowVGPR =
1460	TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
1461	if (UnusedLowVGPR && (TRI->getHWRegIndex(Reg: UnusedLowVGPR) <
1462	TRI->getHWRegIndex(Reg: VGPRForAGPRCopy))) {
1463	// Reserve this newly identified VGPR (for AGPR copy)
1464	// reserved registers should already be frozen at this point
1465	// so we can avoid calling MRI.freezeReservedRegs and just use
1466	// MRI.reserveReg
1467	FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1468	MRI.reserveReg(UnusedLowVGPR, TRI);
1469	}
1470	}
1471	// We initally reserved the highest available SGPR pair for long branches
1472	// now, after RA, we shift down to a lower unused one if one exists
1473	Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
1474	Register UnusedLowSGPR =
1475	TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
1476	// If LongBranchReservedReg is null then we didn't find a long branch
1477	// and never reserved a register to begin with so there is nothing to
1478	// shift down. Then if UnusedLowSGPR is null, there isn't available lower
1479	// register to use so just keep the original one we set.
1480	if (LongBranchReservedReg && UnusedLowSGPR) {
1481	FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
1482	MRI.reserveReg(UnusedLowSGPR, TRI);
1483	}
1484	}
1485
1486	// The special SGPR spills like the one needed for FP, BP or any reserved
1487	// registers delayed until frame lowering.
1488	void SIFrameLowering::determinePrologEpilogSGPRSaves(
1489	MachineFunction &MF, BitVector &SavedVGPRs,
1490	bool NeedExecCopyReservedReg) const {
1491	MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1492	MachineRegisterInfo &MRI = MF.getRegInfo();
1493	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1494	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1495	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1496	LiveRegUnits LiveUnits;
1497	LiveUnits.init(*TRI);
1498	// Initially mark callee saved registers as used so we will not choose them
1499	// while looking for scratch SGPRs.
1500	const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1501	for (unsigned I = `0`; CSRegs[I]; ++I)
1502	LiveUnits.addReg(Reg: CSRegs[I]);
1503
1504	const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
1505
1506	Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
1507	if (NeedExecCopyReservedReg \|\|
1508	(ReservedRegForExecCopy &&
1509	MRI.isPhysRegUsed(PhysReg: ReservedRegForExecCopy, /SkipRegMaskTest=/true))) {
1510	MRI.reserveReg(ReservedRegForExecCopy, TRI);
1511	Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
1512	if (UnusedScratchReg) {
1513	// If found any unused scratch SGPR, reserve the register itself for Exec
1514	// copy and there is no need for any spills in that case.
1515	MFI->setSGPRForEXECCopy(UnusedScratchReg);
1516	MRI.replaceRegWith(FromReg: ReservedRegForExecCopy, ToReg: UnusedScratchReg);
1517	LiveUnits.addReg(Reg: UnusedScratchReg);
1518	} else {
1519	// Needs spill.
1520	assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
1521	"Re-reserving spill slot for EXEC copy register");
1522	getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: ReservedRegForExecCopy, RC,
1523	/IncludeScratchCopy=/false);
1524	}
1525	} else if (ReservedRegForExecCopy) {
1526	// Reset it at this point. There are no whole-wave copies and spills
1527	// encountered.
1528	MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
1529	}
1530
1531	// hasFP only knows about stack objects that already exist. We're now
1532	// determining the stack slots that will be created, so we have to predict
1533	// them. Stack objects force FP usage with calls.
1534	//
1535	// Note a new VGPR CSR may be introduced if one is used for the spill, but we
1536	// don't want to report it here.
1537	//
1538	// FIXME: Is this really hasReservedCallFrame?
1539	const bool WillHaveFP =
1540	FrameInfo.hasCalls() &&
1541	(SavedVGPRs.any() \|\| !allStackObjectsAreDead(MFI: FrameInfo));
1542
1543	if (WillHaveFP \|\| hasFP(MF)) {
1544	Register FramePtrReg = MFI->getFrameOffsetReg();
1545	assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1546	"Re-reserving spill slot for FP");
1547	getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: FramePtrReg);
1548	}
1549
1550	if (TRI->hasBasePointer(MF)) {
1551	Register BasePtrReg = TRI->getBaseRegister();
1552	assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1553	"Re-reserving spill slot for BP");
1554	getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: BasePtrReg);
1555	}
1556	}
1557
1558	// Only report VGPRs to generic code.
1559	void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1560	BitVector &SavedVGPRs,
1561	RegScavenger RS) const* {
1562	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1563
1564	// If this is a function with the amdgpu_cs_chain[_preserve] calling
1565	// convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
1566	// we don't need to save and restore anything.
1567	if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
1568	return;
1569
1570	MFI->shiftSpillPhysVGPRsToLowestRange(MF);
1571
1572	TargetFrameLowering::determineCalleeSaves(MF, SavedRegs&: SavedVGPRs, RS);
1573	if (MFI->isEntryFunction())
1574	return;
1575
1576	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1577	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1578	const SIInstrInfo *TII = ST.getInstrInfo();
1579	bool NeedExecCopyReservedReg = false;
1580
1581	MachineInstr ReturnMI = nullptr*;
1582	for (MachineBasicBlock &MBB : MF) {
1583	for (MachineInstr &MI : MBB) {
1584	// WRITELANE instructions used for SGPR spills can overwrite the inactive
1585	// lanes of VGPRs and callee must spill and restore them even if they are
1586	// marked Caller-saved.
1587
1588	// TODO: Handle this elsewhere at an early point. Walking through all MBBs
1589	// here would be a bad heuristic. A better way should be by calling
1590	// allocateWWMSpill during the regalloc pipeline whenever a physical
1591	// register is allocated for the intended virtual registers.
1592	if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
1593	MFI->allocateWWMSpill(MF, VGPR: MI.getOperand(i: `0`).getReg());
1594	else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
1595	MFI->allocateWWMSpill(MF, VGPR: MI.getOperand(i: `1`).getReg());
1596	else if (TII->isWWMRegSpillOpcode(Opcode: MI.getOpcode()))
1597	NeedExecCopyReservedReg = true;
1598	else if (MI.getOpcode() == AMDGPU::SI_RETURN \|\|
1599	MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG \|\|
1600	(MFI->isChainFunction() &&
1601	TII->isChainCallOpcode(MI.getOpcode()))) {
1602	// We expect all return to be the same size.
1603	assert(!ReturnMI \|\|
1604	(count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1605	count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1606	ReturnMI = &MI;
1607	}
1608	}
1609	}
1610
1611	// Remove any VGPRs used in the return value because these do not need to be saved.
1612	// This prevents CSR restore from clobbering return VGPRs.
1613	if (ReturnMI) {
1614	for (auto &Op : ReturnMI->operands()) {
1615	if (Op.isReg())
1616	SavedVGPRs.reset(Idx: Op.getReg());
1617	}
1618	}
1619
1620	// Ignore the SGPRs the default implementation found.
1621	SavedVGPRs.clearBitsNotInMask(Mask: TRI->getAllVectorRegMask());
1622
1623	// Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1624	// In gfx908 there was do AGPR loads and stores and thus spilling also
1625	// require a temporary VGPR.
1626	if (!ST.hasGFX90AInsts())
1627	SavedVGPRs.clearBitsInMask(Mask: TRI->getAllAGPRRegMask());
1628
1629	determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
1630
1631	// The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
1632	// allow the default insertion to handle them.
1633	for (auto &Reg : MFI->getWWMSpills())
1634	SavedVGPRs.reset(Idx: Reg.first);
1635
1636	// Mark all lane VGPRs as BB LiveIns.
1637	for (MachineBasicBlock &MBB : MF) {
1638	for (auto &Reg : MFI->getWWMSpills())
1639	MBB.addLiveIn(PhysReg: Reg.first);
1640
1641	MBB.sortUniqueLiveIns();
1642	}
1643	}
1644
1645	void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1646	BitVector &SavedRegs,
1647	RegScavenger RS) const* {
1648	TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1649	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1650	if (MFI->isEntryFunction())
1651	return;
1652
1653	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1654	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1655
1656	// The SP is specifically managed and we don't want extra spills of it.
1657	SavedRegs.reset(Idx: MFI->getStackPtrOffsetReg());
1658
1659	const BitVector AllSavedRegs = SavedRegs;
1660	SavedRegs.clearBitsInMask(Mask: TRI->getAllVectorRegMask());
1661
1662	// We have to anticipate introducing CSR VGPR spills or spill of caller
1663	// save VGPR reserved for SGPR spills as we now always create stack entry
1664	// for it, if we don't have any stack objects already, since we require a FP
1665	// if there is a call and stack. We will allocate a VGPR for SGPR spills if
1666	// there are any SGPR spills. Whether they are CSR spills or otherwise.
1667	MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1668	const bool WillHaveFP =
1669	FrameInfo.hasCalls() && (AllSavedRegs.any() \|\| MFI->hasSpilledSGPRs());
1670
1671	// FP will be specially managed like SP.
1672	if (WillHaveFP \|\| hasFP(MF))
1673	SavedRegs.reset(Idx: MFI->getFrameOffsetReg());
1674
1675	// Return address use with return instruction is hidden through the SI_RETURN
1676	// pseudo. Given that and since the IPRA computes actual register usage and
1677	// does not use CSR list, the clobbering of return address by function calls
1678	// (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register
1679	// usage collection. This will ensure save/restore of return address happens
1680	// in those scenarios.
1681	const MachineRegisterInfo &MRI = MF.getRegInfo();
1682	Register RetAddrReg = TRI->getReturnAddressReg(MF);
1683	if (!MFI->isEntryFunction() &&
1684	(FrameInfo.hasCalls() \|\| MRI.isPhysRegModified(PhysReg: RetAddrReg))) {
1685	SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1686	SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1687	}
1688	}
1689
1690	bool SIFrameLowering::assignCalleeSavedSpillSlots(
1691	MachineFunction &MF, const TargetRegisterInfo *TRI,
1692	std::vector<CalleeSavedInfo> &CSI) const {
1693	if (CSI.empty())
1694	return true; // Early exit if no callee saved registers are modified!
1695
1696	const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1697	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1698	const SIRegisterInfo *RI = ST.getRegisterInfo();
1699	Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1700	Register BasePtrReg = RI->getBaseRegister();
1701	Register SGPRForFPSaveRestoreCopy =
1702	FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1703	Register SGPRForBPSaveRestoreCopy =
1704	FuncInfo->getScratchSGPRCopyDstReg(Reg: BasePtrReg);
1705	if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1706	return false;
1707
1708	unsigned NumModifiedRegs = `0`;
1709
1710	if (SGPRForFPSaveRestoreCopy)
1711	NumModifiedRegs++;
1712	if (SGPRForBPSaveRestoreCopy)
1713	NumModifiedRegs++;
1714
1715	for (auto &CS : CSI) {
1716	if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
1717	CS.setDstReg(SGPRForFPSaveRestoreCopy);
1718	if (--NumModifiedRegs)
1719	break;
1720	} else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
1721	CS.setDstReg(SGPRForBPSaveRestoreCopy);
1722	if (--NumModifiedRegs)
1723	break;
1724	}
1725	}
1726
1727	return false;
1728	}
1729
1730	bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1731	const MachineFunction &MF) const {
1732
1733	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1734	const MachineFrameInfo &MFI = MF.getFrameInfo();
1735	const SIInstrInfo *TII = ST.getInstrInfo();
1736	uint64_t EstStackSize = MFI.estimateStackSize(MF);
1737	uint64_t MaxOffset = EstStackSize - `1`;
1738
1739	// We need the emergency stack slots to be allocated in range of the
1740	// MUBUF/flat scratch immediate offset from the base register, so assign these
1741	// first at the incoming SP position.
1742	//
1743	// TODO: We could try sorting the objects to find a hole in the first bytes
1744	// rather than allocating as close to possible. This could save a lot of space
1745	// on frames with alignment requirements.
1746	if (ST.enableFlatScratch()) {
1747	if (TII->isLegalFLATOffset(Offset: MaxOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
1748	FlatVariant: SIInstrFlags::FlatScratch))
1749	return false;
1750	} else {
1751	if (TII->isLegalMUBUFImmOffset(Imm: MaxOffset))
1752	return false;
1753	}
1754
1755	return true;
1756	}
1757
1758	MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1759	MachineFunction &MF,
1760	MachineBasicBlock &MBB,
1761	MachineBasicBlock::iterator I) const {
1762	int64_t Amount = I ->getOperand(i: `0`).getImm();
1763	if (Amount == `0`)
1764	return MBB.erase(I);
1765
1766	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1767	const SIInstrInfo *TII = ST.getInstrInfo();
1768	const DebugLoc &DL = I ->getDebugLoc();
1769	unsigned Opc = I ->getOpcode();
1770	bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1771	uint64_t CalleePopAmount = IsDestroy ? I ->getOperand(i: `1`).getImm() : `0`;
1772
1773	if (!hasReservedCallFrame(MF)) {
1774	Amount = alignTo(Size: Amount, A: getStackAlign());
1775	assert(isUInt<`32`>(Amount) && "exceeded stack address space size");
1776	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1777	Register SPReg = MFI->getStackPtrOffsetReg();
1778
1779	Amount *= getScratchScaleFactor(ST);
1780	if (IsDestroy)
1781	Amount = -Amount;
1782	auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1783	.addReg(SPReg)
1784	.addImm(Amount);
1785	Add->getOperand(`3`).setIsDead(); // Mark SCC as dead.
1786	} else if (CalleePopAmount != `0`) {
1787	llvm_unreachable("is this used?");
1788	}
1789
1790	return MBB.erase(I);
1791	}
1792
1793	/// Returns true if the frame will require a reference to the stack pointer.
1794	///
1795	/// This is the set of conditions common to setting up the stack pointer in a
1796	/// kernel, and for using a frame pointer in a callable function.
1797	///
1798	/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1799	/// references SP.
1800	static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1801	return MFI.hasVarSizedObjects() \|\| MFI.hasStackMap() \|\| MFI.hasPatchPoint();
1802	}
1803
1804	// The FP for kernels is always known 0, so we never really need to setup an
1805	// explicit register for it. However, DisableFramePointerElim will force us to
1806	// use a register for it.
1807	bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1808	const MachineFrameInfo &MFI = MF.getFrameInfo();
1809
1810	// For entry & chain functions we can use an immediate offset in most cases,
1811	// so the presence of calls doesn't imply we need a distinct frame pointer.
1812	if (MFI.hasCalls() &&
1813	!MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1814	!MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
1815	// All offsets are unsigned, so need to be addressed in the same direction
1816	// as stack growth.
1817
1818	// FIXME: This function is pretty broken, since it can be called before the
1819	// frame layout is determined or CSR spills are inserted.
1820	return MFI.getStackSize() != `0`;
1821	}
1822
1823	return frameTriviallyRequiresSP(MFI) \|\| MFI.isFrameAddressTaken() \|\|
1824	MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1825	MF) \|\|
1826	MF.getTarget().Options.DisableFramePointerElim(MF);
1827	}
1828
1829	// This is essentially a reduced version of hasFP for entry functions. Since the
1830	// stack pointer is known 0 on entry to kernels, we never really need an FP
1831	// register. We may need to initialize the stack pointer depending on the frame
1832	// properties, which logically overlaps many of the cases where an ordinary
1833	// function would require an FP.
1834	// Also used for chain functions. While not technically entry functions, chain
1835	// functions may need to set up a stack pointer in some situations.
1836	bool SIFrameLowering::requiresStackPointerReference(
1837	const MachineFunction &MF) const {
1838	// Callable functions always require a stack pointer reference.
1839	assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() \|\|
1840	MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
1841	"only expected to call this for entry points and chain functions");
1842
1843	const MachineFrameInfo &MFI = MF.getFrameInfo();
1844
1845	// Entry points ordinarily don't need to initialize SP. We have to set it up
1846	// for callees if there are any. Also note tail calls are impossible/don't
1847	// make any sense for kernels.
1848	if (MFI.hasCalls())
1849	return true;
1850
1851	// We still need to initialize the SP if we're doing anything weird that
1852	// references the SP, like variable sized stack objects.
1853	return frameTriviallyRequiresSP(MFI);
1854	}
1855

source code of llvm/lib/Target/AMDGPU/SIFrameLowering.cpp