1//===----------------------- SIFrameLowering.cpp --------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8
9#include "SIFrameLowering.h"
10#include "AMDGPU.h"
11#include "GCNSubtarget.h"
12#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13#include "SIMachineFunctionInfo.h"
14#include "llvm/CodeGen/LiveRegUnits.h"
15#include "llvm/CodeGen/MachineFrameInfo.h"
16#include "llvm/CodeGen/RegisterScavenging.h"
17#include "llvm/Target/TargetMachine.h"
18
19using namespace llvm;
20
21#define DEBUG_TYPE "frame-info"
22
23static cl::opt<bool> EnableSpillVGPRToAGPR(
24 "amdgpu-spill-vgpr-to-agpr",
25 cl::desc("Enable spilling VGPRs to AGPRs"),
26 cl::ReallyHidden,
27 cl::init(Val: true));
28
29// Find a register matching \p RC from \p LiveUnits which is unused and
30// available throughout the function. On failure, returns AMDGPU::NoRegister.
31// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
32// MCRegisters. This should reduce the number of iterations and avoid redundant
33// checking.
34static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
35 const LiveRegUnits &LiveUnits,
36 const TargetRegisterClass &RC) {
37 for (MCRegister Reg : RC) {
38 if (!MRI.isPhysRegUsed(PhysReg: Reg) && LiveUnits.available(Reg) &&
39 !MRI.isReserved(PhysReg: Reg))
40 return Reg;
41 }
42 return MCRegister();
43}
44
45// Find a scratch register that we can use in the prologue. We avoid using
46// callee-save registers since they may appear to be free when this is called
47// from canUseAsPrologue (during shrink wrapping), but then no longer be free
48// when this is called from emitPrologue.
49static MCRegister findScratchNonCalleeSaveRegister(
50 MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
51 const TargetRegisterClass &RC, bool Unused = false) {
52 // Mark callee saved registers as used so we will not choose them.
53 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
54 for (unsigned i = 0; CSRegs[i]; ++i)
55 LiveUnits.addReg(Reg: CSRegs[i]);
56
57 // We are looking for a register that can be used throughout the entire
58 // function, so any use is unacceptable.
59 if (Unused)
60 return findUnusedRegister(MRI, LiveUnits, RC);
61
62 for (MCRegister Reg : RC) {
63 if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg))
64 return Reg;
65 }
66
67 return MCRegister();
68}
69
70/// Query target location for spilling SGPRs
71/// \p IncludeScratchCopy : Also look for free scratch SGPRs
72static void getVGPRSpillLaneOrTempRegister(
73 MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
74 const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
75 bool IncludeScratchCopy = true) {
76 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
77 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
78
79 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
80 const SIRegisterInfo *TRI = ST.getRegisterInfo();
81 unsigned Size = TRI->getSpillSize(RC);
82 Align Alignment = TRI->getSpillAlign(RC);
83
84 // We need to save and restore the given SGPR.
85
86 Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee-saved registers marked as used. In
  // some cases the copy to a scratch SGPR is skipped.
90 if (IncludeScratchCopy)
91 ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);
92
93 if (!ScratchSGPR) {
94 int FI = FrameInfo.CreateStackObject(Size, Alignment, isSpillSlot: true, Alloca: nullptr,
95 ID: TargetStackID::SGPRSpill);
96
97 if (TRI->spillSGPRToVGPR() &&
98 MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
99 /*IsPrologEpilog=*/true)) {
      // 2: There was no free SGPR to hold the value, so spill it to a VGPR
      // lane instead, taking another VGPR for the spill if necessary.
102 MFI->addToPrologEpilogSGPRSpills(
103 Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo(
104 SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
105
106 LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
107 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
108 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
109 << '\n';);
110 } else {
111 // Remove dead <FI> index
112 MF.getFrameInfo().RemoveStackObject(ObjectIdx: FI);
113 // 3: If all else fails, spill the register to memory.
114 FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
115 MFI->addToPrologEpilogSGPRSpills(
116 Reg: SGPR,
117 SI: PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
118 LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
119 << printReg(SGPR, TRI) << '\n');
120 }
121 } else {
122 MFI->addToPrologEpilogSGPRSpills(
123 Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo(
124 SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
125 LiveUnits.addReg(Reg: ScratchSGPR);
126 LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
127 << printReg(ScratchSGPR, TRI) << '\n');
128 }
129}
130
// We need to emit stack operations specially here because the frame register
// used differs from the one getFrameRegister would return for the rest of the
// function.
134static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
135 const SIMachineFunctionInfo &FuncInfo,
136 LiveRegUnits &LiveUnits, MachineFunction &MF,
137 MachineBasicBlock &MBB,
138 MachineBasicBlock::iterator I, const DebugLoc &DL,
139 Register SpillReg, int FI, Register FrameReg,
140 int64_t DwordOff = 0) {
141 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
142 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
143
144 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
145 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
146 MachineMemOperand *MMO = MF.getMachineMemOperand(
147 PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
148 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
149 LiveUnits.addReg(Reg: SpillReg);
150 bool IsKill = !MBB.isLiveIn(Reg: SpillReg);
151 TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: IsKill, ScratchOffsetReg: FrameReg,
152 InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
153 if (IsKill)
154 LiveUnits.removeReg(Reg: SpillReg);
155}
156
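// Reload one dword of \p SpillReg from its stack slot in the epilogue,
// mirroring buildPrologSpill above.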
157static void buildEpilogRestore(const GCNSubtarget &ST,
158 const SIRegisterInfo &TRI,
159 const SIMachineFunctionInfo &FuncInfo,
160 LiveRegUnits &LiveUnits, MachineFunction &MF,
161 MachineBasicBlock &MBB,
162 MachineBasicBlock::iterator I,
163 const DebugLoc &DL, Register SpillReg, int FI,
164 Register FrameReg, int64_t DwordOff = 0) {
165 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
166 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
167
168 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
169 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
170 MachineMemOperand *MMO = MF.getMachineMemOperand(
171 PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FI),
172 BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI));
173 TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: false, ScratchOffsetReg: FrameReg,
174 InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits);
175}
176
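// Materialize the pointer to the PAL global information table (GIT) in
// \p TargetReg: the high half comes from the amdgpu-git-ptr-high attribute
// when it is set (otherwise from the current PC via S_GETPC), and the low
// half from the preloaded GIT-pointer SGPR.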
177static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
178 const DebugLoc &DL, const SIInstrInfo *TII,
179 Register TargetReg) {
180 MachineFunction *MF = MBB.getParent();
181 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
182 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
183 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
184 Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
185 Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);
186
187 if (MFI->getGITPtrHigh() != 0xffffffff) {
188 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetHi)
189 .addImm(Val: MFI->getGITPtrHigh())
190 .addReg(RegNo: TargetReg, flags: RegState::ImplicitDefine);
191 } else {
192 const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
193 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GetPC64, DestReg: TargetReg);
194 }
195 Register GitPtrLo = MFI->getGITPtrLoReg(MF: *MF);
196 MF->getRegInfo().addLiveIn(Reg: GitPtrLo);
197 MBB.addLiveIn(PhysReg: GitPtrLo);
198 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetLo)
199 .addReg(RegNo: GitPtrLo);
200}
201
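// Lazily initialize the live register unit tracking: seed it with the block
// live-ins when emitting a prologue, or with the live-outs stepped back over
// the insertion point when emitting an epilogue. No-op if LiveUnits has
// already been populated.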
202static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
203 const SIMachineFunctionInfo *FuncInfo,
204 MachineFunction &MF, MachineBasicBlock &MBB,
205 MachineBasicBlock::iterator MBBI, bool IsProlog) {
206 if (LiveUnits.empty()) {
207 LiveUnits.init(TRI);
208 if (IsProlog) {
209 LiveUnits.addLiveIns(MBB);
210 } else {
211 // In epilog.
212 LiveUnits.addLiveOuts(MBB);
213 LiveUnits.stepBackward(MI: *MBBI);
214 }
215 }
216}
217
218namespace llvm {
219
// SpillBuilder to save/restore special SGPR spills like the ones needed for
// FP, BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
224class PrologEpilogSGPRSpillBuilder {
225 MachineBasicBlock::iterator MI;
226 MachineBasicBlock &MBB;
227 MachineFunction &MF;
228 const GCNSubtarget &ST;
229 MachineFrameInfo &MFI;
230 SIMachineFunctionInfo *FuncInfo;
231 const SIInstrInfo *TII;
232 const SIRegisterInfo &TRI;
233 Register SuperReg;
234 const PrologEpilogSGPRSaveRestoreInfo SI;
235 LiveRegUnits &LiveUnits;
236 const DebugLoc &DL;
237 Register FrameReg;
238 ArrayRef<int16_t> SplitParts;
239 unsigned NumSubRegs;
240 unsigned EltSize = 4;
241
242 void saveToMemory(const int FI) const {
243 MachineRegisterInfo &MRI = MF.getRegInfo();
244 assert(!MFI.isDeadObjectIndex(FI));
245
246 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ true);
247
248 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
249 MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
250 if (!TmpVGPR)
251 report_fatal_error(reason: "failed to find free scratch register");
252
253 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
254 Register SubReg = NumSubRegs == 1
255 ? SuperReg
256 : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
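      // Stage the SGPR piece through the temporary VGPR, since the scratch
      // store takes its data from a VGPR.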
257 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
258 .addReg(SubReg);
259
260 buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, SpillReg: TmpVGPR,
261 FI, FrameReg, DwordOff);
262 DwordOff += 4;
263 }
264 }
265
266 void saveToVGPRLane(const int FI) const {
267 assert(!MFI.isDeadObjectIndex(FI));
268
269 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
270 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
271 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
272 assert(Spill.size() == NumSubRegs);
273
274 for (unsigned I = 0; I < NumSubRegs; ++I) {
275 Register SubReg = NumSubRegs == 1
276 ? SuperReg
277 : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
278 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
279 Spill[I].VGPR)
280 .addReg(SubReg)
281 .addImm(Spill[I].Lane)
282 .addReg(Spill[I].VGPR, RegState::Undef);
283 }
284 }
285
286 void copyToScratchSGPR(Register DstReg) const {
287 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
288 .addReg(SuperReg)
289 .setMIFlag(MachineInstr::FrameSetup);
290 }
291
292 void restoreFromMemory(const int FI) {
293 MachineRegisterInfo &MRI = MF.getRegInfo();
294
295 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ false);
296 MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
297 MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
298 if (!TmpVGPR)
299 report_fatal_error(reason: "failed to find free scratch register");
300
301 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
302 Register SubReg = NumSubRegs == 1
303 ? SuperReg
304 : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
305
306 buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL,
307 SpillReg: TmpVGPR, FI, FrameReg, DwordOff);
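      // Move the reloaded dword from the temporary VGPR back into the SGPR
      // piece.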
308 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
309 .addReg(TmpVGPR, RegState::Kill);
310 DwordOff += 4;
311 }
312 }
313
314 void restoreFromVGPRLane(const int FI) {
315 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
316 ArrayRef<SIRegisterInfo::SpilledReg> Spill =
317 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI);
318 assert(Spill.size() == NumSubRegs);
319
320 for (unsigned I = 0; I < NumSubRegs; ++I) {
321 Register SubReg = NumSubRegs == 1
322 ? SuperReg
323 : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
324 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
325 .addReg(Spill[I].VGPR)
326 .addImm(Spill[I].Lane);
327 }
328 }
329
330 void copyFromScratchSGPR(Register SrcReg) const {
331 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
332 .addReg(SrcReg)
333 .setMIFlag(MachineInstr::FrameDestroy);
334 }
335
336public:
337 PrologEpilogSGPRSpillBuilder(Register Reg,
338 const PrologEpilogSGPRSaveRestoreInfo SI,
339 MachineBasicBlock &MBB,
340 MachineBasicBlock::iterator MI,
341 const DebugLoc &DL, const SIInstrInfo *TII,
342 const SIRegisterInfo &TRI,
343 LiveRegUnits &LiveUnits, Register FrameReg)
344 : MI(MI), MBB(MBB), MF(*MBB.getParent()),
345 ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
346 FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
347 SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
348 FrameReg(FrameReg) {
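    // Split the (possibly multi-dword) SGPR into 32-bit pieces; each piece is
    // saved and restored individually, one VGPR lane or one scratch dword at a
    // time.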
349 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
350 SplitParts = TRI.getRegSplitParts(RC, EltSize);
351 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
352
353 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
354 }
355
356 void save() {
357 switch (SI.getKind()) {
358 case SGPRSaveKind::SPILL_TO_MEM:
359 return saveToMemory(FI: SI.getIndex());
360 case SGPRSaveKind::SPILL_TO_VGPR_LANE:
361 return saveToVGPRLane(FI: SI.getIndex());
362 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
363 return copyToScratchSGPR(DstReg: SI.getReg());
364 }
365 }
366
367 void restore() {
368 switch (SI.getKind()) {
369 case SGPRSaveKind::SPILL_TO_MEM:
370 return restoreFromMemory(FI: SI.getIndex());
371 case SGPRSaveKind::SPILL_TO_VGPR_LANE:
372 return restoreFromVGPRLane(FI: SI.getIndex());
373 case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
374 return copyFromScratchSGPR(SrcReg: SI.getReg());
375 }
376 }
377};
378
379} // namespace llvm
380
381// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
382void SIFrameLowering::emitEntryFunctionFlatScratchInit(
383 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
384 const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
385 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
386 const SIInstrInfo *TII = ST.getInstrInfo();
387 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
388 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
389
  // We don't need this if we only have spills, since there is no user-facing
  // scratch.
392
393 // TODO: If we know we don't have flat instructions earlier, we can omit
394 // this from the input registers.
395 //
396 // TODO: We only need to know if we access scratch space through a flat
397 // pointer. Because we only detect if flat instructions are used at all,
398 // this will be used more often than necessary on VI.
399
400 Register FlatScrInitLo;
401 Register FlatScrInitHi;
402
403 if (ST.isAmdPalOS()) {
404 // Extract the scratch offset from the descriptor in the GIT
405 LiveRegUnits LiveUnits;
406 LiveUnits.init(*TRI);
407 LiveUnits.addLiveIns(MBB);
408
409 // Find unused reg to load flat scratch init into
410 MachineRegisterInfo &MRI = MF.getRegInfo();
411 Register FlatScrInit = AMDGPU::NoRegister;
412 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
413 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
414 AllSGPR64s = AllSGPR64s.slice(
415 N: std::min(a: static_cast<unsigned>(AllSGPR64s.size()), b: NumPreloaded));
416 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
417 for (MCPhysReg Reg : AllSGPR64s) {
418 if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg) &&
419 MRI.isAllocatable(PhysReg: Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
420 FlatScrInit = Reg;
421 break;
422 }
423 }
424 assert(FlatScrInit && "Failed to find free register for scratch init");
425
426 FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
427 FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
428
429 buildGitPtr(MBB, I, DL, TII, TargetReg: FlatScrInit);
430
431 // We now have the GIT ptr - now get the scratch descriptor from the entry
432 // at offset 0 (or offset 16 for a compute shader).
433 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
434 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
435 auto *MMO = MF.getMachineMemOperand(
436 PtrInfo,
437 F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
438 MachineMemOperand::MODereferenceable,
439 Size: 8, BaseAlignment: Align(4));
440 unsigned Offset =
441 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
442 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
443 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
444 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: FlatScrInit)
445 .addReg(RegNo: FlatScrInit)
446 .addImm(Val: EncodedOffset) // offset
447 .addImm(Val: 0) // cpol
448 .addMemOperand(MMO);
449
450 // Mask the offset in [47:0] of the descriptor
451 const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
452 auto And = BuildMI(BB&: MBB, I, MIMD: DL, MCID: SAndB32, DestReg: FlatScrInitHi)
453 .addReg(RegNo: FlatScrInitHi)
454 .addImm(Val: 0xffff);
455 And->getOperand(i: 3).setIsDead(); // Mark SCC as dead.
456 } else {
457 Register FlatScratchInitReg =
458 MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
459 assert(FlatScratchInitReg);
460
461 MachineRegisterInfo &MRI = MF.getRegInfo();
462 MRI.addLiveIn(Reg: FlatScratchInitReg);
463 MBB.addLiveIn(PhysReg: FlatScratchInitReg);
464
465 FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
466 FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
467 }
468
469 // Do a 64-bit pointer add.
470 if (ST.flatScratchIsPointer()) {
471 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
472 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
473 .addReg(FlatScrInitLo)
474 .addReg(ScratchWaveOffsetReg);
475 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
476 FlatScrInitHi)
477 .addReg(FlatScrInitHi)
478 .addImm(0);
479 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
480
481 using namespace AMDGPU::Hwreg;
482 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
483 .addReg(FlatScrInitLo)
484 .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
485 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
486 .addReg(FlatScrInitHi)
487 .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
488 return;
489 }
490
491 // For GFX9.
492 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
493 .addReg(FlatScrInitLo)
494 .addReg(ScratchWaveOffsetReg);
495 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
496 AMDGPU::FLAT_SCR_HI)
497 .addReg(FlatScrInitHi)
498 .addImm(0);
499 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
500
501 return;
502 }
503
504 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
505
506 // Copy the size in bytes.
507 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
508 .addReg(FlatScrInitHi, RegState::Kill);
509
510 // Add wave offset in bytes to private base offset.
511 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
512 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
513 .addReg(FlatScrInitLo)
514 .addReg(ScratchWaveOffsetReg);
515
516 // Convert offset to 256-byte units.
517 auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
518 AMDGPU::FLAT_SCR_HI)
519 .addReg(FlatScrInitLo, RegState::Kill)
520 .addImm(8);
521 LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
522}
523
524// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
525// memory. They should have been removed by now.
526static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
527 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
528 I != E; ++I) {
529 if (!MFI.isDeadObjectIndex(ObjectIdx: I))
530 return false;
531 }
532
533 return true;
534}
535
536// Shift down registers reserved for the scratch RSRC.
537Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
538 MachineFunction &MF) const {
539
540 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
541 const SIInstrInfo *TII = ST.getInstrInfo();
542 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
543 MachineRegisterInfo &MRI = MF.getRegInfo();
544 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
545
546 assert(MFI->isEntryFunction());
547
548 Register ScratchRsrcReg = MFI->getScratchRSrcReg();
549
550 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(PhysReg: ScratchRsrcReg) &&
551 allStackObjectsAreDead(MFI: MF.getFrameInfo())))
552 return Register();
553
554 if (ST.hasSGPRInitBug() ||
555 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
556 return ScratchRsrcReg;
557
558 // We reserved the last registers for this. Shift it down to the end of those
559 // which were actually used.
560 //
561 // FIXME: It might be safer to use a pseudoregister before replacement.
562
  // FIXME: We should be able to eliminate unused input registers. The only
  // ones we cannot eliminate are the resources required for scratch access.
  // For now we skip over user SGPRs and may leave unused holes.
566
567 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
568 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
570
571 // Skip the last N reserved elements because they should have already been
572 // reserved for VCC etc.
573 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
574 for (MCPhysReg Reg : AllSGPR128s) {
575 // Pick the first unallocated one. Make sure we don't clobber the other
576 // reserved input we needed. Also for PAL, make sure we don't clobber
577 // the GIT pointer passed in SGPR0 or SGPR8.
578 if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
579 (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
580 MRI.replaceRegWith(FromReg: ScratchRsrcReg, ToReg: Reg);
581 MFI->setScratchRSrcReg(Reg);
582 return Reg;
583 }
584 }
585
586 return ScratchRsrcReg;
587}
588
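// When scratch is accessed through the buffer descriptor rather than flat
// scratch, per-lane addresses are swizzled by the hardware, so wave-level
// stack offsets such as SP and FP are kept scaled by the wavefront size. With
// flat scratch they are plain byte offsets.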
589static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
590 return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
591}
592
593void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
594 MachineBasicBlock &MBB) const {
595 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
596
597 // FIXME: If we only have SGPR spills, we won't actually be using scratch
598 // memory since these spill to VGPRs. We should be cleaning up these unused
599 // SGPR spill frame indices somewhere.
600
601 // FIXME: We still have implicit uses on SGPR spill instructions in case they
602 // need to spill to vector memory. It's likely that will not happen, but at
603 // this point it appears we need the setup. This part of the prolog should be
604 // emitted after frame indices are eliminated.
605
606 // FIXME: Remove all of the isPhysRegUsed checks
607
608 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
609 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
610 const SIInstrInfo *TII = ST.getInstrInfo();
611 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
612 MachineRegisterInfo &MRI = MF.getRegInfo();
613 const Function &F = MF.getFunction();
614 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
615
616 assert(MFI->isEntryFunction());
617
618 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
619 Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
620
621 // We need to do the replacement of the private segment buffer register even
622 // if there are no stack objects. There could be stores to undef or a
623 // constant without an associated object.
624 //
625 // This will return `Register()` in cases where there are no actual
626 // uses of the SRSRC.
627 Register ScratchRsrcReg;
628 if (!ST.enableFlatScratch())
629 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
630
631 // Make the selected register live throughout the function.
632 if (ScratchRsrcReg) {
633 for (MachineBasicBlock &OtherBB : MF) {
634 if (&OtherBB != &MBB) {
635 OtherBB.addLiveIn(PhysReg: ScratchRsrcReg);
636 }
637 }
638 }
639
640 // Now that we have fixed the reserved SRSRC we need to locate the
641 // (potentially) preloaded SRSRC.
642 Register PreloadedScratchRsrcReg;
643 if (ST.isAmdHsaOrMesa(F)) {
644 PreloadedScratchRsrcReg =
645 MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
646 if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
647 // We added live-ins during argument lowering, but since they were not
648 // used they were deleted. We're adding the uses now, so add them back.
649 MRI.addLiveIn(Reg: PreloadedScratchRsrcReg);
650 MBB.addLiveIn(PhysReg: PreloadedScratchRsrcReg);
651 }
652 }
653
654 // Debug location must be unknown since the first debug location is used to
655 // determine the end of the prologue.
656 DebugLoc DL;
657 MachineBasicBlock::iterator I = MBB.begin();
658
  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
664 Register ScratchWaveOffsetReg;
665 if (PreloadedScratchWaveOffsetReg &&
666 TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
667 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
668 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
669 AllSGPRs = AllSGPRs.slice(
670 N: std::min(a: static_cast<unsigned>(AllSGPRs.size()), b: NumPreloaded));
671 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
672 for (MCPhysReg Reg : AllSGPRs) {
673 if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) &&
674 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
675 ScratchWaveOffsetReg = Reg;
676 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
677 .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
678 break;
679 }
680 }
681 } else {
682 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
683 }
684 assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
685
686 if (requiresStackPointerReference(MF)) {
687 Register SPReg = MFI->getStackPtrOffsetReg();
688 assert(SPReg != AMDGPU::SP_REG);
689 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
690 .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
691 }
692
693 if (hasFP(MF)) {
694 Register FPReg = MFI->getFrameOffsetReg();
695 assert(FPReg != AMDGPU::FP_REG);
696 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
697 }
698
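  // Flat scratch only needs to be initialized when the kernel was given the
  // init value and scratch may actually be used: FLAT_SCR is referenced, there
  // are calls, or (with flat scratch addressing) there are live stack objects.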
699 bool NeedsFlatScratchInit =
700 MFI->getUserSGPRInfo().hasFlatScratchInit() &&
701 (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
702 (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
703
704 if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
705 PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
706 MRI.addLiveIn(Reg: PreloadedScratchWaveOffsetReg);
707 MBB.addLiveIn(PhysReg: PreloadedScratchWaveOffsetReg);
708 }
709
710 if (NeedsFlatScratchInit) {
711 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
712 }
713
714 if (ScratchRsrcReg) {
715 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
716 PreloadedPrivateBufferReg: PreloadedScratchRsrcReg,
717 ScratchRsrcReg, ScratchWaveOffsetReg);
718 }
719}
720
721// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
722void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
723 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
724 const DebugLoc &DL, Register PreloadedScratchRsrcReg,
725 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
726
727 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
728 const SIInstrInfo *TII = ST.getInstrInfo();
729 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
730 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
731 const Function &Fn = MF.getFunction();
732
733 if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
736 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
737 Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
738
739 buildGitPtr(MBB, I, DL, TII, TargetReg: Rsrc01);
740
741 // We now have the GIT ptr - now get the scratch descriptor from the entry
742 // at offset 0 (or offset 16 for a compute shader).
743 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
744 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
745 auto MMO = MF.getMachineMemOperand(PtrInfo,
746 F: MachineMemOperand::MOLoad |
747 MachineMemOperand::MOInvariant |
748 MachineMemOperand::MODereferenceable,
749 Size: 16, BaseAlignment: Align(4));
750 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
751 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
752 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
753 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX4, DestReg: ScratchRsrcReg)
754 .addReg(RegNo: Rsrc01)
755 .addImm(Val: EncodedOffset) // offset
756 .addImm(Val: 0) // cpol
757 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine)
758 .addMemOperand(MMO);
759
    // The driver will always set the SRD for wave 64 (bits 118:117 of the
    // descriptor / bits 22:21 of the third sub-reg will be 0b11).
    // If the shader is actually wave32, we have to modify the
    // const_index_stride field of the descriptor's 3rd sub-reg (bits 22:21) to
    // 0b10 (stride=32). The reason the driver does this is that there can be
    // cases where it presents two shaders with different wave sizes (e.g. VsFs).
766 // TODO: convert to using SCRATCH instructions or multiple SRD buffers
767 if (ST.isWave32()) {
768 const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
769 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SBitsetB32, DestReg: Rsrc03)
770 .addImm(Val: 21)
771 .addReg(RegNo: Rsrc03);
772 }
773 } else if (ST.isMesaGfxShader(F: Fn) || !PreloadedScratchRsrcReg) {
774 assert(!ST.isAmdHsaOrMesa(Fn));
775 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
776
777 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
778 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
779
    // Use relocations to get the pointer, and set up the other bits manually.
781 uint64_t Rsrc23 = TII->getScratchRsrcWords23();
782
783 if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
784 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
785
786 if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
787 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
788
789 BuildMI(BB&: MBB, I, MIMD: DL, MCID: Mov64, DestReg: Rsrc01)
790 .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
791 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
792 } else {
793 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
794
795 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
796 auto MMO = MF.getMachineMemOperand(
797 PtrInfo,
798 F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
799 MachineMemOperand::MODereferenceable,
800 Size: 8, BaseAlignment: Align(4));
801 BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: Rsrc01)
802 .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR())
803 .addImm(Val: 0) // offset
804 .addImm(Val: 0) // cpol
805 .addMemOperand(MMO)
806 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
807
808 MF.getRegInfo().addLiveIn(Reg: MFI->getImplicitBufferPtrUserSGPR());
809 MBB.addLiveIn(PhysReg: MFI->getImplicitBufferPtrUserSGPR());
810 }
811 } else {
812 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
813 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
814
815 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc0)
816 .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD0")
817 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
818
819 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc1)
820 .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD1")
821 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
822 }
823
824 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc2)
825 .addImm(Val: Rsrc23 & 0xffffffff)
826 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
827
828 BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc3)
829 .addImm(Val: Rsrc23 >> 32)
830 .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine);
831 } else if (ST.isAmdHsaOrMesa(Fn)) {
832 assert(PreloadedScratchRsrcReg);
833
834 if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
835 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
836 .addReg(PreloadedScratchRsrcReg, RegState::Kill);
837 }
838 }
839
840 // Add the scratch wave offset into the scratch RSRC.
841 //
842 // We only want to update the first 48 bits, which is the base address
843 // pointer, without touching the adjacent 16 bits of flags. We know this add
844 // cannot carry-out from bit 47, otherwise the scratch allocation would be
845 // impossible to fit in the 48-bit global address space.
846 //
847 // TODO: Evaluate if it is better to just construct an SRD using the flat
848 // scratch init and some constants rather than update the one we are passed.
849 Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
850 Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
851
852 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
853 // the kernel body via inreg arguments.
854 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
855 .addReg(ScratchRsrcSub0)
856 .addReg(ScratchWaveOffsetReg)
857 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
858 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
859 .addReg(ScratchRsrcSub1)
860 .addImm(0)
861 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
862 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
863}
864
865bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
866 switch (ID) {
867 case TargetStackID::Default:
868 case TargetStackID::NoAlloc:
869 case TargetStackID::SGPRSpill:
870 return true;
871 case TargetStackID::ScalableVector:
872 case TargetStackID::WasmLocal:
873 return false;
874 }
875 llvm_unreachable("Invalid TargetStackID::Value");
876}
877
878// Activate only the inactive lanes when \p EnableInactiveLanes is true.
879// Otherwise, activate all lanes. It returns the saved exec.
880static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
881 MachineFunction &MF,
882 MachineBasicBlock &MBB,
883 MachineBasicBlock::iterator MBBI,
884 const DebugLoc &DL, bool IsProlog,
885 bool EnableInactiveLanes) {
886 Register ScratchExecCopy;
887 MachineRegisterInfo &MRI = MF.getRegInfo();
888 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
889 const SIInstrInfo *TII = ST.getInstrInfo();
890 const SIRegisterInfo &TRI = TII->getRegisterInfo();
891 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
892
893 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
894
895 ScratchExecCopy = findScratchNonCalleeSaveRegister(
896 MRI, LiveUnits, RC: *TRI.getWaveMaskRegClass());
897 if (!ScratchExecCopy)
898 report_fatal_error(reason: "failed to find free scratch register");
899
900 LiveUnits.addReg(Reg: ScratchExecCopy);
901
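  // With an all-ones operand, S_XOR_SAVEEXEC activates only the previously
  // inactive lanes, while S_OR_SAVEEXEC activates every lane; both write the
  // old EXEC mask to the destination.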
902 const unsigned SaveExecOpc =
903 ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
904 : AMDGPU::S_OR_SAVEEXEC_B32)
905 : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
906 : AMDGPU::S_OR_SAVEEXEC_B64);
907 auto SaveExec =
908 BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
909 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
910
911 return ScratchExecCopy;
912}
913
914void SIFrameLowering::emitCSRSpillStores(
915 MachineFunction &MF, MachineBasicBlock &MBB,
916 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
917 Register FrameReg, Register FramePtrRegScratchCopy) const {
918 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
919 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
920 const SIInstrInfo *TII = ST.getInstrInfo();
921 const SIRegisterInfo &TRI = TII->getRegisterInfo();
922
923 // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
924 // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
925 // might end up flipping the EXEC bits twice.
926 Register ScratchExecCopy;
927 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
928 FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
929 if (!WWMScratchRegs.empty())
930 ScratchExecCopy =
931 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
932 /*IsProlog*/ true, /*EnableInactiveLanes*/ true);
933
934 auto StoreWWMRegisters =
935 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
936 for (const auto &Reg : WWMRegs) {
937 Register VGPR = Reg.first;
938 int FI = Reg.second;
939 buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
940 SpillReg: VGPR, FI, FrameReg);
941 }
942 };
943
944 StoreWWMRegisters(WWMScratchRegs);
945 if (!WWMCalleeSavedRegs.empty()) {
946 if (ScratchExecCopy) {
947 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
948 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
949 } else {
950 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
951 /*IsProlog*/ true,
952 /*EnableInactiveLanes*/ false);
953 }
954 }
955
956 StoreWWMRegisters(WWMCalleeSavedRegs);
957 if (ScratchExecCopy) {
958 // FIXME: Split block and make terminator.
959 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
960 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
961 .addReg(ScratchExecCopy, RegState::Kill);
962 LiveUnits.addReg(Reg: ScratchExecCopy);
963 }
964
965 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
966
967 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Handle the FP spill specially:
    // Skip it if FP is saved to a scratch SGPR; that save has already been
    // emitted. Otherwise, FP has been moved to a temporary register, so spill
    // that register instead.
972 Register Reg =
973 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
974 if (!Reg)
975 continue;
976
977 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
978 LiveUnits, FrameReg);
979 SB.save();
980 }
981
982 // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
983 // such scratch registers live throughout the function.
984 SmallVector<Register, 1> ScratchSGPRs;
985 FuncInfo->getAllScratchSGPRCopyDstRegs(Regs&: ScratchSGPRs);
986 if (!ScratchSGPRs.empty()) {
987 for (MachineBasicBlock &MBB : MF) {
988 for (MCPhysReg Reg : ScratchSGPRs)
989 MBB.addLiveIn(PhysReg: Reg);
990
991 MBB.sortUniqueLiveIns();
992 }
993 if (!LiveUnits.empty()) {
994 for (MCPhysReg Reg : ScratchSGPRs)
995 LiveUnits.addReg(Reg);
996 }
997 }
998}
999
1000void SIFrameLowering::emitCSRSpillRestores(
1001 MachineFunction &MF, MachineBasicBlock &MBB,
1002 MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
1003 Register FrameReg, Register FramePtrRegScratchCopy) const {
1004 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1005 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1006 const SIInstrInfo *TII = ST.getInstrInfo();
1007 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1008 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1009
1010 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Handle the FP restore specially:
    // Skip it if FP needs to be restored from the scratch SGPR. Otherwise,
    // restore the FP value into a temporary register. The frame pointer should
    // be overwritten only at the end, once all other spills have been restored
    // from the current frame.
1016 Register Reg =
1017 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
1018 if (!Reg)
1019 continue;
1020
1021 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1022 LiveUnits, FrameReg);
1023 SB.restore();
1024 }
1025
1026 // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
1027 // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
1028 // this, we might end up flipping the EXEC bits twice.
1029 Register ScratchExecCopy;
1030 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
1031 FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs);
1032 if (!WWMScratchRegs.empty())
1033 ScratchExecCopy =
1034 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1035 /*IsProlog*/ false, /*EnableInactiveLanes*/ true);
1036
1037 auto RestoreWWMRegisters =
1038 [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
1039 for (const auto &Reg : WWMRegs) {
1040 Register VGPR = Reg.first;
1041 int FI = Reg.second;
1042 buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL,
1043 SpillReg: VGPR, FI, FrameReg);
1044 }
1045 };
1046
1047 RestoreWWMRegisters(WWMScratchRegs);
1048 if (!WWMCalleeSavedRegs.empty()) {
1049 if (ScratchExecCopy) {
1050 unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1051 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
1052 } else {
1053 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1054 /*IsProlog*/ false,
1055 /*EnableInactiveLanes*/ false);
1056 }
1057 }
1058
1059 RestoreWWMRegisters(WWMCalleeSavedRegs);
1060 if (ScratchExecCopy) {
1061 // FIXME: Split block and make terminator.
1062 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1063 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
1064 .addReg(ScratchExecCopy, RegState::Kill);
1065 }
1066}
1067
1068void SIFrameLowering::emitPrologue(MachineFunction &MF,
1069 MachineBasicBlock &MBB) const {
1070 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1071 if (FuncInfo->isEntryFunction()) {
1072 emitEntryFunctionPrologue(MF, MBB);
1073 return;
1074 }
1075
1076 MachineFrameInfo &MFI = MF.getFrameInfo();
1077 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1078 const SIInstrInfo *TII = ST.getInstrInfo();
1079 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1080 MachineRegisterInfo &MRI = MF.getRegInfo();
1081
1082 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1083 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1084 Register BasePtrReg =
1085 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
1086 LiveRegUnits LiveUnits;
1087
1088 MachineBasicBlock::iterator MBBI = MBB.begin();
1089 // DebugLoc must be unknown since the first instruction with DebugLoc is used
1090 // to determine the end of the prologue.
1091 DebugLoc DL;
1092
1093 if (FuncInfo->isChainFunction()) {
1094 // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
1095 // are free to set one up if they need it.
1096 bool UseSP = requiresStackPointerReference(MF);
1097 if (UseSP) {
1098 assert(StackPtrReg != AMDGPU::SP_REG);
1099
1100 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
1101 .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
1102 }
1103 }
1104
1105 bool HasFP = false;
1106 bool HasBP = false;
1107 uint32_t NumBytes = MFI.getStackSize();
1108 uint32_t RoundedSize = NumBytes;
1109
1110 if (TRI.hasStackRealignment(MF))
1111 HasFP = true;
1112
1113 Register FramePtrRegScratchCopy;
1114 if (!HasFP && !hasFP(MF)) {
1115 // Emit the CSR spill stores with SP base register.
1116 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
1117 FrameReg: FuncInfo->isChainFunction() ? Register() : StackPtrReg,
1118 FramePtrRegScratchCopy);
1119 } else {
1120 // CSR spill stores will use FP as base register.
1121 Register SGPRForFPSaveRestoreCopy =
1122 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1123
1124 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
1125 if (SGPRForFPSaveRestoreCopy) {
1126 // Copy FP to the scratch register now and emit the CFI entry. It avoids
1127 // the extra FP copy needed in the other two cases when FP is spilled to
1128 // memory or to a VGPR lane.
1129 PrologEpilogSGPRSpillBuilder SB(
1130 FramePtrReg,
1131 FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(Reg: FramePtrReg), MBB, MBBI,
1132 DL, TII, TRI, LiveUnits, FramePtrReg);
1133 SB.save();
1134 LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
1135 } else {
1136 // Copy FP into a new scratch register so that its previous value can be
1137 // spilled after setting up the new frame.
1138 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1139 MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1140 if (!FramePtrRegScratchCopy)
1141 report_fatal_error(reason: "failed to find free scratch register");
1142
1143 LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
1144 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
1145 .addReg(FramePtrReg);
1146 }
1147 }
1148
1149 if (HasFP) {
1150 const unsigned Alignment = MFI.getMaxAlign().value();
1151
1152 RoundedSize += Alignment;
1153 if (LiveUnits.empty()) {
1154 LiveUnits.init(TRI);
1155 LiveUnits.addLiveIns(MBB);
1156 }
1157
1158 // s_add_i32 s33, s32, NumBytes
1159 // s_and_b32 s33, s33, 0b111...0000
1160 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
1161 .addReg(StackPtrReg)
1162 .addImm((Alignment - 1) * getScratchScaleFactor(ST))
1163 .setMIFlag(MachineInstr::FrameSetup);
1164 auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
1165 .addReg(FramePtrReg, RegState::Kill)
1166 .addImm(-Alignment * getScratchScaleFactor(ST))
1167 .setMIFlag(MachineInstr::FrameSetup);
1168 And->getOperand(3).setIsDead(); // Mark SCC as dead.
1169 FuncInfo->setIsStackRealigned(true);
1170 } else if ((HasFP = hasFP(MF))) {
1171 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1172 .addReg(StackPtrReg)
1173 .setMIFlag(MachineInstr::FrameSetup);
1174 }
1175
1176 // If FP is used, emit the CSR spills with FP base register.
1177 if (HasFP) {
1178 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
1179 FramePtrRegScratchCopy);
1180 if (FramePtrRegScratchCopy)
1181 LiveUnits.removeReg(Reg: FramePtrRegScratchCopy);
1182 }
1183
1184 // If we need a base pointer, set it up here. It's whatever the value of
1185 // the stack pointer is at this point. Any variable size objects will be
1186 // allocated after this, so we can still use the base pointer to reference
1187 // the incoming arguments.
1188 if ((HasBP = TRI.hasBasePointer(MF))) {
1189 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
1190 .addReg(StackPtrReg)
1191 .setMIFlag(MachineInstr::FrameSetup);
1192 }
1193
1194 if (HasFP && RoundedSize != 0) {
1195 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1196 .addReg(StackPtrReg)
1197 .addImm(RoundedSize * getScratchScaleFactor(ST))
1198 .setMIFlag(MachineInstr::FrameSetup);
1199 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1200 }
1201
1202 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);
1203 (void)FPSaved;
1204 assert((!HasFP || FPSaved) &&
1205 "Needed to save FP but didn't save it anywhere");
1206
  // If we allow spilling to AGPRs we may have saved FP but then spilled
  // everything into AGPRs instead of the stack.
1209 assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
1210 "Saved FP but didn't need it");
1211
1212 bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: BasePtrReg);
1213 (void)BPSaved;
1214 assert((!HasBP || BPSaved) &&
1215 "Needed to save BP but didn't save it anywhere");
1216
1217 assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
1218}
1219
1220void SIFrameLowering::emitEpilogue(MachineFunction &MF,
1221 MachineBasicBlock &MBB) const {
1222 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1223 if (FuncInfo->isEntryFunction())
1224 return;
1225
1226 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1227 const SIInstrInfo *TII = ST.getInstrInfo();
1228 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1229 MachineRegisterInfo &MRI = MF.getRegInfo();
1230 LiveRegUnits LiveUnits;
1231 // Get the insert location for the epilogue. If there were no terminators in
1232 // the block, get the last instruction.
1233 MachineBasicBlock::iterator MBBI = MBB.end();
1234 DebugLoc DL;
1235 if (!MBB.empty()) {
1236 MBBI = MBB.getLastNonDebugInstr();
1237 if (MBBI != MBB.end())
1238 DL = MBBI->getDebugLoc();
1239
1240 MBBI = MBB.getFirstTerminator();
1241 }
1242
1243 const MachineFrameInfo &MFI = MF.getFrameInfo();
1244 uint32_t NumBytes = MFI.getStackSize();
1245 uint32_t RoundedSize = FuncInfo->isStackRealigned()
1246 ? NumBytes + MFI.getMaxAlign().value()
1247 : NumBytes;
1248 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1249 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1250 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg);
1251
1252 Register FramePtrRegScratchCopy;
1253 Register SGPRForFPSaveRestoreCopy =
1254 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1255 if (FPSaved) {
    // CSR spill restores should use FP as the base register. If
    // SGPRForFPSaveRestoreCopy is not valid, restore the previous value of FP
    // into a new scratch register and copy it to FP later, once the other
    // registers have been restored from the current stack frame.
1260 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
1261 if (SGPRForFPSaveRestoreCopy) {
1262 LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy);
1263 } else {
1264 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1265 MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1266 if (!FramePtrRegScratchCopy)
1267 report_fatal_error(reason: "failed to find free scratch register");
1268
1269 LiveUnits.addReg(Reg: FramePtrRegScratchCopy);
1270 }
1271
1272 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg,
1273 FramePtrRegScratchCopy);
1274 }
1275
1276 if (RoundedSize != 0 && hasFP(MF)) {
1277 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1278 .addReg(StackPtrReg)
1279 .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
1280 .setMIFlag(MachineInstr::FrameDestroy);
1281 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1282 }
1283
1284 if (FPSaved) {
1285 // Insert the copy to restore FP.
1286 Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
1287 : FramePtrRegScratchCopy;
1288 MachineInstrBuilder MIB =
1289 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1290 .addReg(SrcReg);
1291 if (SGPRForFPSaveRestoreCopy)
1292 MIB.setMIFlag(MachineInstr::FrameDestroy);
1293 } else {
1294 // Insert the CSR spill restores with SP as the base register.
1295 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: StackPtrReg,
1296 FramePtrRegScratchCopy);
1297 }
1298}
1299
1300#ifndef NDEBUG
1301static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1302 const MachineFrameInfo &MFI = MF.getFrameInfo();
1303 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1304 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1305 I != E; ++I) {
1306 if (!MFI.isDeadObjectIndex(ObjectIdx: I) &&
1307 MFI.getStackID(ObjectIdx: I) == TargetStackID::SGPRSpill &&
1308 !FuncInfo->checkIndexInPrologEpilogSGPRSpills(FI: I)) {
1309 return false;
1310 }
1311 }
1312
1313 return true;
1314}
1315#endif
1316
1317StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1318 int FI,
1319 Register &FrameReg) const {
1320 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1321
1322 FrameReg = RI->getFrameRegister(MF);
1323 return StackOffset::getFixed(Fixed: MF.getFrameInfo().getObjectOffset(ObjectIdx: FI));
1324}
1325
1326void SIFrameLowering::processFunctionBeforeFrameFinalized(
1327 MachineFunction &MF,
1328 RegScavenger *RS) const {
1329 MachineFrameInfo &MFI = MF.getFrameInfo();
1330
1331 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1332 const SIInstrInfo *TII = ST.getInstrInfo();
1333 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1334 MachineRegisterInfo &MRI = MF.getRegInfo();
1335 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1336
1337 // Allocate spill slots for WWM reserved VGPRs.
1338 // For chain functions, we only need to do this if we have calls to
1339 // llvm.amdgcn.cs.chain.
1340 bool IsChainWithoutCalls =
1341 FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
1342 if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
1343 for (Register Reg : FuncInfo->getWWMReservedRegs()) {
1344 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1345 FuncInfo->allocateWWMSpill(MF, VGPR: Reg, Size: TRI->getSpillSize(*RC),
1346 Alignment: TRI->getSpillAlign(*RC));
1347 }
1348 }
1349
1350 const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1351 && EnableSpillVGPRToAGPR;
1352
1353 if (SpillVGPRToAGPR) {
1354 // To track the spill frame indices handled in this pass.
1355 BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1356 BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
1357
1358 bool SeenDbgInstr = false;
1359
1360 for (MachineBasicBlock &MBB : MF) {
1361 for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) {
1362 int FrameIndex;
1363 if (MI.isDebugInstr())
1364 SeenDbgInstr = true;
1365
1366 if (TII->isVGPRSpill(MI)) {
          // Try to eliminate the stack usage from VGPR spills before frame
          // finalization.
1369 unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1370 AMDGPU::OpName::vaddr);
1371 int FI = MI.getOperand(i: FIOp).getIndex();
1372 Register VReg =
1373 TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
1374 if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1375 isAGPRtoVGPR: TRI->isAGPR(MRI, Reg: VReg))) {
1376 assert(RS != nullptr);
1377 RS->enterBasicBlockEnd(MBB);
1378 RS->backward(I: std::next(x: MI.getIterator()));
1379 TRI->eliminateFrameIndex(MI, SPAdj: 0, FIOperandNum: FIOp, RS);
1380 SpillFIs.set(FI);
1381 continue;
1382 }
1383 } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
1384 TII->isLoadFromStackSlot(MI, FrameIndex))
1385 if (!MFI.isFixedObjectIndex(ObjectIdx: FrameIndex))
1386 NonVGPRSpillFIs.set(FrameIndex);
1387 }
1388 }
1389
1390 // Stack slot coloring may assign different objects to the same stack slot.
1391 // If not, then the VGPR to AGPR spill slot is dead.
1392 for (unsigned FI : SpillFIs.set_bits())
1393 if (!NonVGPRSpillFIs.test(Idx: FI))
1394 FuncInfo->setVGPRToAGPRSpillDead(FI);
1395
1396 for (MachineBasicBlock &MBB : MF) {
1397 for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1398 MBB.addLiveIn(PhysReg: Reg);
1399
1400 for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1401 MBB.addLiveIn(PhysReg: Reg);
1402
1403 MBB.sortUniqueLiveIns();
1404
1405 if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead update them with the
        // correct register value. But it is not clear that the register value
        // alone is enough to describe the location.
1409 for (MachineInstr &MI : MBB) {
1410 if (MI.isDebugValue() && MI.getOperand(i: 0).isFI() &&
1411 !MFI.isFixedObjectIndex(ObjectIdx: MI.getOperand(i: 0).getIndex()) &&
1412 SpillFIs[MI.getOperand(i: 0).getIndex()]) {
1413 MI.getOperand(i: 0).ChangeToRegister(Reg: Register(), isDef: false /*isDef*/);
1414 }
1415 }
1416 }
1417 }
1418 }
1419
1420 // At this point we've already allocated all spilled SGPRs to VGPRs if we
1421 // can. Any remaining SGPR spills will go to memory, so move them back to the
1422 // default stack.
1423 bool HaveSGPRToVMemSpill =
1424 FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1425 assert(allSGPRSpillsAreDead(MF) &&
1426 "SGPR spill should have been removed in SILowerSGPRSpills");
1427
1428 // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1429 // but currently hasNonSpillStackObjects is set only from source
1430 // allocas. Stack temps produced from legalization are not counted currently.
1431 if (!allStackObjectsAreDead(MFI)) {
1432 assert(RS && "RegScavenger required if spilling");
1433
1434 // Add an emergency spill slot
1435 RS->addScavengingFrameIndex(FI: FuncInfo->getScavengeFI(MFI, TRI: *TRI));
1436
1437 // If we are spilling SGPRs to memory with a large frame, we may need a
1438 // second VGPR emergency frame index.
1439 if (HaveSGPRToVMemSpill &&
1440 allocateScavengingFrameIndexesNearIncomingSP(MF)) {
1441 RS->addScavengingFrameIndex(FI: MFI.CreateStackObject(Size: 4, Alignment: Align(4), isSpillSlot: false));
1442 }
1443 }
1444}
1445
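// Now that register allocation is complete, try to shift the specially
// reserved registers (the gfx908 VGPR used for AGPR copies and the SGPR pair
// reserved for long branches) down to lower unused registers.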
1446void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
1447 MachineFunction &MF, RegScavenger *RS) const {
1448 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1449 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1450 MachineRegisterInfo &MRI = MF.getRegInfo();
1451 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1452
1453 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1454 // On gfx908, we initially reserved the highest available VGPR for the AGPR
1455 // copy. Now that RA is done, check whether an unused VGPR exists that is
1456 // lower than the one reserved before RA. If one exists, use it for the AGPR
1457 // copy instead.
1458 Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1459 Register UnusedLowVGPR =
1460 TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
1461 if (UnusedLowVGPR && (TRI->getHWRegIndex(Reg: UnusedLowVGPR) <
1462 TRI->getHWRegIndex(Reg: VGPRForAGPRCopy))) {
1463 // Reserve this newly identified VGPR for the AGPR copy.
1464 // Reserved registers should already be frozen at this point,
1465 // so we can avoid calling MRI.freezeReservedRegs and just use
1466 // MRI.reserveReg.
1467 FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1468 MRI.reserveReg(UnusedLowVGPR, TRI);
1469 }
1470 }
1471 // We initially reserved the highest available SGPR pair for long branches;
1472 // now, after RA, we shift down to a lower unused one if one exists.
1473 Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
1474 Register UnusedLowSGPR =
1475 TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
1476 // If LongBranchReservedReg is null, we didn't find a long branch and never
1477 // reserved a register to begin with, so there is nothing to shift down.
1478 // If UnusedLowSGPR is null, there is no available lower register to use,
1479 // so just keep the original one we set.
1480 if (LongBranchReservedReg && UnusedLowSGPR) {
1481 FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
1482 MRI.reserveReg(UnusedLowSGPR, TRI);
1483 }
1484}
1485
1486// The special SGPR spills, like the ones needed for FP, BP, or any reserved
1487// registers, are delayed until frame lowering.
1488void SIFrameLowering::determinePrologEpilogSGPRSaves(
1489 MachineFunction &MF, BitVector &SavedVGPRs,
1490 bool NeedExecCopyReservedReg) const {
1491 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1492 MachineRegisterInfo &MRI = MF.getRegInfo();
1493 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1494 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1495 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1496 LiveRegUnits LiveUnits;
1497 LiveUnits.init(*TRI);
1498 // Initially mark callee saved registers as used so we will not choose them
1499 // while looking for scratch SGPRs.
1500 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1501 for (unsigned I = 0; CSRegs[I]; ++I)
1502 LiveUnits.addReg(Reg: CSRegs[I]);
1503
1504 const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
1505
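// If whole-wave register spills were seen, an SGPR is needed to save and
// restore EXEC around them. Prefer a completely unused scratch SGPR; otherwise
// arrange a spill for the reserved EXEC-copy register.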
1506 Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
1507 if (NeedExecCopyReservedReg ||
1508 (ReservedRegForExecCopy &&
1509 MRI.isPhysRegUsed(PhysReg: ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
1510 MRI.reserveReg(ReservedRegForExecCopy, TRI);
1511 Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
1512 if (UnusedScratchReg) {
1513 // If we found an unused scratch SGPR, use that register itself for the EXEC
1514 // copy; no spill is needed in that case.
1515 MFI->setSGPRForEXECCopy(UnusedScratchReg);
1516 MRI.replaceRegWith(FromReg: ReservedRegForExecCopy, ToReg: UnusedScratchReg);
1517 LiveUnits.addReg(Reg: UnusedScratchReg);
1518 } else {
1519 // Needs spill.
1520 assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
1521 "Re-reserving spill slot for EXEC copy register");
1522 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: ReservedRegForExecCopy, RC,
1523 /*IncludeScratchCopy=*/false);
1524 }
1525 } else if (ReservedRegForExecCopy) {
1526 // No whole-wave copies or spills were encountered, so reset it at this
1527 // point.
1528 MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
1529 }
1530
1531 // hasFP only knows about stack objects that already exist. We're now
1532 // determining the stack slots that will be created, so we have to predict
1533 // them. Stack objects force FP usage with calls.
1534 //
1535 // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1536 // don't want to report it here.
1537 //
1538 // FIXME: Is this really hasReservedCallFrame?
1539 const bool WillHaveFP =
1540 FrameInfo.hasCalls() &&
1541 (SavedVGPRs.any() || !allStackObjectsAreDead(MFI: FrameInfo));
1542
1543 if (WillHaveFP || hasFP(MF)) {
1544 Register FramePtrReg = MFI->getFrameOffsetReg();
1545 assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1546 "Re-reserving spill slot for FP");
1547 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: FramePtrReg);
1548 }
1549
1550 if (TRI->hasBasePointer(MF)) {
1551 Register BasePtrReg = TRI->getBaseRegister();
1552 assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1553 "Re-reserving spill slot for BP");
1554 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: BasePtrReg);
1555 }
1556}
1557
1558// Only report VGPRs to generic code.
1559void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
1560 BitVector &SavedVGPRs,
1561 RegScavenger *RS) const {
1562 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1563
1564 // If this is a function with the amdgpu_cs_chain[_preserve] calling
1565 // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
1566 // we don't need to save and restore anything.
1567 if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
1568 return;
1569
1570 MFI->shiftSpillPhysVGPRsToLowestRange(MF);
1571
1572 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs&: SavedVGPRs, RS);
1573 if (MFI->isEntryFunction())
1574 return;
1575
1576 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1577 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1578 const SIInstrInfo *TII = ST.getInstrInfo();
1579 bool NeedExecCopyReservedReg = false;
1580
1581 MachineInstr *ReturnMI = nullptr;
1582 for (MachineBasicBlock &MBB : MF) {
1583 for (MachineInstr &MI : MBB) {
1584 // WRITELANE instructions used for SGPR spills can overwrite the inactive
1585 // lanes of VGPRs, so the callee must spill and restore those VGPRs even if
1586 // they are marked caller-saved.
1587
1588 // TODO: Handle this elsewhere at an early point. Walking through all MBBs
1589 // here would be a bad heuristic. A better way would be to call
1590 // allocateWWMSpill during the regalloc pipeline whenever a physical
1591 // register is allocated for the intended virtual registers.
1592 if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
1593 MFI->allocateWWMSpill(MF, VGPR: MI.getOperand(i: 0).getReg());
1594 else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
1595 MFI->allocateWWMSpill(MF, VGPR: MI.getOperand(i: 1).getReg());
1596 else if (TII->isWWMRegSpillOpcode(Opcode: MI.getOpcode()))
1597 NeedExecCopyReservedReg = true;
1598 else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
1599 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1600 (MFI->isChainFunction() &&
1601 TII->isChainCallOpcode(MI.getOpcode()))) {
1602 // We expect all returns to be the same size.
1603 assert(!ReturnMI ||
1604 (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1605 count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1606 ReturnMI = &MI;
1607 }
1608 }
1609 }
1610
1611 // Remove any VGPRs used in the return value because these do not need to be saved.
1612 // This prevents CSR restore from clobbering return VGPRs.
1613 if (ReturnMI) {
1614 for (auto &Op : ReturnMI->operands()) {
1615 if (Op.isReg())
1616 SavedVGPRs.reset(Idx: Op.getReg());
1617 }
1618 }
1619
1620 // Ignore the SGPRs the default implementation found.
1621 SavedVGPRs.clearBitsNotInMask(Mask: TRI->getAllVectorRegMask());
1622
1623 // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1624 // gfx908 has no AGPR loads and stores, so spilling AGPRs would also require
1625 // a temporary VGPR.
1626 if (!ST.hasGFX90AInsts())
1627 SavedVGPRs.clearBitsInMask(Mask: TRI->getAllAGPRRegMask());
1628
1629 determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
1630
1631 // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
1632 // allow the default insertion to handle them.
1633 for (auto &Reg : MFI->getWWMSpills())
1634 SavedVGPRs.reset(Idx: Reg.first);
1635
1636 // Mark all lane VGPRs as BB LiveIns.
1637 for (MachineBasicBlock &MBB : MF) {
1638 for (auto &Reg : MFI->getWWMSpills())
1639 MBB.addLiveIn(PhysReg: Reg.first);
1640
1641 MBB.sortUniqueLiveIns();
1642 }
1643}
1644
1645void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1646 BitVector &SavedRegs,
1647 RegScavenger *RS) const {
1648 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1649 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1650 if (MFI->isEntryFunction())
1651 return;
1652
1653 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1654 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1655
1656 // The SP is specifically managed and we don't want extra spills of it.
1657 SavedRegs.reset(Idx: MFI->getStackPtrOffsetReg());
1658
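// Keep a snapshot of everything the generic logic wanted saved before masking
// out the vector registers; it is used below to predict whether an FP will be
// needed.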
1659 const BitVector AllSavedRegs = SavedRegs;
1660 SavedRegs.clearBitsInMask(Mask: TRI->getAllVectorRegMask());
1661
1662 // We have to anticipate introducing CSR VGPR spills, or a spill of the
1663 // caller-saved VGPR reserved for SGPR spills, since we now always create a
1664 // stack entry for it even if there are no other stack objects, and we
1665 // require an FP if there is a call and a stack. A VGPR will be allocated for
1666 // SGPR spills if there are any SGPR spills, whether CSR spills or otherwise.
1667 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1668 const bool WillHaveFP =
1669 FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1670
1671 // FP will be specially managed like SP.
1672 if (WillHaveFP || hasFP(MF))
1673 SavedRegs.reset(Idx: MFI->getFrameOffsetReg());
1674
1675 // The return address use by the return instruction is hidden through the
1676 // SI_RETURN pseudo. Given that, and since IPRA computes actual register
1677 // usage and does not use the CSR list, the clobbering of the return address
1678 // by function calls (D117243) or otherwise (D120922) is not seen by IPRA's
1679 // register usage collection. Marking it saved here ensures save/restore of
1680 // the return address happens in those scenarios.
1681 const MachineRegisterInfo &MRI = MF.getRegInfo();
1682 Register RetAddrReg = TRI->getReturnAddressReg(MF);
1683 if (!MFI->isEntryFunction() &&
1684 (FrameInfo.hasCalls() || MRI.isPhysRegModified(PhysReg: RetAddrReg))) {
1685 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1686 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1687 }
1688}
1689
1690bool SIFrameLowering::assignCalleeSavedSpillSlots(
1691 MachineFunction &MF, const TargetRegisterInfo *TRI,
1692 std::vector<CalleeSavedInfo> &CSI) const {
1693 if (CSI.empty())
1694 return true; // Early exit if no callee saved registers are modified!
1695
1696 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1697 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1698 const SIRegisterInfo *RI = ST.getRegisterInfo();
1699 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1700 Register BasePtrReg = RI->getBaseRegister();
1701 Register SGPRForFPSaveRestoreCopy =
1702 FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg);
1703 Register SGPRForBPSaveRestoreCopy =
1704 FuncInfo->getScratchSGPRCopyDstReg(Reg: BasePtrReg);
1705 if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1706 return false;
1707
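// For the FP/BP saves that were assigned scratch SGPR copies, record the copy
// destination in the CSI entry; returning false lets the default spill slot
// assignment handle the remaining callee-saved registers.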
1708 unsigned NumModifiedRegs = 0;
1709
1710 if (SGPRForFPSaveRestoreCopy)
1711 NumModifiedRegs++;
1712 if (SGPRForBPSaveRestoreCopy)
1713 NumModifiedRegs++;
1714
1715 for (auto &CS : CSI) {
1716 if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
1717 CS.setDstReg(SGPRForFPSaveRestoreCopy);
1718 if (--NumModifiedRegs)
1719 break;
1720 } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
1721 CS.setDstReg(SGPRForBPSaveRestoreCopy);
1722 if (--NumModifiedRegs)
1723 break;
1724 }
1725 }
1726
1727 return false;
1728}
1729
1730bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1731 const MachineFunction &MF) const {
1732
1733 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1734 const MachineFrameInfo &MFI = MF.getFrameInfo();
1735 const SIInstrInfo *TII = ST.getInstrInfo();
1736 uint64_t EstStackSize = MFI.estimateStackSize(MF);
1737 uint64_t MaxOffset = EstStackSize - 1;
1738
1739 // We need the emergency stack slots to be allocated in range of the
1740 // MUBUF/flat scratch immediate offset from the base register, so assign these
1741 // first at the incoming SP position.
1742 //
1743 // TODO: We could try sorting the objects to find a hole in the first bytes
1744 // rather than allocating as close as possible. This could save a lot of
1745 // space on frames with alignment requirements.
1746 if (ST.enableFlatScratch()) {
1747 if (TII->isLegalFLATOffset(Offset: MaxOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
1748 FlatVariant: SIInstrFlags::FlatScratch))
1749 return false;
1750 } else {
1751 if (TII->isLegalMUBUFImmOffset(Imm: MaxOffset))
1752 return false;
1753 }
1754
1755 return true;
1756}
1757
1758MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1759 MachineFunction &MF,
1760 MachineBasicBlock &MBB,
1761 MachineBasicBlock::iterator I) const {
1762 int64_t Amount = I->getOperand(i: 0).getImm();
1763 if (Amount == 0)
1764 return MBB.erase(I);
1765
1766 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1767 const SIInstrInfo *TII = ST.getInstrInfo();
1768 const DebugLoc &DL = I->getDebugLoc();
1769 unsigned Opc = I->getOpcode();
1770 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1771 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(i: 1).getImm() : 0;
1772
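// Without a reserved call frame, adjust SP in place. The per-lane byte amount
// is scaled by getScratchScaleFactor, which accounts for the wave-wide scratch
// addressing used when flat scratch is not enabled.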
1773 if (!hasReservedCallFrame(MF)) {
1774 Amount = alignTo(Size: Amount, A: getStackAlign());
1775 assert(isUInt<32>(Amount) && "exceeded stack address space size");
1776 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1777 Register SPReg = MFI->getStackPtrOffsetReg();
1778
1779 Amount *= getScratchScaleFactor(ST);
1780 if (IsDestroy)
1781 Amount = -Amount;
1782 auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1783 .addReg(SPReg)
1784 .addImm(Amount);
1785 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1786 } else if (CalleePopAmount != 0) {
1787 llvm_unreachable("is this used?");
1788 }
1789
1790 return MBB.erase(I);
1791}
1792
1793/// Returns true if the frame will require a reference to the stack pointer.
1794///
1795/// This is the set of conditions common to setting up the stack pointer in a
1796/// kernel, and for using a frame pointer in a callable function.
1797///
1798/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1799/// references SP.
1800static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1801 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1802}
1803
1804// The FP for kernels is always known 0, so we never really need to set up an
1805// explicit register for it. However, DisableFramePointerElim will force us to
1806// use a register for it.
1807bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1808 const MachineFrameInfo &MFI = MF.getFrameInfo();
1809
1810 // For entry & chain functions we can use an immediate offset in most cases,
1811 // so the presence of calls doesn't imply we need a distinct frame pointer.
1812 if (MFI.hasCalls() &&
1813 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1814 !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
1815 // All offsets are unsigned, so need to be addressed in the same direction
1816 // as stack growth.
1817
1818 // FIXME: This function is pretty broken, since it can be called before the
1819 // frame layout is determined or CSR spills are inserted.
1820 return MFI.getStackSize() != 0;
1821 }
1822
1823 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1824 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1825 MF) ||
1826 MF.getTarget().Options.DisableFramePointerElim(MF);
1827}
1828
1829// This is essentially a reduced version of hasFP for entry functions. Since the
1830// stack pointer is known 0 on entry to kernels, we never really need an FP
1831// register. We may need to initialize the stack pointer depending on the frame
1832// properties, which logically overlaps many of the cases where an ordinary
1833// function would require an FP.
1834// Also used for chain functions. While not technically entry functions, chain
1835// functions may need to set up a stack pointer in some situations.
1836bool SIFrameLowering::requiresStackPointerReference(
1837 const MachineFunction &MF) const {
1838 // Callable functions always require a stack pointer reference.
1839 assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
1840 MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
1841 "only expected to call this for entry points and chain functions");
1842
1843 const MachineFrameInfo &MFI = MF.getFrameInfo();
1844
1845 // Entry points ordinarily don't need to initialize SP. We have to set it up
1846 // for callees if there are any. Also note tail calls are impossible/don't
1847 // make any sense for kernels.
1848 if (MFI.hasCalls())
1849 return true;
1850
1851 // We still need to initialize the SP if we're doing anything weird that
1852 // references the SP, like variable sized stack objects.
1853 return frameTriviallyRequiresSP(MFI);
1854}
1855
