1 | //===----------------------- SIFrameLowering.cpp --------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //==-----------------------------------------------------------------------===// |
8 | |
9 | #include "SIFrameLowering.h" |
10 | #include "AMDGPU.h" |
11 | #include "GCNSubtarget.h" |
12 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
13 | #include "SIMachineFunctionInfo.h" |
14 | #include "llvm/CodeGen/LiveRegUnits.h" |
15 | #include "llvm/CodeGen/MachineFrameInfo.h" |
16 | #include "llvm/CodeGen/RegisterScavenging.h" |
17 | #include "llvm/Target/TargetMachine.h" |
18 | |
19 | using namespace llvm; |
20 | |
21 | #define DEBUG_TYPE "frame-info" |
22 | |
23 | static cl::opt<bool> EnableSpillVGPRToAGPR( |
24 | "amdgpu-spill-vgpr-to-agpr" , |
25 | cl::desc("Enable spilling VGPRs to AGPRs" ), |
26 | cl::ReallyHidden, |
27 | cl::init(Val: true)); |
28 | |
29 | // Find a register matching \p RC from \p LiveUnits which is unused and |
30 | // available throughout the function. On failure, returns AMDGPU::NoRegister. |
31 | // TODO: Rewrite the loop here to iterate over MCRegUnits instead of |
32 | // MCRegisters. This should reduce the number of iterations and avoid redundant |
33 | // checking. |
34 | static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, |
35 | const LiveRegUnits &LiveUnits, |
36 | const TargetRegisterClass &RC) { |
37 | for (MCRegister Reg : RC) { |
38 | if (!MRI.isPhysRegUsed(PhysReg: Reg) && LiveUnits.available(Reg) && |
39 | !MRI.isReserved(PhysReg: Reg)) |
40 | return Reg; |
41 | } |
42 | return MCRegister(); |
43 | } |
44 | |
45 | // Find a scratch register that we can use in the prologue. We avoid using |
46 | // callee-save registers since they may appear to be free when this is called |
47 | // from canUseAsPrologue (during shrink wrapping), but then no longer be free |
48 | // when this is called from emitPrologue. |
49 | static MCRegister findScratchNonCalleeSaveRegister( |
50 | MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits, |
51 | const TargetRegisterClass &RC, bool Unused = false) { |
52 | // Mark callee saved registers as used so we will not choose them. |
53 | const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); |
54 | for (unsigned i = 0; CSRegs[i]; ++i) |
55 | LiveUnits.addReg(Reg: CSRegs[i]); |
56 | |
57 | // We are looking for a register that can be used throughout the entire |
58 | // function, so any use is unacceptable. |
59 | if (Unused) |
60 | return findUnusedRegister(MRI, LiveUnits, RC); |
61 | |
62 | for (MCRegister Reg : RC) { |
63 | if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg)) |
64 | return Reg; |
65 | } |
66 | |
67 | return MCRegister(); |
68 | } |
69 | |
70 | /// Query target location for spilling SGPRs |
71 | /// \p IncludeScratchCopy : Also look for free scratch SGPRs |
72 | static void getVGPRSpillLaneOrTempRegister( |
73 | MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR, |
74 | const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, |
75 | bool IncludeScratchCopy = true) { |
76 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
77 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
78 | |
79 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
80 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
81 | unsigned Size = TRI->getSpillSize(RC); |
82 | Align Alignment = TRI->getSpillAlign(RC); |
83 | |
84 | // We need to save and restore the given SGPR. |
85 | |
86 | Register ScratchSGPR; |
87 | // 1: Try to save the given register into an unused scratch SGPR. The |
88 | // LiveUnits should have all the callee saved registers marked as used. For |
89 | // certain cases we skip copy to scratch SGPR. |
90 | if (IncludeScratchCopy) |
91 | ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC); |
92 | |
93 | if (!ScratchSGPR) { |
94 | int FI = FrameInfo.CreateStackObject(Size, Alignment, isSpillSlot: true, Alloca: nullptr, |
95 | ID: TargetStackID::SGPRSpill); |
96 | |
97 | if (TRI->spillSGPRToVGPR() && |
98 | MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true, |
99 | /*IsPrologEpilog=*/true)) { |
100 | // 2: There's no free lane to spill, and no free register to save the |
101 | // SGPR, so we're forced to take another VGPR to use for the spill. |
102 | MFI->addToPrologEpilogSGPRSpills( |
103 | Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo( |
104 | SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); |
105 | |
106 | LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front(); |
107 | dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " |
108 | << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane |
109 | << '\n';); |
110 | } else { |
111 | // Remove dead <FI> index |
112 | MF.getFrameInfo().RemoveStackObject(ObjectIdx: FI); |
113 | // 3: If all else fails, spill the register to memory. |
114 | FI = FrameInfo.CreateSpillStackObject(Size, Alignment); |
115 | MFI->addToPrologEpilogSGPRSpills( |
116 | Reg: SGPR, |
117 | SI: PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI)); |
118 | LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling " |
119 | << printReg(SGPR, TRI) << '\n'); |
120 | } |
121 | } else { |
122 | MFI->addToPrologEpilogSGPRSpills( |
123 | Reg: SGPR, SI: PrologEpilogSGPRSaveRestoreInfo( |
124 | SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR)); |
125 | LiveUnits.addReg(Reg: ScratchSGPR); |
126 | LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to " |
127 | << printReg(ScratchSGPR, TRI) << '\n'); |
128 | } |
129 | } |
130 | |
131 | // We need to specially emit stack operations here because a different frame |
132 | // register is used than in the rest of the function, as getFrameRegister would |
133 | // use. |
134 | static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, |
135 | const SIMachineFunctionInfo &FuncInfo, |
136 | LiveRegUnits &LiveUnits, MachineFunction &MF, |
137 | MachineBasicBlock &MBB, |
138 | MachineBasicBlock::iterator I, const DebugLoc &DL, |
139 | Register SpillReg, int FI, Register FrameReg, |
140 | int64_t DwordOff = 0) { |
141 | unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR |
142 | : AMDGPU::BUFFER_STORE_DWORD_OFFSET; |
143 | |
144 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
145 | MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); |
146 | MachineMemOperand *MMO = MF.getMachineMemOperand( |
147 | PtrInfo, F: MachineMemOperand::MOStore, Size: FrameInfo.getObjectSize(ObjectIdx: FI), |
148 | BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI)); |
149 | LiveUnits.addReg(Reg: SpillReg); |
150 | bool IsKill = !MBB.isLiveIn(Reg: SpillReg); |
151 | TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: IsKill, ScratchOffsetReg: FrameReg, |
152 | InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits); |
153 | if (IsKill) |
154 | LiveUnits.removeReg(Reg: SpillReg); |
155 | } |
156 | |
157 | static void buildEpilogRestore(const GCNSubtarget &ST, |
158 | const SIRegisterInfo &TRI, |
159 | const SIMachineFunctionInfo &FuncInfo, |
160 | LiveRegUnits &LiveUnits, MachineFunction &MF, |
161 | MachineBasicBlock &MBB, |
162 | MachineBasicBlock::iterator I, |
163 | const DebugLoc &DL, Register SpillReg, int FI, |
164 | Register FrameReg, int64_t DwordOff = 0) { |
165 | unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR |
166 | : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; |
167 | |
168 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
169 | MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); |
170 | MachineMemOperand *MMO = MF.getMachineMemOperand( |
171 | PtrInfo, F: MachineMemOperand::MOLoad, Size: FrameInfo.getObjectSize(ObjectIdx: FI), |
172 | BaseAlignment: FrameInfo.getObjectAlign(ObjectIdx: FI)); |
173 | TRI.buildSpillLoadStore(MBB, MI: I, DL, LoadStoreOp: Opc, Index: FI, ValueReg: SpillReg, ValueIsKill: false, ScratchOffsetReg: FrameReg, |
174 | InstrOffset: DwordOff, MMO, RS: nullptr, LiveUnits: &LiveUnits); |
175 | } |
176 | |
177 | static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
178 | const DebugLoc &DL, const SIInstrInfo *TII, |
179 | Register TargetReg) { |
180 | MachineFunction *MF = MBB.getParent(); |
181 | const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
182 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
183 | const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); |
184 | Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0); |
185 | Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1); |
186 | |
187 | if (MFI->getGITPtrHigh() != 0xffffffff) { |
188 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetHi) |
189 | .addImm(Val: MFI->getGITPtrHigh()) |
190 | .addReg(RegNo: TargetReg, flags: RegState::ImplicitDefine); |
191 | } else { |
192 | const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo); |
193 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: GetPC64, DestReg: TargetReg); |
194 | } |
195 | Register GitPtrLo = MFI->getGITPtrLoReg(MF: *MF); |
196 | MF->getRegInfo().addLiveIn(Reg: GitPtrLo); |
197 | MBB.addLiveIn(PhysReg: GitPtrLo); |
198 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: TargetLo) |
199 | .addReg(RegNo: GitPtrLo); |
200 | } |
201 | |
202 | static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI, |
203 | const SIMachineFunctionInfo *FuncInfo, |
204 | MachineFunction &MF, MachineBasicBlock &MBB, |
205 | MachineBasicBlock::iterator MBBI, bool IsProlog) { |
206 | if (LiveUnits.empty()) { |
207 | LiveUnits.init(TRI); |
208 | if (IsProlog) { |
209 | LiveUnits.addLiveIns(MBB); |
210 | } else { |
211 | // In epilog. |
212 | LiveUnits.addLiveOuts(MBB); |
213 | LiveUnits.stepBackward(MI: *MBBI); |
214 | } |
215 | } |
216 | } |
217 | |
218 | namespace llvm { |
219 | |
220 | // SpillBuilder to save/restore special SGPR spills like the one needed for FP, |
221 | // BP, etc. These spills are delayed until the current function's frame is |
222 | // finalized. For a given register, the builder uses the |
223 | // PrologEpilogSGPRSaveRestoreInfo to decide the spill method. |
224 | class PrologEpilogSGPRSpillBuilder { |
225 | MachineBasicBlock::iterator MI; |
226 | MachineBasicBlock &MBB; |
227 | MachineFunction &MF; |
228 | const GCNSubtarget &ST; |
229 | MachineFrameInfo &MFI; |
230 | SIMachineFunctionInfo *FuncInfo; |
231 | const SIInstrInfo *TII; |
232 | const SIRegisterInfo &TRI; |
233 | Register SuperReg; |
234 | const PrologEpilogSGPRSaveRestoreInfo SI; |
235 | LiveRegUnits &LiveUnits; |
236 | const DebugLoc &DL; |
237 | Register FrameReg; |
238 | ArrayRef<int16_t> SplitParts; |
239 | unsigned NumSubRegs; |
240 | unsigned EltSize = 4; |
241 | |
242 | void saveToMemory(const int FI) const { |
243 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
244 | assert(!MFI.isDeadObjectIndex(FI)); |
245 | |
246 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ true); |
247 | |
248 | MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( |
249 | MRI, LiveUnits, AMDGPU::VGPR_32RegClass); |
250 | if (!TmpVGPR) |
251 | report_fatal_error(reason: "failed to find free scratch register" ); |
252 | |
253 | for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { |
254 | Register SubReg = NumSubRegs == 1 |
255 | ? SuperReg |
256 | : Register(TRI.getSubReg(SuperReg, SplitParts[I])); |
257 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) |
258 | .addReg(SubReg); |
259 | |
260 | buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, SpillReg: TmpVGPR, |
261 | FI, FrameReg, DwordOff); |
262 | DwordOff += 4; |
263 | } |
264 | } |
265 | |
266 | void saveToVGPRLane(const int FI) const { |
267 | assert(!MFI.isDeadObjectIndex(FI)); |
268 | |
269 | assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); |
270 | ArrayRef<SIRegisterInfo::SpilledReg> Spill = |
271 | FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI); |
272 | assert(Spill.size() == NumSubRegs); |
273 | |
274 | for (unsigned I = 0; I < NumSubRegs; ++I) { |
275 | Register SubReg = NumSubRegs == 1 |
276 | ? SuperReg |
277 | : Register(TRI.getSubReg(SuperReg, SplitParts[I])); |
278 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR), |
279 | Spill[I].VGPR) |
280 | .addReg(SubReg) |
281 | .addImm(Spill[I].Lane) |
282 | .addReg(Spill[I].VGPR, RegState::Undef); |
283 | } |
284 | } |
285 | |
286 | void copyToScratchSGPR(Register DstReg) const { |
287 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg) |
288 | .addReg(SuperReg) |
289 | .setMIFlag(MachineInstr::FrameSetup); |
290 | } |
291 | |
292 | void restoreFromMemory(const int FI) { |
293 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
294 | |
295 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI: MI, /*IsProlog*/ false); |
296 | MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( |
297 | MRI, LiveUnits, AMDGPU::VGPR_32RegClass); |
298 | if (!TmpVGPR) |
299 | report_fatal_error(reason: "failed to find free scratch register" ); |
300 | |
301 | for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) { |
302 | Register SubReg = NumSubRegs == 1 |
303 | ? SuperReg |
304 | : Register(TRI.getSubReg(SuperReg, SplitParts[I])); |
305 | |
306 | buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MI, DL, |
307 | SpillReg: TmpVGPR, FI, FrameReg, DwordOff); |
308 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) |
309 | .addReg(TmpVGPR, RegState::Kill); |
310 | DwordOff += 4; |
311 | } |
312 | } |
313 | |
314 | void restoreFromVGPRLane(const int FI) { |
315 | assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); |
316 | ArrayRef<SIRegisterInfo::SpilledReg> Spill = |
317 | FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FrameIndex: FI); |
318 | assert(Spill.size() == NumSubRegs); |
319 | |
320 | for (unsigned I = 0; I < NumSubRegs; ++I) { |
321 | Register SubReg = NumSubRegs == 1 |
322 | ? SuperReg |
323 | : Register(TRI.getSubReg(SuperReg, SplitParts[I])); |
324 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) |
325 | .addReg(Spill[I].VGPR) |
326 | .addImm(Spill[I].Lane); |
327 | } |
328 | } |
329 | |
330 | void copyFromScratchSGPR(Register SrcReg) const { |
331 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg) |
332 | .addReg(SrcReg) |
333 | .setMIFlag(MachineInstr::FrameDestroy); |
334 | } |
335 | |
336 | public: |
337 | PrologEpilogSGPRSpillBuilder(Register Reg, |
338 | const PrologEpilogSGPRSaveRestoreInfo SI, |
339 | MachineBasicBlock &MBB, |
340 | MachineBasicBlock::iterator MI, |
341 | const DebugLoc &DL, const SIInstrInfo *TII, |
342 | const SIRegisterInfo &TRI, |
343 | LiveRegUnits &LiveUnits, Register FrameReg) |
344 | : MI(MI), MBB(MBB), MF(*MBB.getParent()), |
345 | ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()), |
346 | FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), |
347 | SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL), |
348 | FrameReg(FrameReg) { |
349 | const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg); |
350 | SplitParts = TRI.getRegSplitParts(RC, EltSize); |
351 | NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); |
352 | |
353 | assert(SuperReg != AMDGPU::M0 && "m0 should never spill" ); |
354 | } |
355 | |
356 | void save() { |
357 | switch (SI.getKind()) { |
358 | case SGPRSaveKind::SPILL_TO_MEM: |
359 | return saveToMemory(FI: SI.getIndex()); |
360 | case SGPRSaveKind::SPILL_TO_VGPR_LANE: |
361 | return saveToVGPRLane(FI: SI.getIndex()); |
362 | case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: |
363 | return copyToScratchSGPR(DstReg: SI.getReg()); |
364 | } |
365 | } |
366 | |
367 | void restore() { |
368 | switch (SI.getKind()) { |
369 | case SGPRSaveKind::SPILL_TO_MEM: |
370 | return restoreFromMemory(FI: SI.getIndex()); |
371 | case SGPRSaveKind::SPILL_TO_VGPR_LANE: |
372 | return restoreFromVGPRLane(FI: SI.getIndex()); |
373 | case SGPRSaveKind::COPY_TO_SCRATCH_SGPR: |
374 | return copyFromScratchSGPR(SrcReg: SI.getReg()); |
375 | } |
376 | } |
377 | }; |
378 | |
379 | } // namespace llvm |
380 | |
381 | // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` |
382 | void SIFrameLowering::emitEntryFunctionFlatScratchInit( |
383 | MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
384 | const DebugLoc &DL, Register ScratchWaveOffsetReg) const { |
385 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
386 | const SIInstrInfo *TII = ST.getInstrInfo(); |
387 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
388 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
389 | |
390 | // We don't need this if we only have spills since there is no user facing |
391 | // scratch. |
392 | |
393 | // TODO: If we know we don't have flat instructions earlier, we can omit |
394 | // this from the input registers. |
395 | // |
396 | // TODO: We only need to know if we access scratch space through a flat |
397 | // pointer. Because we only detect if flat instructions are used at all, |
398 | // this will be used more often than necessary on VI. |
399 | |
400 | Register FlatScrInitLo; |
401 | Register FlatScrInitHi; |
402 | |
403 | if (ST.isAmdPalOS()) { |
404 | // Extract the scratch offset from the descriptor in the GIT |
405 | LiveRegUnits LiveUnits; |
406 | LiveUnits.init(*TRI); |
407 | LiveUnits.addLiveIns(MBB); |
408 | |
409 | // Find unused reg to load flat scratch init into |
410 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
411 | Register FlatScrInit = AMDGPU::NoRegister; |
412 | ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF); |
413 | unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2; |
414 | AllSGPR64s = AllSGPR64s.slice( |
415 | N: std::min(a: static_cast<unsigned>(AllSGPR64s.size()), b: NumPreloaded)); |
416 | Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); |
417 | for (MCPhysReg Reg : AllSGPR64s) { |
418 | if (LiveUnits.available(Reg) && !MRI.isReserved(PhysReg: Reg) && |
419 | MRI.isAllocatable(PhysReg: Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { |
420 | FlatScrInit = Reg; |
421 | break; |
422 | } |
423 | } |
424 | assert(FlatScrInit && "Failed to find free register for scratch init" ); |
425 | |
426 | FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0); |
427 | FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1); |
428 | |
429 | buildGitPtr(MBB, I, DL, TII, TargetReg: FlatScrInit); |
430 | |
431 | // We now have the GIT ptr - now get the scratch descriptor from the entry |
432 | // at offset 0 (or offset 16 for a compute shader). |
433 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
434 | const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); |
435 | auto *MMO = MF.getMachineMemOperand( |
436 | PtrInfo, |
437 | F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | |
438 | MachineMemOperand::MODereferenceable, |
439 | Size: 8, BaseAlignment: Align(4)); |
440 | unsigned Offset = |
441 | MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; |
442 | const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); |
443 | unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); |
444 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: FlatScrInit) |
445 | .addReg(RegNo: FlatScrInit) |
446 | .addImm(Val: EncodedOffset) // offset |
447 | .addImm(Val: 0) // cpol |
448 | .addMemOperand(MMO); |
449 | |
450 | // Mask the offset in [47:0] of the descriptor |
451 | const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32); |
452 | auto And = BuildMI(BB&: MBB, I, MIMD: DL, MCID: SAndB32, DestReg: FlatScrInitHi) |
453 | .addReg(RegNo: FlatScrInitHi) |
454 | .addImm(Val: 0xffff); |
455 | And->getOperand(i: 3).setIsDead(); // Mark SCC as dead. |
456 | } else { |
457 | Register FlatScratchInitReg = |
458 | MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); |
459 | assert(FlatScratchInitReg); |
460 | |
461 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
462 | MRI.addLiveIn(Reg: FlatScratchInitReg); |
463 | MBB.addLiveIn(PhysReg: FlatScratchInitReg); |
464 | |
465 | FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); |
466 | FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); |
467 | } |
468 | |
469 | // Do a 64-bit pointer add. |
470 | if (ST.flatScratchIsPointer()) { |
471 | if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { |
472 | BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) |
473 | .addReg(FlatScrInitLo) |
474 | .addReg(ScratchWaveOffsetReg); |
475 | auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), |
476 | FlatScrInitHi) |
477 | .addReg(FlatScrInitHi) |
478 | .addImm(0); |
479 | Addc->getOperand(3).setIsDead(); // Mark SCC as dead. |
480 | |
481 | using namespace AMDGPU::Hwreg; |
482 | BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)) |
483 | .addReg(FlatScrInitLo) |
484 | .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32))); |
485 | BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)) |
486 | .addReg(FlatScrInitHi) |
487 | .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32))); |
488 | return; |
489 | } |
490 | |
491 | // For GFX9. |
492 | BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) |
493 | .addReg(FlatScrInitLo) |
494 | .addReg(ScratchWaveOffsetReg); |
495 | auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), |
496 | AMDGPU::FLAT_SCR_HI) |
497 | .addReg(FlatScrInitHi) |
498 | .addImm(0); |
499 | Addc->getOperand(3).setIsDead(); // Mark SCC as dead. |
500 | |
501 | return; |
502 | } |
503 | |
504 | assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); |
505 | |
506 | // Copy the size in bytes. |
507 | BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) |
508 | .addReg(FlatScrInitHi, RegState::Kill); |
509 | |
510 | // Add wave offset in bytes to private base offset. |
511 | // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. |
512 | BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo) |
513 | .addReg(FlatScrInitLo) |
514 | .addReg(ScratchWaveOffsetReg); |
515 | |
516 | // Convert offset to 256-byte units. |
517 | auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), |
518 | AMDGPU::FLAT_SCR_HI) |
519 | .addReg(FlatScrInitLo, RegState::Kill) |
520 | .addImm(8); |
521 | LShr->getOperand(3).setIsDead(); // Mark SCC as dead. |
522 | } |
523 | |
524 | // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not |
525 | // memory. They should have been removed by now. |
526 | static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { |
527 | for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); |
528 | I != E; ++I) { |
529 | if (!MFI.isDeadObjectIndex(ObjectIdx: I)) |
530 | return false; |
531 | } |
532 | |
533 | return true; |
534 | } |
535 | |
536 | // Shift down registers reserved for the scratch RSRC. |
537 | Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( |
538 | MachineFunction &MF) const { |
539 | |
540 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
541 | const SIInstrInfo *TII = ST.getInstrInfo(); |
542 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
543 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
544 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
545 | |
546 | assert(MFI->isEntryFunction()); |
547 | |
548 | Register ScratchRsrcReg = MFI->getScratchRSrcReg(); |
549 | |
550 | if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(PhysReg: ScratchRsrcReg) && |
551 | allStackObjectsAreDead(MFI: MF.getFrameInfo()))) |
552 | return Register(); |
553 | |
554 | if (ST.hasSGPRInitBug() || |
555 | ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF)) |
556 | return ScratchRsrcReg; |
557 | |
558 | // We reserved the last registers for this. Shift it down to the end of those |
559 | // which were actually used. |
560 | // |
561 | // FIXME: It might be safer to use a pseudoregister before replacement. |
562 | |
563 | // FIXME: We should be able to eliminate unused input registers. We only |
564 | // cannot do this for the resources required for scratch access. For now we |
565 | // skip over user SGPRs and may leave unused holes. |
566 | |
567 | unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; |
568 | ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF); |
569 | AllSGPR128s = AllSGPR128s.slice(N: std::min(a: static_cast<unsigned>(AllSGPR128s.size()), b: NumPreloaded)); |
570 | |
571 | // Skip the last N reserved elements because they should have already been |
572 | // reserved for VCC etc. |
573 | Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); |
574 | for (MCPhysReg Reg : AllSGPR128s) { |
575 | // Pick the first unallocated one. Make sure we don't clobber the other |
576 | // reserved input we needed. Also for PAL, make sure we don't clobber |
577 | // the GIT pointer passed in SGPR0 or SGPR8. |
578 | if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) && |
579 | (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) { |
580 | MRI.replaceRegWith(FromReg: ScratchRsrcReg, ToReg: Reg); |
581 | MFI->setScratchRSrcReg(Reg); |
582 | return Reg; |
583 | } |
584 | } |
585 | |
586 | return ScratchRsrcReg; |
587 | } |
588 | |
589 | static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { |
590 | return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize(); |
591 | } |
592 | |
593 | void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, |
594 | MachineBasicBlock &MBB) const { |
595 | assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported" ); |
596 | |
597 | // FIXME: If we only have SGPR spills, we won't actually be using scratch |
598 | // memory since these spill to VGPRs. We should be cleaning up these unused |
599 | // SGPR spill frame indices somewhere. |
600 | |
601 | // FIXME: We still have implicit uses on SGPR spill instructions in case they |
602 | // need to spill to vector memory. It's likely that will not happen, but at |
603 | // this point it appears we need the setup. This part of the prolog should be |
604 | // emitted after frame indices are eliminated. |
605 | |
606 | // FIXME: Remove all of the isPhysRegUsed checks |
607 | |
608 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
609 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
610 | const SIInstrInfo *TII = ST.getInstrInfo(); |
611 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
612 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
613 | const Function &F = MF.getFunction(); |
614 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
615 | |
616 | assert(MFI->isEntryFunction()); |
617 | |
618 | Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( |
619 | Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); |
620 | |
621 | // We need to do the replacement of the private segment buffer register even |
622 | // if there are no stack objects. There could be stores to undef or a |
623 | // constant without an associated object. |
624 | // |
625 | // This will return `Register()` in cases where there are no actual |
626 | // uses of the SRSRC. |
627 | Register ScratchRsrcReg; |
628 | if (!ST.enableFlatScratch()) |
629 | ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); |
630 | |
631 | // Make the selected register live throughout the function. |
632 | if (ScratchRsrcReg) { |
633 | for (MachineBasicBlock &OtherBB : MF) { |
634 | if (&OtherBB != &MBB) { |
635 | OtherBB.addLiveIn(PhysReg: ScratchRsrcReg); |
636 | } |
637 | } |
638 | } |
639 | |
640 | // Now that we have fixed the reserved SRSRC we need to locate the |
641 | // (potentially) preloaded SRSRC. |
642 | Register PreloadedScratchRsrcReg; |
643 | if (ST.isAmdHsaOrMesa(F)) { |
644 | PreloadedScratchRsrcReg = |
645 | MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); |
646 | if (ScratchRsrcReg && PreloadedScratchRsrcReg) { |
647 | // We added live-ins during argument lowering, but since they were not |
648 | // used they were deleted. We're adding the uses now, so add them back. |
649 | MRI.addLiveIn(Reg: PreloadedScratchRsrcReg); |
650 | MBB.addLiveIn(PhysReg: PreloadedScratchRsrcReg); |
651 | } |
652 | } |
653 | |
654 | // Debug location must be unknown since the first debug location is used to |
655 | // determine the end of the prologue. |
656 | DebugLoc DL; |
657 | MachineBasicBlock::iterator I = MBB.begin(); |
658 | |
659 | // We found the SRSRC first because it needs four registers and has an |
660 | // alignment requirement. If the SRSRC that we found is clobbering with |
661 | // the scratch wave offset, which may be in a fixed SGPR or a free SGPR |
662 | // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch |
663 | // wave offset to a free SGPR. |
664 | Register ScratchWaveOffsetReg; |
665 | if (PreloadedScratchWaveOffsetReg && |
666 | TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) { |
667 | ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF); |
668 | unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); |
669 | AllSGPRs = AllSGPRs.slice( |
670 | N: std::min(a: static_cast<unsigned>(AllSGPRs.size()), b: NumPreloaded)); |
671 | Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); |
672 | for (MCPhysReg Reg : AllSGPRs) { |
673 | if (!MRI.isPhysRegUsed(PhysReg: Reg) && MRI.isAllocatable(PhysReg: Reg) && |
674 | !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) { |
675 | ScratchWaveOffsetReg = Reg; |
676 | BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) |
677 | .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); |
678 | break; |
679 | } |
680 | } |
681 | } else { |
682 | ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; |
683 | } |
684 | assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg); |
685 | |
686 | if (requiresStackPointerReference(MF)) { |
687 | Register SPReg = MFI->getStackPtrOffsetReg(); |
688 | assert(SPReg != AMDGPU::SP_REG); |
689 | BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) |
690 | .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST)); |
691 | } |
692 | |
693 | if (hasFP(MF)) { |
694 | Register FPReg = MFI->getFrameOffsetReg(); |
695 | assert(FPReg != AMDGPU::FP_REG); |
696 | BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); |
697 | } |
698 | |
699 | bool NeedsFlatScratchInit = |
700 | MFI->getUserSGPRInfo().hasFlatScratchInit() && |
701 | (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || |
702 | (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); |
703 | |
704 | if ((NeedsFlatScratchInit || ScratchRsrcReg) && |
705 | PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { |
706 | MRI.addLiveIn(Reg: PreloadedScratchWaveOffsetReg); |
707 | MBB.addLiveIn(PhysReg: PreloadedScratchWaveOffsetReg); |
708 | } |
709 | |
710 | if (NeedsFlatScratchInit) { |
711 | emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); |
712 | } |
713 | |
714 | if (ScratchRsrcReg) { |
715 | emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, |
716 | PreloadedPrivateBufferReg: PreloadedScratchRsrcReg, |
717 | ScratchRsrcReg, ScratchWaveOffsetReg); |
718 | } |
719 | } |
720 | |
721 | // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` |
722 | void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( |
723 | MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, |
724 | const DebugLoc &DL, Register PreloadedScratchRsrcReg, |
725 | Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { |
726 | |
727 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
728 | const SIInstrInfo *TII = ST.getInstrInfo(); |
729 | const SIRegisterInfo *TRI = &TII->getRegisterInfo(); |
730 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
731 | const Function &Fn = MF.getFunction(); |
732 | |
733 | if (ST.isAmdPalOS()) { |
734 | // The pointer to the GIT is formed from the offset passed in and either |
735 | // the amdgpu-git-ptr-high function attribute or the top part of the PC |
736 | Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); |
737 | Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); |
738 | |
739 | buildGitPtr(MBB, I, DL, TII, TargetReg: Rsrc01); |
740 | |
741 | // We now have the GIT ptr - now get the scratch descriptor from the entry |
742 | // at offset 0 (or offset 16 for a compute shader). |
743 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
744 | const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM); |
745 | auto MMO = MF.getMachineMemOperand(PtrInfo, |
746 | F: MachineMemOperand::MOLoad | |
747 | MachineMemOperand::MOInvariant | |
748 | MachineMemOperand::MODereferenceable, |
749 | Size: 16, BaseAlignment: Align(4)); |
750 | unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; |
751 | const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); |
752 | unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); |
753 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX4, DestReg: ScratchRsrcReg) |
754 | .addReg(RegNo: Rsrc01) |
755 | .addImm(Val: EncodedOffset) // offset |
756 | .addImm(Val: 0) // cpol |
757 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine) |
758 | .addMemOperand(MMO); |
759 | |
760 | // The driver will always set the SRD for wave 64 (bits 118:117 of |
761 | // descriptor / bits 22:21 of third sub-reg will be 0b11) |
762 | // If the shader is actually wave32 we have to modify the const_index_stride |
763 | // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The |
764 | // reason the driver does this is that there can be cases where it presents |
765 | // 2 shaders with different wave size (e.g. VsFs). |
766 | // TODO: convert to using SCRATCH instructions or multiple SRD buffers |
767 | if (ST.isWave32()) { |
768 | const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32); |
769 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SBitsetB32, DestReg: Rsrc03) |
770 | .addImm(Val: 21) |
771 | .addReg(RegNo: Rsrc03); |
772 | } |
773 | } else if (ST.isMesaGfxShader(F: Fn) || !PreloadedScratchRsrcReg) { |
774 | assert(!ST.isAmdHsaOrMesa(Fn)); |
775 | const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); |
776 | |
777 | Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); |
778 | Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); |
779 | |
780 | // Use relocations to get the pointer, and setup the other bits manually. |
781 | uint64_t Rsrc23 = TII->getScratchRsrcWords23(); |
782 | |
783 | if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) { |
784 | Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); |
785 | |
786 | if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) { |
787 | const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); |
788 | |
789 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: Mov64, DestReg: Rsrc01) |
790 | .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR()) |
791 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
792 | } else { |
793 | const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); |
794 | |
795 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
796 | auto MMO = MF.getMachineMemOperand( |
797 | PtrInfo, |
798 | F: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | |
799 | MachineMemOperand::MODereferenceable, |
800 | Size: 8, BaseAlignment: Align(4)); |
801 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: LoadDwordX2, DestReg: Rsrc01) |
802 | .addReg(RegNo: MFI->getImplicitBufferPtrUserSGPR()) |
803 | .addImm(Val: 0) // offset |
804 | .addImm(Val: 0) // cpol |
805 | .addMemOperand(MMO) |
806 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
807 | |
808 | MF.getRegInfo().addLiveIn(Reg: MFI->getImplicitBufferPtrUserSGPR()); |
809 | MBB.addLiveIn(PhysReg: MFI->getImplicitBufferPtrUserSGPR()); |
810 | } |
811 | } else { |
812 | Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); |
813 | Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); |
814 | |
815 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc0) |
816 | .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD0" ) |
817 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
818 | |
819 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc1) |
820 | .addExternalSymbol(FnName: "SCRATCH_RSRC_DWORD1" ) |
821 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
822 | } |
823 | |
824 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc2) |
825 | .addImm(Val: Rsrc23 & 0xffffffff) |
826 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
827 | |
828 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: SMovB32, DestReg: Rsrc3) |
829 | .addImm(Val: Rsrc23 >> 32) |
830 | .addReg(RegNo: ScratchRsrcReg, flags: RegState::ImplicitDefine); |
831 | } else if (ST.isAmdHsaOrMesa(Fn)) { |
832 | assert(PreloadedScratchRsrcReg); |
833 | |
834 | if (ScratchRsrcReg != PreloadedScratchRsrcReg) { |
835 | BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) |
836 | .addReg(PreloadedScratchRsrcReg, RegState::Kill); |
837 | } |
838 | } |
839 | |
840 | // Add the scratch wave offset into the scratch RSRC. |
841 | // |
842 | // We only want to update the first 48 bits, which is the base address |
843 | // pointer, without touching the adjacent 16 bits of flags. We know this add |
844 | // cannot carry-out from bit 47, otherwise the scratch allocation would be |
845 | // impossible to fit in the 48-bit global address space. |
846 | // |
847 | // TODO: Evaluate if it is better to just construct an SRD using the flat |
848 | // scratch init and some constants rather than update the one we are passed. |
849 | Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); |
850 | Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); |
851 | |
852 | // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in |
853 | // the kernel body via inreg arguments. |
854 | BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0) |
855 | .addReg(ScratchRsrcSub0) |
856 | .addReg(ScratchWaveOffsetReg) |
857 | .addReg(ScratchRsrcReg, RegState::ImplicitDefine); |
858 | auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1) |
859 | .addReg(ScratchRsrcSub1) |
860 | .addImm(0) |
861 | .addReg(ScratchRsrcReg, RegState::ImplicitDefine); |
862 | Addc->getOperand(3).setIsDead(); // Mark SCC as dead. |
863 | } |
864 | |
865 | bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { |
866 | switch (ID) { |
867 | case TargetStackID::Default: |
868 | case TargetStackID::NoAlloc: |
869 | case TargetStackID::SGPRSpill: |
870 | return true; |
871 | case TargetStackID::ScalableVector: |
872 | case TargetStackID::WasmLocal: |
873 | return false; |
874 | } |
875 | llvm_unreachable("Invalid TargetStackID::Value" ); |
876 | } |
877 | |
878 | // Activate only the inactive lanes when \p EnableInactiveLanes is true. |
879 | // Otherwise, activate all lanes. It returns the saved exec. |
880 | static Register buildScratchExecCopy(LiveRegUnits &LiveUnits, |
881 | MachineFunction &MF, |
882 | MachineBasicBlock &MBB, |
883 | MachineBasicBlock::iterator MBBI, |
884 | const DebugLoc &DL, bool IsProlog, |
885 | bool EnableInactiveLanes) { |
886 | Register ScratchExecCopy; |
887 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
888 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
889 | const SIInstrInfo *TII = ST.getInstrInfo(); |
890 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
891 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
892 | |
893 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); |
894 | |
895 | ScratchExecCopy = findScratchNonCalleeSaveRegister( |
896 | MRI, LiveUnits, RC: *TRI.getWaveMaskRegClass()); |
897 | if (!ScratchExecCopy) |
898 | report_fatal_error(reason: "failed to find free scratch register" ); |
899 | |
900 | LiveUnits.addReg(Reg: ScratchExecCopy); |
901 | |
902 | const unsigned SaveExecOpc = |
903 | ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32 |
904 | : AMDGPU::S_OR_SAVEEXEC_B32) |
905 | : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64 |
906 | : AMDGPU::S_OR_SAVEEXEC_B64); |
907 | auto SaveExec = |
908 | BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1); |
909 | SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. |
910 | |
911 | return ScratchExecCopy; |
912 | } |
913 | |
914 | void SIFrameLowering::emitCSRSpillStores( |
915 | MachineFunction &MF, MachineBasicBlock &MBB, |
916 | MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, |
917 | Register FrameReg, Register FramePtrRegScratchCopy) const { |
918 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
919 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
920 | const SIInstrInfo *TII = ST.getInstrInfo(); |
921 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
922 | |
923 | // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch |
924 | // registers. However, save all lanes of callee-saved VGPRs. Due to this, we |
925 | // might end up flipping the EXEC bits twice. |
926 | Register ScratchExecCopy; |
927 | SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; |
928 | FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs); |
929 | if (!WWMScratchRegs.empty()) |
930 | ScratchExecCopy = |
931 | buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
932 | /*IsProlog*/ true, /*EnableInactiveLanes*/ true); |
933 | |
934 | auto StoreWWMRegisters = |
935 | [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { |
936 | for (const auto &Reg : WWMRegs) { |
937 | Register VGPR = Reg.first; |
938 | int FI = Reg.second; |
939 | buildPrologSpill(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL, |
940 | SpillReg: VGPR, FI, FrameReg); |
941 | } |
942 | }; |
943 | |
944 | StoreWWMRegisters(WWMScratchRegs); |
945 | if (!WWMCalleeSavedRegs.empty()) { |
946 | if (ScratchExecCopy) { |
947 | unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
948 | BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); |
949 | } else { |
950 | ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
951 | /*IsProlog*/ true, |
952 | /*EnableInactiveLanes*/ false); |
953 | } |
954 | } |
955 | |
956 | StoreWWMRegisters(WWMCalleeSavedRegs); |
957 | if (ScratchExecCopy) { |
958 | // FIXME: Split block and make terminator. |
959 | unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
960 | BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) |
961 | .addReg(ScratchExecCopy, RegState::Kill); |
962 | LiveUnits.addReg(Reg: ScratchExecCopy); |
963 | } |
964 | |
965 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
966 | |
967 | for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { |
968 | // Special handle FP spill: |
969 | // Skip if FP is saved to a scratch SGPR, the save has already been emitted. |
970 | // Otherwise, FP has been moved to a temporary register and spill it |
971 | // instead. |
972 | Register Reg = |
973 | Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; |
974 | if (!Reg) |
975 | continue; |
976 | |
977 | PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, |
978 | LiveUnits, FrameReg); |
979 | SB.save(); |
980 | } |
981 | |
982 | // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make |
983 | // such scratch registers live throughout the function. |
984 | SmallVector<Register, 1> ScratchSGPRs; |
985 | FuncInfo->getAllScratchSGPRCopyDstRegs(Regs&: ScratchSGPRs); |
986 | if (!ScratchSGPRs.empty()) { |
987 | for (MachineBasicBlock &MBB : MF) { |
988 | for (MCPhysReg Reg : ScratchSGPRs) |
989 | MBB.addLiveIn(PhysReg: Reg); |
990 | |
991 | MBB.sortUniqueLiveIns(); |
992 | } |
993 | if (!LiveUnits.empty()) { |
994 | for (MCPhysReg Reg : ScratchSGPRs) |
995 | LiveUnits.addReg(Reg); |
996 | } |
997 | } |
998 | } |
999 | |
1000 | void SIFrameLowering::emitCSRSpillRestores( |
1001 | MachineFunction &MF, MachineBasicBlock &MBB, |
1002 | MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, |
1003 | Register FrameReg, Register FramePtrRegScratchCopy) const { |
1004 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1005 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1006 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1007 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
1008 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1009 | |
1010 | for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) { |
1011 | // Special handle FP restore: |
1012 | // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore |
1013 | // the FP value to a temporary register. The frame pointer should be |
1014 | // overwritten only at the end when all other spills are restored from |
1015 | // current frame. |
1016 | Register Reg = |
1017 | Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first; |
1018 | if (!Reg) |
1019 | continue; |
1020 | |
1021 | PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, |
1022 | LiveUnits, FrameReg); |
1023 | SB.restore(); |
1024 | } |
1025 | |
1026 | // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the |
1027 | // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to |
1028 | // this, we might end up flipping the EXEC bits twice. |
1029 | Register ScratchExecCopy; |
1030 | SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; |
1031 | FuncInfo->splitWWMSpillRegisters(MF, CalleeSavedRegs&: WWMCalleeSavedRegs, ScratchRegs&: WWMScratchRegs); |
1032 | if (!WWMScratchRegs.empty()) |
1033 | ScratchExecCopy = |
1034 | buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
1035 | /*IsProlog*/ false, /*EnableInactiveLanes*/ true); |
1036 | |
1037 | auto RestoreWWMRegisters = |
1038 | [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { |
1039 | for (const auto &Reg : WWMRegs) { |
1040 | Register VGPR = Reg.first; |
1041 | int FI = Reg.second; |
1042 | buildEpilogRestore(ST, TRI, FuncInfo: *FuncInfo, LiveUnits, MF, MBB, I: MBBI, DL, |
1043 | SpillReg: VGPR, FI, FrameReg); |
1044 | } |
1045 | }; |
1046 | |
1047 | RestoreWWMRegisters(WWMScratchRegs); |
1048 | if (!WWMCalleeSavedRegs.empty()) { |
1049 | if (ScratchExecCopy) { |
1050 | unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
1051 | BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); |
1052 | } else { |
1053 | ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, |
1054 | /*IsProlog*/ false, |
1055 | /*EnableInactiveLanes*/ false); |
1056 | } |
1057 | } |
1058 | |
1059 | RestoreWWMRegisters(WWMCalleeSavedRegs); |
1060 | if (ScratchExecCopy) { |
1061 | // FIXME: Split block and make terminator. |
1062 | unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
1063 | BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) |
1064 | .addReg(ScratchExecCopy, RegState::Kill); |
1065 | } |
1066 | } |
1067 | |
1068 | void SIFrameLowering::emitPrologue(MachineFunction &MF, |
1069 | MachineBasicBlock &MBB) const { |
1070 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1071 | if (FuncInfo->isEntryFunction()) { |
1072 | emitEntryFunctionPrologue(MF, MBB); |
1073 | return; |
1074 | } |
1075 | |
1076 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
1077 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1078 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1079 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
1080 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1081 | |
1082 | Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); |
1083 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1084 | Register BasePtrReg = |
1085 | TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); |
1086 | LiveRegUnits LiveUnits; |
1087 | |
1088 | MachineBasicBlock::iterator MBBI = MBB.begin(); |
1089 | // DebugLoc must be unknown since the first instruction with DebugLoc is used |
1090 | // to determine the end of the prologue. |
1091 | DebugLoc DL; |
1092 | |
1093 | if (FuncInfo->isChainFunction()) { |
1094 | // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but |
1095 | // are free to set one up if they need it. |
1096 | bool UseSP = requiresStackPointerReference(MF); |
1097 | if (UseSP) { |
1098 | assert(StackPtrReg != AMDGPU::SP_REG); |
1099 | |
1100 | BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg) |
1101 | .addImm(MFI.getStackSize() * getScratchScaleFactor(ST)); |
1102 | } |
1103 | } |
1104 | |
1105 | bool HasFP = false; |
1106 | bool HasBP = false; |
1107 | uint32_t NumBytes = MFI.getStackSize(); |
1108 | uint32_t RoundedSize = NumBytes; |
1109 | |
1110 | if (TRI.hasStackRealignment(MF)) |
1111 | HasFP = true; |
1112 | |
1113 | Register FramePtrRegScratchCopy; |
1114 | if (!HasFP && !hasFP(MF)) { |
1115 | // Emit the CSR spill stores with SP base register. |
1116 | emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, |
1117 | FrameReg: FuncInfo->isChainFunction() ? Register() : StackPtrReg, |
1118 | FramePtrRegScratchCopy); |
1119 | } else { |
1120 | // CSR spill stores will use FP as base register. |
1121 | Register SGPRForFPSaveRestoreCopy = |
1122 | FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg); |
1123 | |
1124 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); |
1125 | if (SGPRForFPSaveRestoreCopy) { |
1126 | // Copy FP to the scratch register now and emit the CFI entry. It avoids |
1127 | // the extra FP copy needed in the other two cases when FP is spilled to |
1128 | // memory or to a VGPR lane. |
1129 | PrologEpilogSGPRSpillBuilder SB( |
1130 | FramePtrReg, |
1131 | FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(Reg: FramePtrReg), MBB, MBBI, |
1132 | DL, TII, TRI, LiveUnits, FramePtrReg); |
1133 | SB.save(); |
1134 | LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy); |
1135 | } else { |
1136 | // Copy FP into a new scratch register so that its previous value can be |
1137 | // spilled after setting up the new frame. |
1138 | FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( |
1139 | MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass); |
1140 | if (!FramePtrRegScratchCopy) |
1141 | report_fatal_error(reason: "failed to find free scratch register" ); |
1142 | |
1143 | LiveUnits.addReg(Reg: FramePtrRegScratchCopy); |
1144 | BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy) |
1145 | .addReg(FramePtrReg); |
1146 | } |
1147 | } |
1148 | |
1149 | if (HasFP) { |
1150 | const unsigned Alignment = MFI.getMaxAlign().value(); |
1151 | |
1152 | RoundedSize += Alignment; |
1153 | if (LiveUnits.empty()) { |
1154 | LiveUnits.init(TRI); |
1155 | LiveUnits.addLiveIns(MBB); |
1156 | } |
1157 | |
1158 | // s_add_i32 s33, s32, NumBytes |
1159 | // s_and_b32 s33, s33, 0b111...0000 |
1160 | BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg) |
1161 | .addReg(StackPtrReg) |
1162 | .addImm((Alignment - 1) * getScratchScaleFactor(ST)) |
1163 | .setMIFlag(MachineInstr::FrameSetup); |
1164 | auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) |
1165 | .addReg(FramePtrReg, RegState::Kill) |
1166 | .addImm(-Alignment * getScratchScaleFactor(ST)) |
1167 | .setMIFlag(MachineInstr::FrameSetup); |
1168 | And->getOperand(3).setIsDead(); // Mark SCC as dead. |
1169 | FuncInfo->setIsStackRealigned(true); |
1170 | } else if ((HasFP = hasFP(MF))) { |
1171 | BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) |
1172 | .addReg(StackPtrReg) |
1173 | .setMIFlag(MachineInstr::FrameSetup); |
1174 | } |
1175 | |
1176 | // If FP is used, emit the CSR spills with FP base register. |
1177 | if (HasFP) { |
1178 | emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg, |
1179 | FramePtrRegScratchCopy); |
1180 | if (FramePtrRegScratchCopy) |
1181 | LiveUnits.removeReg(Reg: FramePtrRegScratchCopy); |
1182 | } |
1183 | |
1184 | // If we need a base pointer, set it up here. It's whatever the value of |
1185 | // the stack pointer is at this point. Any variable size objects will be |
1186 | // allocated after this, so we can still use the base pointer to reference |
1187 | // the incoming arguments. |
1188 | if ((HasBP = TRI.hasBasePointer(MF))) { |
1189 | BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) |
1190 | .addReg(StackPtrReg) |
1191 | .setMIFlag(MachineInstr::FrameSetup); |
1192 | } |
1193 | |
1194 | if (HasFP && RoundedSize != 0) { |
1195 | auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) |
1196 | .addReg(StackPtrReg) |
1197 | .addImm(RoundedSize * getScratchScaleFactor(ST)) |
1198 | .setMIFlag(MachineInstr::FrameSetup); |
1199 | Add->getOperand(3).setIsDead(); // Mark SCC as dead. |
1200 | } |
1201 | |
1202 | bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg); |
1203 | (void)FPSaved; |
1204 | assert((!HasFP || FPSaved) && |
1205 | "Needed to save FP but didn't save it anywhere" ); |
1206 | |
1207 | // If we allow spilling to AGPRs we may have saved FP but then spill |
1208 | // everything into AGPRs instead of the stack. |
1209 | assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) && |
1210 | "Saved FP but didn't need it" ); |
1211 | |
1212 | bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: BasePtrReg); |
1213 | (void)BPSaved; |
1214 | assert((!HasBP || BPSaved) && |
1215 | "Needed to save BP but didn't save it anywhere" ); |
1216 | |
1217 | assert((HasBP || !BPSaved) && "Saved BP but didn't need it" ); |
1218 | } |
1219 | |
1220 | void SIFrameLowering::emitEpilogue(MachineFunction &MF, |
1221 | MachineBasicBlock &MBB) const { |
1222 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1223 | if (FuncInfo->isEntryFunction()) |
1224 | return; |
1225 | |
1226 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1227 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1228 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
1229 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1230 | LiveRegUnits LiveUnits; |
1231 | // Get the insert location for the epilogue. If there were no terminators in |
1232 | // the block, get the last instruction. |
1233 | MachineBasicBlock::iterator MBBI = MBB.end(); |
1234 | DebugLoc DL; |
1235 | if (!MBB.empty()) { |
1236 | MBBI = MBB.getLastNonDebugInstr(); |
1237 | if (MBBI != MBB.end()) |
1238 | DL = MBBI->getDebugLoc(); |
1239 | |
1240 | MBBI = MBB.getFirstTerminator(); |
1241 | } |
1242 | |
1243 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1244 | uint32_t NumBytes = MFI.getStackSize(); |
1245 | uint32_t RoundedSize = FuncInfo->isStackRealigned() |
1246 | ? NumBytes + MFI.getMaxAlign().value() |
1247 | : NumBytes; |
1248 | const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); |
1249 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1250 | bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(Reg: FramePtrReg); |
1251 | |
1252 | Register FramePtrRegScratchCopy; |
1253 | Register SGPRForFPSaveRestoreCopy = |
1254 | FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg); |
1255 | if (FPSaved) { |
1256 | // CSR spill restores should use FP as base register. If |
1257 | // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP |
1258 | // into a new scratch register and copy to FP later when other registers are |
1259 | // restored from the current stack frame. |
1260 | initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); |
1261 | if (SGPRForFPSaveRestoreCopy) { |
1262 | LiveUnits.addReg(Reg: SGPRForFPSaveRestoreCopy); |
1263 | } else { |
1264 | FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( |
1265 | MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass); |
1266 | if (!FramePtrRegScratchCopy) |
1267 | report_fatal_error(reason: "failed to find free scratch register" ); |
1268 | |
1269 | LiveUnits.addReg(Reg: FramePtrRegScratchCopy); |
1270 | } |
1271 | |
1272 | emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: FramePtrReg, |
1273 | FramePtrRegScratchCopy); |
1274 | } |
1275 | |
1276 | if (RoundedSize != 0 && hasFP(MF)) { |
1277 | auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) |
1278 | .addReg(StackPtrReg) |
1279 | .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST))) |
1280 | .setMIFlag(MachineInstr::FrameDestroy); |
1281 | Add->getOperand(3).setIsDead(); // Mark SCC as dead. |
1282 | } |
1283 | |
1284 | if (FPSaved) { |
1285 | // Insert the copy to restore FP. |
1286 | Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy |
1287 | : FramePtrRegScratchCopy; |
1288 | MachineInstrBuilder MIB = |
1289 | BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) |
1290 | .addReg(SrcReg); |
1291 | if (SGPRForFPSaveRestoreCopy) |
1292 | MIB.setMIFlag(MachineInstr::FrameDestroy); |
1293 | } else { |
1294 | // Insert the CSR spill restores with SP as the base register. |
1295 | emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FrameReg: StackPtrReg, |
1296 | FramePtrRegScratchCopy); |
1297 | } |
1298 | } |
1299 | |
1300 | #ifndef NDEBUG |
1301 | static bool allSGPRSpillsAreDead(const MachineFunction &MF) { |
1302 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1303 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1304 | for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); |
1305 | I != E; ++I) { |
1306 | if (!MFI.isDeadObjectIndex(ObjectIdx: I) && |
1307 | MFI.getStackID(ObjectIdx: I) == TargetStackID::SGPRSpill && |
1308 | !FuncInfo->checkIndexInPrologEpilogSGPRSpills(FI: I)) { |
1309 | return false; |
1310 | } |
1311 | } |
1312 | |
1313 | return true; |
1314 | } |
1315 | #endif |
1316 | |
1317 | StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, |
1318 | int FI, |
1319 | Register &FrameReg) const { |
1320 | const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); |
1321 | |
1322 | FrameReg = RI->getFrameRegister(MF); |
1323 | return StackOffset::getFixed(Fixed: MF.getFrameInfo().getObjectOffset(ObjectIdx: FI)); |
1324 | } |
1325 | |
1326 | void SIFrameLowering::processFunctionBeforeFrameFinalized( |
1327 | MachineFunction &MF, |
1328 | RegScavenger *RS) const { |
1329 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
1330 | |
1331 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1332 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1333 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1334 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1335 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1336 | |
1337 | // Allocate spill slots for WWM reserved VGPRs. |
1338 | // For chain functions, we only need to do this if we have calls to |
1339 | // llvm.amdgcn.cs.chain. |
1340 | bool IsChainWithoutCalls = |
1341 | FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall(); |
1342 | if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) { |
1343 | for (Register Reg : FuncInfo->getWWMReservedRegs()) { |
1344 | const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); |
1345 | FuncInfo->allocateWWMSpill(MF, VGPR: Reg, Size: TRI->getSpillSize(*RC), |
1346 | Alignment: TRI->getSpillAlign(*RC)); |
1347 | } |
1348 | } |
1349 | |
1350 | const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() |
1351 | && EnableSpillVGPRToAGPR; |
1352 | |
1353 | if (SpillVGPRToAGPR) { |
1354 | // To track the spill frame indices handled in this pass. |
1355 | BitVector SpillFIs(MFI.getObjectIndexEnd(), false); |
1356 | BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false); |
1357 | |
1358 | bool SeenDbgInstr = false; |
1359 | |
1360 | for (MachineBasicBlock &MBB : MF) { |
1361 | for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) { |
1362 | int FrameIndex; |
1363 | if (MI.isDebugInstr()) |
1364 | SeenDbgInstr = true; |
1365 | |
1366 | if (TII->isVGPRSpill(MI)) { |
1367 | // Try to eliminate stack used by VGPR spills before frame |
1368 | // finalization. |
1369 | unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), |
1370 | AMDGPU::OpName::vaddr); |
1371 | int FI = MI.getOperand(i: FIOp).getIndex(); |
1372 | Register VReg = |
1373 | TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); |
1374 | if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, |
1375 | isAGPRtoVGPR: TRI->isAGPR(MRI, Reg: VReg))) { |
1376 | assert(RS != nullptr); |
1377 | RS->enterBasicBlockEnd(MBB); |
1378 | RS->backward(I: std::next(x: MI.getIterator())); |
1379 | TRI->eliminateFrameIndex(MI, SPAdj: 0, FIOperandNum: FIOp, RS); |
1380 | SpillFIs.set(FI); |
1381 | continue; |
1382 | } |
1383 | } else if (TII->isStoreToStackSlot(MI, FrameIndex) || |
1384 | TII->isLoadFromStackSlot(MI, FrameIndex)) |
1385 | if (!MFI.isFixedObjectIndex(ObjectIdx: FrameIndex)) |
1386 | NonVGPRSpillFIs.set(FrameIndex); |
1387 | } |
1388 | } |
1389 | |
1390 | // Stack slot coloring may assign different objects to the same stack slot. |
1391 | // If not, then the VGPR to AGPR spill slot is dead. |
1392 | for (unsigned FI : SpillFIs.set_bits()) |
1393 | if (!NonVGPRSpillFIs.test(Idx: FI)) |
1394 | FuncInfo->setVGPRToAGPRSpillDead(FI); |
1395 | |
1396 | for (MachineBasicBlock &MBB : MF) { |
1397 | for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) |
1398 | MBB.addLiveIn(PhysReg: Reg); |
1399 | |
1400 | for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) |
1401 | MBB.addLiveIn(PhysReg: Reg); |
1402 | |
1403 | MBB.sortUniqueLiveIns(); |
1404 | |
1405 | if (!SpillFIs.empty() && SeenDbgInstr) { |
1406 | // FIXME: The dead frame indices are replaced with a null register from |
1407 | // the debug value instructions. We should instead, update it with the |
1408 | // correct register value. But not sure the register value alone is |
1409 | for (MachineInstr &MI : MBB) { |
1410 | if (MI.isDebugValue() && MI.getOperand(i: 0).isFI() && |
1411 | !MFI.isFixedObjectIndex(ObjectIdx: MI.getOperand(i: 0).getIndex()) && |
1412 | SpillFIs[MI.getOperand(i: 0).getIndex()]) { |
1413 | MI.getOperand(i: 0).ChangeToRegister(Reg: Register(), isDef: false /*isDef*/); |
1414 | } |
1415 | } |
1416 | } |
1417 | } |
1418 | } |
1419 | |
1420 | // At this point we've already allocated all spilled SGPRs to VGPRs if we |
1421 | // can. Any remaining SGPR spills will go to memory, so move them back to the |
1422 | // default stack. |
1423 | bool HaveSGPRToVMemSpill = |
1424 | FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true); |
1425 | assert(allSGPRSpillsAreDead(MF) && |
1426 | "SGPR spill should have been removed in SILowerSGPRSpills" ); |
1427 | |
1428 | // FIXME: The other checks should be redundant with allStackObjectsAreDead, |
1429 | // but currently hasNonSpillStackObjects is set only from source |
1430 | // allocas. Stack temps produced from legalization are not counted currently. |
1431 | if (!allStackObjectsAreDead(MFI)) { |
1432 | assert(RS && "RegScavenger required if spilling" ); |
1433 | |
1434 | // Add an emergency spill slot |
1435 | RS->addScavengingFrameIndex(FI: FuncInfo->getScavengeFI(MFI, TRI: *TRI)); |
1436 | |
1437 | // If we are spilling SGPRs to memory with a large frame, we may need a |
1438 | // second VGPR emergency frame index. |
1439 | if (HaveSGPRToVMemSpill && |
1440 | allocateScavengingFrameIndexesNearIncomingSP(MF)) { |
1441 | RS->addScavengingFrameIndex(FI: MFI.CreateStackObject(Size: 4, Alignment: Align(4), isSpillSlot: false)); |
1442 | } |
1443 | } |
1444 | } |
1445 | |
1446 | void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced( |
1447 | MachineFunction &MF, RegScavenger *RS) const { |
1448 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1449 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1450 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1451 | SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1452 | |
1453 | if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { |
1454 | // On gfx908, we had initially reserved highest available VGPR for AGPR |
1455 | // copy. Now since we are done with RA, check if there exist an unused VGPR |
1456 | // which is lower than the eariler reserved VGPR before RA. If one exist, |
1457 | // use it for AGPR copy instead of one reserved before RA. |
1458 | Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy(); |
1459 | Register UnusedLowVGPR = |
1460 | TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); |
1461 | if (UnusedLowVGPR && (TRI->getHWRegIndex(Reg: UnusedLowVGPR) < |
1462 | TRI->getHWRegIndex(Reg: VGPRForAGPRCopy))) { |
1463 | // Reserve this newly identified VGPR (for AGPR copy) |
1464 | // reserved registers should already be frozen at this point |
1465 | // so we can avoid calling MRI.freezeReservedRegs and just use |
1466 | // MRI.reserveReg |
1467 | FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR); |
1468 | MRI.reserveReg(UnusedLowVGPR, TRI); |
1469 | } |
1470 | } |
1471 | // We initally reserved the highest available SGPR pair for long branches |
1472 | // now, after RA, we shift down to a lower unused one if one exists |
1473 | Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg(); |
1474 | Register UnusedLowSGPR = |
1475 | TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF); |
1476 | // If LongBranchReservedReg is null then we didn't find a long branch |
1477 | // and never reserved a register to begin with so there is nothing to |
1478 | // shift down. Then if UnusedLowSGPR is null, there isn't available lower |
1479 | // register to use so just keep the original one we set. |
1480 | if (LongBranchReservedReg && UnusedLowSGPR) { |
1481 | FuncInfo->setLongBranchReservedReg(UnusedLowSGPR); |
1482 | MRI.reserveReg(UnusedLowSGPR, TRI); |
1483 | } |
1484 | } |
1485 | |
1486 | // The special SGPR spills like the one needed for FP, BP or any reserved |
1487 | // registers delayed until frame lowering. |
1488 | void SIFrameLowering::determinePrologEpilogSGPRSaves( |
1489 | MachineFunction &MF, BitVector &SavedVGPRs, |
1490 | bool NeedExecCopyReservedReg) const { |
1491 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
1492 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
1493 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1494 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1495 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1496 | LiveRegUnits LiveUnits; |
1497 | LiveUnits.init(*TRI); |
1498 | // Initially mark callee saved registers as used so we will not choose them |
1499 | // while looking for scratch SGPRs. |
1500 | const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); |
1501 | for (unsigned I = 0; CSRegs[I]; ++I) |
1502 | LiveUnits.addReg(Reg: CSRegs[I]); |
1503 | |
1504 | const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass(); |
1505 | |
1506 | Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy(); |
1507 | if (NeedExecCopyReservedReg || |
1508 | (ReservedRegForExecCopy && |
1509 | MRI.isPhysRegUsed(PhysReg: ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) { |
1510 | MRI.reserveReg(ReservedRegForExecCopy, TRI); |
1511 | Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC); |
1512 | if (UnusedScratchReg) { |
1513 | // If found any unused scratch SGPR, reserve the register itself for Exec |
1514 | // copy and there is no need for any spills in that case. |
1515 | MFI->setSGPRForEXECCopy(UnusedScratchReg); |
1516 | MRI.replaceRegWith(FromReg: ReservedRegForExecCopy, ToReg: UnusedScratchReg); |
1517 | LiveUnits.addReg(Reg: UnusedScratchReg); |
1518 | } else { |
1519 | // Needs spill. |
1520 | assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) && |
1521 | "Re-reserving spill slot for EXEC copy register" ); |
1522 | getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: ReservedRegForExecCopy, RC, |
1523 | /*IncludeScratchCopy=*/false); |
1524 | } |
1525 | } else if (ReservedRegForExecCopy) { |
1526 | // Reset it at this point. There are no whole-wave copies and spills |
1527 | // encountered. |
1528 | MFI->setSGPRForEXECCopy(AMDGPU::NoRegister); |
1529 | } |
1530 | |
1531 | // hasFP only knows about stack objects that already exist. We're now |
1532 | // determining the stack slots that will be created, so we have to predict |
1533 | // them. Stack objects force FP usage with calls. |
1534 | // |
1535 | // Note a new VGPR CSR may be introduced if one is used for the spill, but we |
1536 | // don't want to report it here. |
1537 | // |
1538 | // FIXME: Is this really hasReservedCallFrame? |
1539 | const bool WillHaveFP = |
1540 | FrameInfo.hasCalls() && |
1541 | (SavedVGPRs.any() || !allStackObjectsAreDead(MFI: FrameInfo)); |
1542 | |
1543 | if (WillHaveFP || hasFP(MF)) { |
1544 | Register FramePtrReg = MFI->getFrameOffsetReg(); |
1545 | assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) && |
1546 | "Re-reserving spill slot for FP" ); |
1547 | getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: FramePtrReg); |
1548 | } |
1549 | |
1550 | if (TRI->hasBasePointer(MF)) { |
1551 | Register BasePtrReg = TRI->getBaseRegister(); |
1552 | assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) && |
1553 | "Re-reserving spill slot for BP" ); |
1554 | getVGPRSpillLaneOrTempRegister(MF, LiveUnits, SGPR: BasePtrReg); |
1555 | } |
1556 | } |
1557 | |
1558 | // Only report VGPRs to generic code. |
1559 | void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, |
1560 | BitVector &SavedVGPRs, |
1561 | RegScavenger *RS) const { |
1562 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1563 | |
1564 | // If this is a function with the amdgpu_cs_chain[_preserve] calling |
1565 | // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then |
1566 | // we don't need to save and restore anything. |
1567 | if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall()) |
1568 | return; |
1569 | |
1570 | MFI->shiftSpillPhysVGPRsToLowestRange(MF); |
1571 | |
1572 | TargetFrameLowering::determineCalleeSaves(MF, SavedRegs&: SavedVGPRs, RS); |
1573 | if (MFI->isEntryFunction()) |
1574 | return; |
1575 | |
1576 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1577 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1578 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1579 | bool NeedExecCopyReservedReg = false; |
1580 | |
1581 | MachineInstr *ReturnMI = nullptr; |
1582 | for (MachineBasicBlock &MBB : MF) { |
1583 | for (MachineInstr &MI : MBB) { |
1584 | // WRITELANE instructions used for SGPR spills can overwrite the inactive |
1585 | // lanes of VGPRs and callee must spill and restore them even if they are |
1586 | // marked Caller-saved. |
1587 | |
1588 | // TODO: Handle this elsewhere at an early point. Walking through all MBBs |
1589 | // here would be a bad heuristic. A better way should be by calling |
1590 | // allocateWWMSpill during the regalloc pipeline whenever a physical |
1591 | // register is allocated for the intended virtual registers. |
1592 | if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) |
1593 | MFI->allocateWWMSpill(MF, VGPR: MI.getOperand(i: 0).getReg()); |
1594 | else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR) |
1595 | MFI->allocateWWMSpill(MF, VGPR: MI.getOperand(i: 1).getReg()); |
1596 | else if (TII->isWWMRegSpillOpcode(Opcode: MI.getOpcode())) |
1597 | NeedExecCopyReservedReg = true; |
1598 | else if (MI.getOpcode() == AMDGPU::SI_RETURN || |
1599 | MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || |
1600 | (MFI->isChainFunction() && |
1601 | TII->isChainCallOpcode(MI.getOpcode()))) { |
1602 | // We expect all return to be the same size. |
1603 | assert(!ReturnMI || |
1604 | (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) == |
1605 | count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); }))); |
1606 | ReturnMI = &MI; |
1607 | } |
1608 | } |
1609 | } |
1610 | |
1611 | // Remove any VGPRs used in the return value because these do not need to be saved. |
1612 | // This prevents CSR restore from clobbering return VGPRs. |
1613 | if (ReturnMI) { |
1614 | for (auto &Op : ReturnMI->operands()) { |
1615 | if (Op.isReg()) |
1616 | SavedVGPRs.reset(Idx: Op.getReg()); |
1617 | } |
1618 | } |
1619 | |
1620 | // Ignore the SGPRs the default implementation found. |
1621 | SavedVGPRs.clearBitsNotInMask(Mask: TRI->getAllVectorRegMask()); |
1622 | |
1623 | // Do not save AGPRs prior to GFX90A because there was no easy way to do so. |
1624 | // In gfx908 there was do AGPR loads and stores and thus spilling also |
1625 | // require a temporary VGPR. |
1626 | if (!ST.hasGFX90AInsts()) |
1627 | SavedVGPRs.clearBitsInMask(Mask: TRI->getAllAGPRRegMask()); |
1628 | |
1629 | determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg); |
1630 | |
1631 | // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't |
1632 | // allow the default insertion to handle them. |
1633 | for (auto &Reg : MFI->getWWMSpills()) |
1634 | SavedVGPRs.reset(Idx: Reg.first); |
1635 | |
1636 | // Mark all lane VGPRs as BB LiveIns. |
1637 | for (MachineBasicBlock &MBB : MF) { |
1638 | for (auto &Reg : MFI->getWWMSpills()) |
1639 | MBB.addLiveIn(PhysReg: Reg.first); |
1640 | |
1641 | MBB.sortUniqueLiveIns(); |
1642 | } |
1643 | } |
1644 | |
1645 | void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, |
1646 | BitVector &SavedRegs, |
1647 | RegScavenger *RS) const { |
1648 | TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); |
1649 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1650 | if (MFI->isEntryFunction()) |
1651 | return; |
1652 | |
1653 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1654 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1655 | |
1656 | // The SP is specifically managed and we don't want extra spills of it. |
1657 | SavedRegs.reset(Idx: MFI->getStackPtrOffsetReg()); |
1658 | |
1659 | const BitVector AllSavedRegs = SavedRegs; |
1660 | SavedRegs.clearBitsInMask(Mask: TRI->getAllVectorRegMask()); |
1661 | |
1662 | // We have to anticipate introducing CSR VGPR spills or spill of caller |
1663 | // save VGPR reserved for SGPR spills as we now always create stack entry |
1664 | // for it, if we don't have any stack objects already, since we require a FP |
1665 | // if there is a call and stack. We will allocate a VGPR for SGPR spills if |
1666 | // there are any SGPR spills. Whether they are CSR spills or otherwise. |
1667 | MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
1668 | const bool WillHaveFP = |
1669 | FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs()); |
1670 | |
1671 | // FP will be specially managed like SP. |
1672 | if (WillHaveFP || hasFP(MF)) |
1673 | SavedRegs.reset(Idx: MFI->getFrameOffsetReg()); |
1674 | |
1675 | // Return address use with return instruction is hidden through the SI_RETURN |
1676 | // pseudo. Given that and since the IPRA computes actual register usage and |
1677 | // does not use CSR list, the clobbering of return address by function calls |
1678 | // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register |
1679 | // usage collection. This will ensure save/restore of return address happens |
1680 | // in those scenarios. |
1681 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1682 | Register RetAddrReg = TRI->getReturnAddressReg(MF); |
1683 | if (!MFI->isEntryFunction() && |
1684 | (FrameInfo.hasCalls() || MRI.isPhysRegModified(PhysReg: RetAddrReg))) { |
1685 | SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0)); |
1686 | SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1)); |
1687 | } |
1688 | } |
1689 | |
1690 | bool SIFrameLowering::assignCalleeSavedSpillSlots( |
1691 | MachineFunction &MF, const TargetRegisterInfo *TRI, |
1692 | std::vector<CalleeSavedInfo> &CSI) const { |
1693 | if (CSI.empty()) |
1694 | return true; // Early exit if no callee saved registers are modified! |
1695 | |
1696 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
1697 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1698 | const SIRegisterInfo *RI = ST.getRegisterInfo(); |
1699 | Register FramePtrReg = FuncInfo->getFrameOffsetReg(); |
1700 | Register BasePtrReg = RI->getBaseRegister(); |
1701 | Register SGPRForFPSaveRestoreCopy = |
1702 | FuncInfo->getScratchSGPRCopyDstReg(Reg: FramePtrReg); |
1703 | Register SGPRForBPSaveRestoreCopy = |
1704 | FuncInfo->getScratchSGPRCopyDstReg(Reg: BasePtrReg); |
1705 | if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy) |
1706 | return false; |
1707 | |
1708 | unsigned NumModifiedRegs = 0; |
1709 | |
1710 | if (SGPRForFPSaveRestoreCopy) |
1711 | NumModifiedRegs++; |
1712 | if (SGPRForBPSaveRestoreCopy) |
1713 | NumModifiedRegs++; |
1714 | |
1715 | for (auto &CS : CSI) { |
1716 | if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) { |
1717 | CS.setDstReg(SGPRForFPSaveRestoreCopy); |
1718 | if (--NumModifiedRegs) |
1719 | break; |
1720 | } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) { |
1721 | CS.setDstReg(SGPRForBPSaveRestoreCopy); |
1722 | if (--NumModifiedRegs) |
1723 | break; |
1724 | } |
1725 | } |
1726 | |
1727 | return false; |
1728 | } |
1729 | |
1730 | bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( |
1731 | const MachineFunction &MF) const { |
1732 | |
1733 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1734 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1735 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1736 | uint64_t EstStackSize = MFI.estimateStackSize(MF); |
1737 | uint64_t MaxOffset = EstStackSize - 1; |
1738 | |
1739 | // We need the emergency stack slots to be allocated in range of the |
1740 | // MUBUF/flat scratch immediate offset from the base register, so assign these |
1741 | // first at the incoming SP position. |
1742 | // |
1743 | // TODO: We could try sorting the objects to find a hole in the first bytes |
1744 | // rather than allocating as close to possible. This could save a lot of space |
1745 | // on frames with alignment requirements. |
1746 | if (ST.enableFlatScratch()) { |
1747 | if (TII->isLegalFLATOffset(Offset: MaxOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, |
1748 | FlatVariant: SIInstrFlags::FlatScratch)) |
1749 | return false; |
1750 | } else { |
1751 | if (TII->isLegalMUBUFImmOffset(Imm: MaxOffset)) |
1752 | return false; |
1753 | } |
1754 | |
1755 | return true; |
1756 | } |
1757 | |
1758 | MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( |
1759 | MachineFunction &MF, |
1760 | MachineBasicBlock &MBB, |
1761 | MachineBasicBlock::iterator I) const { |
1762 | int64_t Amount = I->getOperand(i: 0).getImm(); |
1763 | if (Amount == 0) |
1764 | return MBB.erase(I); |
1765 | |
1766 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1767 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1768 | const DebugLoc &DL = I->getDebugLoc(); |
1769 | unsigned Opc = I->getOpcode(); |
1770 | bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); |
1771 | uint64_t CalleePopAmount = IsDestroy ? I->getOperand(i: 1).getImm() : 0; |
1772 | |
1773 | if (!hasReservedCallFrame(MF)) { |
1774 | Amount = alignTo(Size: Amount, A: getStackAlign()); |
1775 | assert(isUInt<32>(Amount) && "exceeded stack address space size" ); |
1776 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1777 | Register SPReg = MFI->getStackPtrOffsetReg(); |
1778 | |
1779 | Amount *= getScratchScaleFactor(ST); |
1780 | if (IsDestroy) |
1781 | Amount = -Amount; |
1782 | auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg) |
1783 | .addReg(SPReg) |
1784 | .addImm(Amount); |
1785 | Add->getOperand(3).setIsDead(); // Mark SCC as dead. |
1786 | } else if (CalleePopAmount != 0) { |
1787 | llvm_unreachable("is this used?" ); |
1788 | } |
1789 | |
1790 | return MBB.erase(I); |
1791 | } |
1792 | |
1793 | /// Returns true if the frame will require a reference to the stack pointer. |
1794 | /// |
1795 | /// This is the set of conditions common to setting up the stack pointer in a |
1796 | /// kernel, and for using a frame pointer in a callable function. |
1797 | /// |
1798 | /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm |
1799 | /// references SP. |
1800 | static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) { |
1801 | return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint(); |
1802 | } |
1803 | |
1804 | // The FP for kernels is always known 0, so we never really need to setup an |
1805 | // explicit register for it. However, DisableFramePointerElim will force us to |
1806 | // use a register for it. |
1807 | bool SIFrameLowering::hasFP(const MachineFunction &MF) const { |
1808 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1809 | |
1810 | // For entry & chain functions we can use an immediate offset in most cases, |
1811 | // so the presence of calls doesn't imply we need a distinct frame pointer. |
1812 | if (MFI.hasCalls() && |
1813 | !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && |
1814 | !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) { |
1815 | // All offsets are unsigned, so need to be addressed in the same direction |
1816 | // as stack growth. |
1817 | |
1818 | // FIXME: This function is pretty broken, since it can be called before the |
1819 | // frame layout is determined or CSR spills are inserted. |
1820 | return MFI.getStackSize() != 0; |
1821 | } |
1822 | |
1823 | return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || |
1824 | MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( |
1825 | MF) || |
1826 | MF.getTarget().Options.DisableFramePointerElim(MF); |
1827 | } |
1828 | |
1829 | // This is essentially a reduced version of hasFP for entry functions. Since the |
1830 | // stack pointer is known 0 on entry to kernels, we never really need an FP |
1831 | // register. We may need to initialize the stack pointer depending on the frame |
1832 | // properties, which logically overlaps many of the cases where an ordinary |
1833 | // function would require an FP. |
1834 | // Also used for chain functions. While not technically entry functions, chain |
1835 | // functions may need to set up a stack pointer in some situations. |
1836 | bool SIFrameLowering::requiresStackPointerReference( |
1837 | const MachineFunction &MF) const { |
1838 | // Callable functions always require a stack pointer reference. |
1839 | assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() || |
1840 | MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) && |
1841 | "only expected to call this for entry points and chain functions" ); |
1842 | |
1843 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1844 | |
1845 | // Entry points ordinarily don't need to initialize SP. We have to set it up |
1846 | // for callees if there are any. Also note tail calls are impossible/don't |
1847 | // make any sense for kernels. |
1848 | if (MFI.hasCalls()) |
1849 | return true; |
1850 | |
1851 | // We still need to initialize the SP if we're doing anything weird that |
1852 | // references the SP, like variable sized stack objects. |
1853 | return frameTriviallyRequiresSP(MFI); |
1854 | } |
1855 | |