1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPURegisterBankInfo.h"
16#include "GCNSubtarget.h"
17#include "MCTargetDesc/AMDGPUInstPrinter.h"
18#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19#include "SIMachineFunctionInfo.h"
20#include "SIRegisterInfo.h"
21#include "llvm/CodeGen/LiveIntervals.h"
22#include "llvm/CodeGen/LiveRegUnits.h"
23#include "llvm/CodeGen/MachineDominators.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/CodeGen/RegisterScavenging.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
32static cl::opt<bool> EnableSpillSGPRToVGPR(
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
35 cl::ReallyHidden,
36 cl::init(true));
37
38std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
39std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40
// Translates a sub-register width, expressed in DWORDs, into a row of
// SubRegFromChannelTable. Entries are stored biased by one so that 0 can mean
// "no row for this width"; e.g. 8 DWORDs (256-bit) maps to 8, i.e. row 7.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    /* 0 DWORDs      */ 0,
    /* 1 .. 8 DWORDs */ 1, 2, 3, 4, 5, 6, 7, 8,
    /* 9 .. 15 DWORDs*/ 0, 0, 0, 0, 0, 0, 0,
    /* 16 DWORDs     */ 9};
47
48namespace llvm {
49
50// A temporary struct to spill SGPRs.
51// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
52// just v_writelane and v_readlane.
53//
54// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
55// is saved to scratch (or the other way around for loads).
56// For this, a VGPR is required where the needed lanes can be clobbered. The
57// RegScavenger can provide a VGPR where currently active lanes can be
58// clobbered, but we still need to save inactive lanes.
59// The high-level steps are:
60// - Try to scavenge SGPR(s) to save exec
61// - Try to scavenge VGPR
62// - Save needed, all or inactive lanes of a TmpVGPR
63// - Spill/Restore SGPRs using TmpVGPR
64// - Restore TmpVGPR
65//
66// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
67// cannot scavenge temporary SGPRs to save exec, we use the following code:
68// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
69// s_not exec, exec
70// buffer_store_dword TmpVGPR ; save inactive lanes
71// s_not exec, exec
72struct SGPRSpillBuilder {
73 struct PerVGPRData {
74 unsigned PerVGPR;
75 unsigned NumVGPRs;
76 int64_t VGPRLanes;
77 };
78
79 // The SGPR to save
80 Register SuperReg;
81 MachineBasicBlock::iterator MI;
82 ArrayRef<int16_t> SplitParts;
83 unsigned NumSubRegs;
84 bool IsKill;
85 const DebugLoc &DL;
86
87 /* When spilling to stack */
88 // The SGPRs are written into this VGPR, which is then written to scratch
89 // (or vice versa for loads).
90 Register TmpVGPR = AMDGPU::NoRegister;
91 // Temporary spill slot to save TmpVGPR to.
92 int TmpVGPRIndex = 0;
93 // If TmpVGPR is live before the spill or if it is scavenged.
94 bool TmpVGPRLive = false;
95 // Scavenged SGPR to save EXEC.
96 Register SavedExecReg = AMDGPU::NoRegister;
97 // Stack index to write the SGPRs to.
98 int Index;
99 unsigned EltSize = 4;
100
101 RegScavenger *RS;
102 MachineBasicBlock *MBB;
103 MachineFunction &MF;
104 SIMachineFunctionInfo &MFI;
105 const SIInstrInfo &TII;
106 const SIRegisterInfo &TRI;
107 bool IsWave32;
108 Register ExecReg;
109 unsigned MovOpc;
110 unsigned NotOpc;
111
112 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
113 bool IsWave32, MachineBasicBlock::iterator MI, int Index,
114 RegScavenger *RS)
115 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(i: 0).getReg(),
116 MI->getOperand(i: 0).isKill(), Index, RS) {}
117
118 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
119 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
120 bool IsKill, int Index, RegScavenger *RS)
121 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
122 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
123 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
124 IsWave32(IsWave32) {
125 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
126 SplitParts = TRI.getRegSplitParts(RC, EltSize);
127 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
128
129 if (IsWave32) {
130 ExecReg = AMDGPU::EXEC_LO;
131 MovOpc = AMDGPU::S_MOV_B32;
132 NotOpc = AMDGPU::S_NOT_B32;
133 } else {
134 ExecReg = AMDGPU::EXEC;
135 MovOpc = AMDGPU::S_MOV_B64;
136 NotOpc = AMDGPU::S_NOT_B64;
137 }
138
139 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
140 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
141 SuperReg != AMDGPU::EXEC && "exec should never spill");
142 }
143
144 PerVGPRData getPerVGPRData() {
145 PerVGPRData Data;
146 Data.PerVGPR = IsWave32 ? 32 : 64;
147 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
148 Data.VGPRLanes = (1LL << std::min(a: Data.PerVGPR, b: NumSubRegs)) - 1LL;
149 return Data;
150 }
151
152 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
153 // free.
154 // Writes these instructions if an SGPR can be scavenged:
155 // s_mov_b64 s[6:7], exec ; Save exec
156 // s_mov_b64 exec, 3 ; Wanted lanemask
157 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
158 //
159 // Writes these instructions if no SGPR can be scavenged:
160 // buffer_store_dword v0 ; Only if no free VGPR was found
161 // s_not_b64 exec, exec
162 // buffer_store_dword v0 ; Save inactive lanes
163 // ; exec stays inverted, it is flipped back in
164 // ; restore.
165 void prepare() {
166 // Scavenged temporary VGPR to use. It must be scavenged once for any number
167 // of spilled subregs.
168 // FIXME: The liveness analysis is limited and does not tell if a register
169 // is in use in lanes that are currently inactive. We can never be sure if
170 // a register as actually in use in another lane, so we need to save all
171 // used lanes of the chosen VGPR.
172 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
173 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::RC: VGPR_32RegClass, To: MI, RestoreAfter: false,
174 SPAdj: 0, AllowSpill: false);
175
176 // Reserve temporary stack slot
177 TmpVGPRIndex = MFI.getScavengeFI(MFI&: MF.getFrameInfo(), TRI);
178 if (TmpVGPR) {
179 // Found a register that is dead in the currently active lanes, we only
180 // need to spill inactive lanes.
181 TmpVGPRLive = false;
182 } else {
183 // Pick v0 because it doesn't make a difference.
184 TmpVGPR = AMDGPU::VGPR0;
185 TmpVGPRLive = true;
186 }
187
188 if (TmpVGPRLive) {
189 // We need to inform the scavenger that this index is already in use until
190 // we're done with the custom emergency spill.
191 RS->assignRegToScavengingIndex(FI: TmpVGPRIndex, Reg: TmpVGPR);
192 }
193
194 // We may end up recursively calling the scavenger, and don't want to re-use
195 // the same register.
196 RS->setRegUsed(Reg: TmpVGPR);
197
198 // Try to scavenge SGPRs to save exec
199 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
200 const TargetRegisterClass &RC =
201 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
202 RS->setRegUsed(Reg: SuperReg);
203 SavedExecReg = RS->scavengeRegisterBackwards(RC, To: MI, RestoreAfter: false, SPAdj: 0, AllowSpill: false);
204
205 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
206
207 if (SavedExecReg) {
208 RS->setRegUsed(Reg: SavedExecReg);
209 // Set exec to needed lanes
210 BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
211 auto I =
212 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
213 if (!TmpVGPRLive)
214 I.addReg(TmpVGPR, RegState::ImplicitDefine);
215 // Spill needed lanes
216 TRI.buildVGPRSpillLoadStore(SB&: *this, Index: TmpVGPRIndex, Offset: 0, /*IsLoad*/ false);
217 } else {
218 // The modify and restore of exec clobber SCC, which we would have to save
219 // and restore. FIXME: We probably would need to reserve a register for
220 // this.
221 if (RS->isRegUsed(Reg: AMDGPU::SCC))
222 MI->emitError(Msg: "unhandled SGPR spill to memory");
223
224 // Spill active lanes
225 if (TmpVGPRLive)
226 TRI.buildVGPRSpillLoadStore(SB&: *this, Index: TmpVGPRIndex, Offset: 0, /*IsLoad*/ false,
227 /*IsKill*/ false);
228 // Spill inactive lanes
229 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
230 if (!TmpVGPRLive)
231 I.addReg(TmpVGPR, RegState::ImplicitDefine);
232 I->getOperand(2).setIsDead(); // Mark SCC as dead.
233 TRI.buildVGPRSpillLoadStore(SB&: *this, Index: TmpVGPRIndex, Offset: 0, /*IsLoad*/ false);
234 }
235 }
236
237 // Writes these instructions if an SGPR can be scavenged:
238 // buffer_load_dword v1 ; Write scavenged VGPR to emergency slot
239 // s_waitcnt vmcnt(0) ; If a free VGPR was found
240 // s_mov_b64 exec, s[6:7] ; Save exec
241 //
242 // Writes these instructions if no SGPR can be scavenged:
243 // buffer_load_dword v0 ; Restore inactive lanes
244 // s_waitcnt vmcnt(0) ; If a free VGPR was found
245 // s_not_b64 exec, exec
246 // buffer_load_dword v0 ; Only if no free VGPR was found
247 void restore() {
248 if (SavedExecReg) {
249 // Restore used lanes
250 TRI.buildVGPRSpillLoadStore(SB&: *this, Index: TmpVGPRIndex, Offset: 0, /*IsLoad*/ true,
251 /*IsKill*/ false);
252 // Restore exec
253 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
254 .addReg(SavedExecReg, RegState::Kill);
255 // Add an implicit use of the load so it is not dead.
256 // FIXME This inserts an unnecessary waitcnt
257 if (!TmpVGPRLive) {
258 I.addReg(TmpVGPR, RegState::ImplicitKill);
259 }
260 } else {
261 // Restore inactive lanes
262 TRI.buildVGPRSpillLoadStore(SB&: *this, Index: TmpVGPRIndex, Offset: 0, /*IsLoad*/ true,
263 /*IsKill*/ false);
264 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
265 if (!TmpVGPRLive)
266 I.addReg(TmpVGPR, RegState::ImplicitKill);
267 I->getOperand(2).setIsDead(); // Mark SCC as dead.
268
269 // Restore active lanes
270 if (TmpVGPRLive)
271 TRI.buildVGPRSpillLoadStore(SB&: *this, Index: TmpVGPRIndex, Offset: 0, /*IsLoad*/ true);
272 }
273
274 // Inform the scavenger where we're releasing our custom scavenged register.
275 if (TmpVGPRLive) {
276 MachineBasicBlock::iterator RestorePt = std::prev(x: MI);
277 RS->assignRegToScavengingIndex(FI: TmpVGPRIndex, Reg: TmpVGPR, Restore: &*RestorePt);
278 }
279 }
280
281 // Write TmpVGPR to memory or read TmpVGPR from memory.
282 // Either using a single buffer_load/store if exec is set to the needed mask
283 // or using
284 // buffer_load
285 // s_not exec, exec
286 // buffer_load
287 // s_not exec, exec
288 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
289 if (SavedExecReg) {
290 // Spill needed lanes
291 TRI.buildVGPRSpillLoadStore(SB&: *this, Index, Offset, IsLoad);
292 } else {
293 // The modify and restore of exec clobber SCC, which we would have to save
294 // and restore. FIXME: We probably would need to reserve a register for
295 // this.
296 if (RS->isRegUsed(Reg: AMDGPU::SCC))
297 MI->emitError(Msg: "unhandled SGPR spill to memory");
298
299 // Spill active lanes
300 TRI.buildVGPRSpillLoadStore(SB&: *this, Index, Offset, IsLoad,
301 /*IsKill*/ false);
302 // Spill inactive lanes
303 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
304 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
305 TRI.buildVGPRSpillLoadStore(SB&: *this, Index, Offset, IsLoad);
306 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
307 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
308 }
309 }
310
311 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
312 assert(MBB->getParent() == &MF);
313 MI = NewMI;
314 MBB = NewMBB;
315 }
316};
317
318} // namespace llvm
319
320SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
321 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
322 ST.getAMDGPUDwarfFlavour()),
323 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
324
325 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
326 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
327 (getSubRegIndexLaneMask(AMDGPU::lo16) |
328 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
329 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
330 "getNumCoveredRegs() will not work with generated subreg masks!");
331
332 RegPressureIgnoredUnits.resize(N: getNumRegUnits());
333 RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::Val: M0)).begin());
334 for (auto Reg : AMDGPU::VGPR_16RegClass) {
335 if (AMDGPU::isHi(Reg, *this))
336 RegPressureIgnoredUnits.set(*regunits(Reg).begin());
337 }
338
339 // HACK: Until this is fully tablegen'd.
340 static llvm::once_flag InitializeRegSplitPartsFlag;
341
342 static auto InitializeRegSplitPartsOnce = [this]() {
343 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
344 unsigned Size = getSubRegIdxSize(Idx);
345 if (Size & 31)
346 continue;
347 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
348 unsigned Pos = getSubRegIdxOffset(Idx);
349 if (Pos % Size)
350 continue;
351 Pos /= Size;
352 if (Vec.empty()) {
353 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
354 Vec.resize(new_size: MaxNumParts);
355 }
356 Vec[Pos] = Idx;
357 }
358 };
359
360 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
361
362 static auto InitializeSubRegFromChannelTableOnce = [this]() {
363 for (auto &Row : SubRegFromChannelTable)
364 Row.fill(u: AMDGPU::NoSubRegister);
365 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
366 unsigned Width = getSubRegIdxSize(Idx) / 32;
367 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
368 assert(Width < SubRegFromChannelTableWidthMap.size());
369 Width = SubRegFromChannelTableWidthMap[Width];
370 if (Width == 0)
371 continue;
372 unsigned TableIdx = Width - 1;
373 assert(TableIdx < SubRegFromChannelTable.size());
374 assert(Offset < SubRegFromChannelTable[TableIdx].size());
375 SubRegFromChannelTable[TableIdx][Offset] = Idx;
376 }
377 };
378
379 llvm::call_once(flag&: InitializeRegSplitPartsFlag, F&: InitializeRegSplitPartsOnce);
380 llvm::call_once(flag&: InitializeSubRegFromChannelTableFlag,
381 F&: InitializeSubRegFromChannelTableOnce);
382}
383
384void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
385 MCRegister Reg) const {
386 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
387 Reserved.set(*R);
388}
389
390// Forced to be here by one .inc
391const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
392 const MachineFunction *MF) const {
393 CallingConv::ID CC = MF->getFunction().getCallingConv();
394 switch (CC) {
395 case CallingConv::C:
396 case CallingConv::Fast:
397 case CallingConv::Cold:
398 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
399 : CSR_AMDGPU_SaveList;
400 case CallingConv::AMDGPU_Gfx:
401 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
402 : CSR_AMDGPU_SI_Gfx_SaveList;
403 case CallingConv::AMDGPU_CS_ChainPreserve:
404 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
405 default: {
406 // Dummy to not crash RegisterClassInfo.
407 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
408 return &NoCalleeSavedReg;
409 }
410 }
411}
412
413const MCPhysReg *
414SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
415 return nullptr;
416}
417
418const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
419 CallingConv::ID CC) const {
420 switch (CC) {
421 case CallingConv::C:
422 case CallingConv::Fast:
423 case CallingConv::Cold:
424 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
425 : CSR_AMDGPU_RegMask;
426 case CallingConv::AMDGPU_Gfx:
427 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
428 : CSR_AMDGPU_SI_Gfx_RegMask;
429 case CallingConv::AMDGPU_CS_Chain:
430 case CallingConv::AMDGPU_CS_ChainPreserve:
431 // Calls to these functions never return, so we can pretend everything is
432 // preserved.
433 return AMDGPU_AllVGPRs_RegMask;
434 default:
435 return nullptr;
436 }
437}
438
439const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
440 return CSR_AMDGPU_NoRegs_RegMask;
441}
442
443bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
444 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
445}
446
447const TargetRegisterClass *
448SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
449 const MachineFunction &MF) const {
450 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
451 // equivalent AV class. If used one, the verifier will crash after
452 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
453 // until Instruction selection.
454 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
455 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
456 return &AMDGPU::AV_32RegClass;
457 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
458 return &AMDGPU::AV_64RegClass;
459 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
460 RC == &AMDGPU::AReg_64_Align2RegClass)
461 return &AMDGPU::AV_64_Align2RegClass;
462 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
463 return &AMDGPU::AV_96RegClass;
464 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
465 RC == &AMDGPU::AReg_96_Align2RegClass)
466 return &AMDGPU::AV_96_Align2RegClass;
467 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
468 return &AMDGPU::AV_128RegClass;
469 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
470 RC == &AMDGPU::AReg_128_Align2RegClass)
471 return &AMDGPU::AV_128_Align2RegClass;
472 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
473 return &AMDGPU::AV_160RegClass;
474 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
475 RC == &AMDGPU::AReg_160_Align2RegClass)
476 return &AMDGPU::AV_160_Align2RegClass;
477 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
478 return &AMDGPU::AV_192RegClass;
479 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
480 RC == &AMDGPU::AReg_192_Align2RegClass)
481 return &AMDGPU::AV_192_Align2RegClass;
482 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
483 return &AMDGPU::AV_256RegClass;
484 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
485 RC == &AMDGPU::AReg_256_Align2RegClass)
486 return &AMDGPU::AV_256_Align2RegClass;
487 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
488 return &AMDGPU::AV_512RegClass;
489 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
490 RC == &AMDGPU::AReg_512_Align2RegClass)
491 return &AMDGPU::AV_512_Align2RegClass;
492 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
493 return &AMDGPU::AV_1024RegClass;
494 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
495 RC == &AMDGPU::AReg_1024_Align2RegClass)
496 return &AMDGPU::AV_1024_Align2RegClass;
497 }
498
499 return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
500}
501
502Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
503 const SIFrameLowering *TFI = ST.getFrameLowering();
504 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
505 // During ISel lowering we always reserve the stack pointer in entry and chain
506 // functions, but never actually want to reference it when accessing our own
507 // frame. If we need a frame pointer we use it, but otherwise we can just use
508 // an immediate "0" which we represent by returning NoRegister.
509 if (FuncInfo->isBottomOfStack()) {
510 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
511 }
512 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
513 : FuncInfo->getStackPtrOffsetReg();
514}
515
516bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
517 // When we need stack realignment, we can't reference off of the
518 // stack pointer, so we reserve a base pointer.
519 const MachineFrameInfo &MFI = MF.getFrameInfo();
520 return MFI.getNumFixedObjects() && shouldRealignStack(MF);
521}
522
523Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
524
525const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
526 return AMDGPU_AllVGPRs_RegMask;
527}
528
529const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
530 return AMDGPU_AllAGPRs_RegMask;
531}
532
533const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
534 return AMDGPU_AllVectorRegs_RegMask;
535}
536
537const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
538 return AMDGPU_AllAllocatableSRegs_RegMask;
539}
540
541unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
542 unsigned NumRegs) {
543 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
544 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
545 assert(NumRegIndex && "Not implemented");
546 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
547 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
548}
549
550MCRegister
551SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
552 const unsigned Align,
553 const TargetRegisterClass *RC) const {
554 unsigned BaseIdx = alignDown(Value: ST.getMaxNumSGPRs(MF), Align) - Align;
555 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
556 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
557}
558
559MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
560 const MachineFunction &MF) const {
561 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
562}
563
564BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
565 BitVector Reserved(getNumRegs());
566 Reserved.set(AMDGPU::MODE);
567
568 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
569
570 // Reserve special purpose registers.
571 //
572 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
573 // this seems likely to result in bugs, so I'm marking them as reserved.
574 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
575 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
576
577 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
578 reserveRegisterTuples(Reserved, AMDGPU::M0);
579
580 // Reserve src_vccz, src_execz, src_scc.
581 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
582 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
583 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
584
585 // Reserve the memory aperture registers
586 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
587 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
588 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
589 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
590
591 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
592 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
593
594 // Reserve xnack_mask registers - support is not implemented in Codegen.
595 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
596
597 // Reserve lds_direct register - support is not implemented in Codegen.
598 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
599
600 // Reserve Trap Handler registers - support is not implemented in Codegen.
601 reserveRegisterTuples(Reserved, AMDGPU::TBA);
602 reserveRegisterTuples(Reserved, AMDGPU::TMA);
603 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
604 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
605 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
606 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
607 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
608 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
609 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
610 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
611
612 // Reserve null register - it shall never be allocated
613 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
614
615 // Reserve SGPRs.
616 //
617 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
618 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
619 for (const TargetRegisterClass *RC : regclasses()) {
620 if (RC->isBaseClass() && isSGPRClass(RC)) {
621 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
622 for (MCPhysReg Reg : *RC) {
623 unsigned Index = getHWRegIndex(Reg);
624 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
625 Reserved.set(Reg);
626 }
627 }
628 }
629
630 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
631 if (ScratchRSrcReg != AMDGPU::NoRegister) {
632 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
633 // need to spill.
634 // TODO: May need to reserve a VGPR if doing LDS spilling.
635 reserveRegisterTuples(Reserved, Reg: ScratchRSrcReg);
636 }
637
638 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
639 if (LongBranchReservedReg)
640 reserveRegisterTuples(Reserved, Reg: LongBranchReservedReg);
641
642 // We have to assume the SP is needed in case there are calls in the function,
643 // which is detected after the function is lowered. If we aren't really going
644 // to need SP, don't bother reserving it.
645 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
646 if (StackPtrReg) {
647 reserveRegisterTuples(Reserved, Reg: StackPtrReg);
648 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
649 }
650
651 MCRegister FrameReg = MFI->getFrameOffsetReg();
652 if (FrameReg) {
653 reserveRegisterTuples(Reserved, Reg: FrameReg);
654 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
655 }
656
657 if (hasBasePointer(MF)) {
658 MCRegister BasePtrReg = getBaseRegister();
659 reserveRegisterTuples(Reserved, Reg: BasePtrReg);
660 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
661 }
662
663 // FIXME: Use same reserved register introduced in D149775
664 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
665 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
666 if (ExecCopyReg)
667 reserveRegisterTuples(Reserved, Reg: ExecCopyReg);
668
669 // Reserve VGPRs/AGPRs.
670 //
671 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
672 unsigned MaxNumAGPRs = MaxNumVGPRs;
673 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
674
675 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
676 // a wave may have up to 512 total vector registers combining together both
677 // VGPRs and AGPRs. Hence, in an entry function without calls and without
678 // AGPRs used within it, it is possible to use the whole vector register
679 // budget for VGPRs.
680 //
681 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
682 // register file accordingly.
683 if (ST.hasGFX90AInsts()) {
684 if (MFI->usesAGPRs(MF)) {
685 MaxNumVGPRs /= 2;
686 MaxNumAGPRs = MaxNumVGPRs;
687 } else {
688 if (MaxNumVGPRs > TotalNumVGPRs) {
689 MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
690 MaxNumVGPRs = TotalNumVGPRs;
691 } else
692 MaxNumAGPRs = 0;
693 }
694 }
695
696 for (const TargetRegisterClass *RC : regclasses()) {
697 if (RC->isBaseClass() && isVGPRClass(RC)) {
698 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
699 for (MCPhysReg Reg : *RC) {
700 unsigned Index = getHWRegIndex(Reg);
701 if (Index + NumRegs > MaxNumVGPRs)
702 Reserved.set(Reg);
703 }
704 }
705 }
706
707 // Reserve all the AGPRs if there are no instructions to use it.
708 if (!ST.hasMAIInsts())
709 MaxNumAGPRs = 0;
710 for (const TargetRegisterClass *RC : regclasses()) {
711 if (RC->isBaseClass() && isAGPRClass(RC)) {
712 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
713 for (MCPhysReg Reg : *RC) {
714 unsigned Index = getHWRegIndex(Reg);
715 if (Index + NumRegs > MaxNumAGPRs)
716 Reserved.set(Reg);
717 }
718 }
719 }
720
721 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
722 // VGPR available at all times.
723 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
724 reserveRegisterTuples(Reserved, Reg: MFI->getVGPRForAGPRCopy());
725 }
726
727 for (Register Reg : MFI->getWWMReservedRegs())
728 reserveRegisterTuples(Reserved, Reg);
729
730 // FIXME: Stop using reserved registers for this.
731 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
732 reserveRegisterTuples(Reserved, Reg);
733
734 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
735 reserveRegisterTuples(Reserved, Reg);
736
737 return Reserved;
738}
739
740bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
741 MCRegister PhysReg) const {
742 return !MF.getRegInfo().isReserved(PhysReg);
743}
744
745bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
746 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
747 // On entry or in chain functions, the base address is 0, so it can't possibly
748 // need any more alignment.
749
750 // FIXME: Should be able to specify the entry frame alignment per calling
751 // convention instead.
752 if (Info->isBottomOfStack())
753 return false;
754
755 return TargetRegisterInfo::shouldRealignStack(MF);
756}
757
758bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
759 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
760 if (Info->isEntryFunction()) {
761 const MachineFrameInfo &MFI = Fn.getFrameInfo();
762 return MFI.hasStackObjects() || MFI.hasCalls();
763 }
764
765 // May need scavenger for dealing with callee saved registers.
766 return true;
767}
768
769bool SIRegisterInfo::requiresFrameIndexScavenging(
770 const MachineFunction &MF) const {
771 // Do not use frame virtual registers. They used to be used for SGPRs, but
772 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
773 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
774 // spill.
775 return false;
776}
777
778bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
779 const MachineFunction &MF) const {
780 const MachineFrameInfo &MFI = MF.getFrameInfo();
781 return MFI.hasStackObjects();
782}
783
784bool SIRegisterInfo::requiresVirtualBaseRegisters(
785 const MachineFunction &) const {
786 // There are no special dedicated stack or frame pointers.
787 return true;
788}
789
790int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
791 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
792
793 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
794 AMDGPU::OpName::offset);
795 return MI->getOperand(i: OffIdx).getImm();
796}
797
798int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
799 int Idx) const {
800 if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isFLATScratch(MI: *MI))
801 return 0;
802
803 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
804 AMDGPU::OpName::vaddr) ||
805 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
806 AMDGPU::OpName::saddr))) &&
807 "Should never see frame index on non-address operand");
808
809 return getScratchInstrOffset(MI);
810}
811
812bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
813 if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isFLATScratch(MI: *MI))
814 return false;
815
816 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
817
818 const SIInstrInfo *TII = ST.getInstrInfo();
819 if (SIInstrInfo::isMUBUF(MI: *MI))
820 return !TII->isLegalMUBUFImmOffset(Imm: FullOffset);
821
822 return !TII->isLegalFLATOffset(Offset: FullOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
823 FlatVariant: SIInstrFlags::FlatScratch);
824}
825
826Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
827 int FrameIdx,
828 int64_t Offset) const {
829 MachineBasicBlock::iterator Ins = MBB->begin();
830 DebugLoc DL; // Defaults to "unknown"
831
832 if (Ins != MBB->end())
833 DL = Ins->getDebugLoc();
834
835 MachineFunction *MF = MBB->getParent();
836 const SIInstrInfo *TII = ST.getInstrInfo();
837 MachineRegisterInfo &MRI = MF->getRegInfo();
838 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
839 : AMDGPU::V_MOV_B32_e32;
840
841 Register BaseReg = MRI.createVirtualRegister(
842 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
843 : &AMDGPU::VGPR_32RegClass);
844
845 if (Offset == 0) {
846 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
847 .addFrameIndex(FrameIdx);
848 return BaseReg;
849 }
850
851 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
852
853 Register FIReg = MRI.createVirtualRegister(
854 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
855 : &AMDGPU::VGPR_32RegClass);
856
857 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
858 .addImm(Offset);
859 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
860 .addFrameIndex(FrameIdx);
861
862 if (ST.enableFlatScratch() ) {
863 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
864 .addReg(OffsetReg, RegState::Kill)
865 .addReg(FIReg);
866 return BaseReg;
867 }
868
869 TII->getAddNoCarry(MBB&: *MBB, I: Ins, DL, DestReg: BaseReg)
870 .addReg(RegNo: OffsetReg, flags: RegState::Kill)
871 .addReg(RegNo: FIReg)
872 .addImm(Val: 0); // clamp bit
873
874 return BaseReg;
875}
876
877void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
878 int64_t Offset) const {
879 const SIInstrInfo *TII = ST.getInstrInfo();
880 bool IsFlat = TII->isFLATScratch(MI);
881
882#ifndef NDEBUG
883 // FIXME: Is it possible to be storing a frame index to itself?
884 bool SeenFI = false;
885 for (const MachineOperand &MO: MI.operands()) {
886 if (MO.isFI()) {
887 if (SeenFI)
888 llvm_unreachable("should not see multiple frame indices");
889
890 SeenFI = true;
891 }
892 }
893#endif
894
895 MachineOperand *FIOp =
896 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
897 : AMDGPU::OpName::vaddr);
898
899 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
900 int64_t NewOffset = OffsetOp->getImm() + Offset;
901
902 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
903 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
904
905 if (IsFlat) {
906 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
907 SIInstrFlags::FlatScratch) &&
908 "offset should be legal");
909 FIOp->ChangeToRegister(Reg: BaseReg, isDef: false);
910 OffsetOp->setImm(NewOffset);
911 return;
912 }
913
914#ifndef NDEBUG
915 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
916 assert(SOffset->isImm() && SOffset->getImm() == 0);
917#endif
918
919 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
920
921 FIOp->ChangeToRegister(Reg: BaseReg, isDef: false);
922 OffsetOp->setImm(NewOffset);
923}
924
925bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
926 Register BaseReg,
927 int64_t Offset) const {
928 if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isFLATScratch(MI: *MI))
929 return false;
930
931 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
932
933 const SIInstrInfo *TII = ST.getInstrInfo();
934 if (SIInstrInfo::isMUBUF(MI: *MI))
935 return TII->isLegalMUBUFImmOffset(Imm: NewOffset);
936
937 return TII->isLegalFLATOffset(Offset: NewOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
938 FlatVariant: SIInstrFlags::FlatScratch);
939}
940
941const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
942 const MachineFunction &MF, unsigned Kind) const {
943 // This is inaccurate. It depends on the instruction and address space. The
944 // only place where we should hit this is for dealing with frame indexes /
945 // private accesses, so this is correct in that case.
946 return &AMDGPU::VGPR_32RegClass;
947}
948
949const TargetRegisterClass *
950SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
951 if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
952 return getEquivalentVGPRClass(SRC: RC);
953 if (RC == &AMDGPU::SCC_CLASSRegClass)
954 return getWaveMaskRegClass();
955
956 return RC;
957}
958
959static unsigned getNumSubRegsForSpillOp(unsigned Op) {
960
961 switch (Op) {
962 case AMDGPU::SI_SPILL_S1024_SAVE:
963 case AMDGPU::SI_SPILL_S1024_RESTORE:
964 case AMDGPU::SI_SPILL_V1024_SAVE:
965 case AMDGPU::SI_SPILL_V1024_RESTORE:
966 case AMDGPU::SI_SPILL_A1024_SAVE:
967 case AMDGPU::SI_SPILL_A1024_RESTORE:
968 case AMDGPU::SI_SPILL_AV1024_SAVE:
969 case AMDGPU::SI_SPILL_AV1024_RESTORE:
970 return 32;
971 case AMDGPU::SI_SPILL_S512_SAVE:
972 case AMDGPU::SI_SPILL_S512_RESTORE:
973 case AMDGPU::SI_SPILL_V512_SAVE:
974 case AMDGPU::SI_SPILL_V512_RESTORE:
975 case AMDGPU::SI_SPILL_A512_SAVE:
976 case AMDGPU::SI_SPILL_A512_RESTORE:
977 case AMDGPU::SI_SPILL_AV512_SAVE:
978 case AMDGPU::SI_SPILL_AV512_RESTORE:
979 return 16;
980 case AMDGPU::SI_SPILL_S384_SAVE:
981 case AMDGPU::SI_SPILL_S384_RESTORE:
982 case AMDGPU::SI_SPILL_V384_SAVE:
983 case AMDGPU::SI_SPILL_V384_RESTORE:
984 case AMDGPU::SI_SPILL_A384_SAVE:
985 case AMDGPU::SI_SPILL_A384_RESTORE:
986 case AMDGPU::SI_SPILL_AV384_SAVE:
987 case AMDGPU::SI_SPILL_AV384_RESTORE:
988 return 12;
989 case AMDGPU::SI_SPILL_S352_SAVE:
990 case AMDGPU::SI_SPILL_S352_RESTORE:
991 case AMDGPU::SI_SPILL_V352_SAVE:
992 case AMDGPU::SI_SPILL_V352_RESTORE:
993 case AMDGPU::SI_SPILL_A352_SAVE:
994 case AMDGPU::SI_SPILL_A352_RESTORE:
995 case AMDGPU::SI_SPILL_AV352_SAVE:
996 case AMDGPU::SI_SPILL_AV352_RESTORE:
997 return 11;
998 case AMDGPU::SI_SPILL_S320_SAVE:
999 case AMDGPU::SI_SPILL_S320_RESTORE:
1000 case AMDGPU::SI_SPILL_V320_SAVE:
1001 case AMDGPU::SI_SPILL_V320_RESTORE:
1002 case AMDGPU::SI_SPILL_A320_SAVE:
1003 case AMDGPU::SI_SPILL_A320_RESTORE:
1004 case AMDGPU::SI_SPILL_AV320_SAVE:
1005 case AMDGPU::SI_SPILL_AV320_RESTORE:
1006 return 10;
1007 case AMDGPU::SI_SPILL_S288_SAVE:
1008 case AMDGPU::SI_SPILL_S288_RESTORE:
1009 case AMDGPU::SI_SPILL_V288_SAVE:
1010 case AMDGPU::SI_SPILL_V288_RESTORE:
1011 case AMDGPU::SI_SPILL_A288_SAVE:
1012 case AMDGPU::SI_SPILL_A288_RESTORE:
1013 case AMDGPU::SI_SPILL_AV288_SAVE:
1014 case AMDGPU::SI_SPILL_AV288_RESTORE:
1015 return 9;
1016 case AMDGPU::SI_SPILL_S256_SAVE:
1017 case AMDGPU::SI_SPILL_S256_RESTORE:
1018 case AMDGPU::SI_SPILL_V256_SAVE:
1019 case AMDGPU::SI_SPILL_V256_RESTORE:
1020 case AMDGPU::SI_SPILL_A256_SAVE:
1021 case AMDGPU::SI_SPILL_A256_RESTORE:
1022 case AMDGPU::SI_SPILL_AV256_SAVE:
1023 case AMDGPU::SI_SPILL_AV256_RESTORE:
1024 return 8;
1025 case AMDGPU::SI_SPILL_S224_SAVE:
1026 case AMDGPU::SI_SPILL_S224_RESTORE:
1027 case AMDGPU::SI_SPILL_V224_SAVE:
1028 case AMDGPU::SI_SPILL_V224_RESTORE:
1029 case AMDGPU::SI_SPILL_A224_SAVE:
1030 case AMDGPU::SI_SPILL_A224_RESTORE:
1031 case AMDGPU::SI_SPILL_AV224_SAVE:
1032 case AMDGPU::SI_SPILL_AV224_RESTORE:
1033 return 7;
1034 case AMDGPU::SI_SPILL_S192_SAVE:
1035 case AMDGPU::SI_SPILL_S192_RESTORE:
1036 case AMDGPU::SI_SPILL_V192_SAVE:
1037 case AMDGPU::SI_SPILL_V192_RESTORE:
1038 case AMDGPU::SI_SPILL_A192_SAVE:
1039 case AMDGPU::SI_SPILL_A192_RESTORE:
1040 case AMDGPU::SI_SPILL_AV192_SAVE:
1041 case AMDGPU::SI_SPILL_AV192_RESTORE:
1042 return 6;
1043 case AMDGPU::SI_SPILL_S160_SAVE:
1044 case AMDGPU::SI_SPILL_S160_RESTORE:
1045 case AMDGPU::SI_SPILL_V160_SAVE:
1046 case AMDGPU::SI_SPILL_V160_RESTORE:
1047 case AMDGPU::SI_SPILL_A160_SAVE:
1048 case AMDGPU::SI_SPILL_A160_RESTORE:
1049 case AMDGPU::SI_SPILL_AV160_SAVE:
1050 case AMDGPU::SI_SPILL_AV160_RESTORE:
1051 return 5;
1052 case AMDGPU::SI_SPILL_S128_SAVE:
1053 case AMDGPU::SI_SPILL_S128_RESTORE:
1054 case AMDGPU::SI_SPILL_V128_SAVE:
1055 case AMDGPU::SI_SPILL_V128_RESTORE:
1056 case AMDGPU::SI_SPILL_A128_SAVE:
1057 case AMDGPU::SI_SPILL_A128_RESTORE:
1058 case AMDGPU::SI_SPILL_AV128_SAVE:
1059 case AMDGPU::SI_SPILL_AV128_RESTORE:
1060 return 4;
1061 case AMDGPU::SI_SPILL_S96_SAVE:
1062 case AMDGPU::SI_SPILL_S96_RESTORE:
1063 case AMDGPU::SI_SPILL_V96_SAVE:
1064 case AMDGPU::SI_SPILL_V96_RESTORE:
1065 case AMDGPU::SI_SPILL_A96_SAVE:
1066 case AMDGPU::SI_SPILL_A96_RESTORE:
1067 case AMDGPU::SI_SPILL_AV96_SAVE:
1068 case AMDGPU::SI_SPILL_AV96_RESTORE:
1069 return 3;
1070 case AMDGPU::SI_SPILL_S64_SAVE:
1071 case AMDGPU::SI_SPILL_S64_RESTORE:
1072 case AMDGPU::SI_SPILL_V64_SAVE:
1073 case AMDGPU::SI_SPILL_V64_RESTORE:
1074 case AMDGPU::SI_SPILL_A64_SAVE:
1075 case AMDGPU::SI_SPILL_A64_RESTORE:
1076 case AMDGPU::SI_SPILL_AV64_SAVE:
1077 case AMDGPU::SI_SPILL_AV64_RESTORE:
1078 return 2;
1079 case AMDGPU::SI_SPILL_S32_SAVE:
1080 case AMDGPU::SI_SPILL_S32_RESTORE:
1081 case AMDGPU::SI_SPILL_V32_SAVE:
1082 case AMDGPU::SI_SPILL_V32_RESTORE:
1083 case AMDGPU::SI_SPILL_A32_SAVE:
1084 case AMDGPU::SI_SPILL_A32_RESTORE:
1085 case AMDGPU::SI_SPILL_AV32_SAVE:
1086 case AMDGPU::SI_SPILL_AV32_RESTORE:
1087 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1088 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1089 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1090 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1091 return 1;
1092 default: llvm_unreachable("Invalid spill opcode");
1093 }
1094}
1095
1096static int getOffsetMUBUFStore(unsigned Opc) {
1097 switch (Opc) {
1098 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1099 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1100 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1101 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1102 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1103 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1104 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1105 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1106 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1107 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1108 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1109 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1110 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1111 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1112 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1113 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1114 default:
1115 return -1;
1116 }
1117}
1118
1119static int getOffsetMUBUFLoad(unsigned Opc) {
1120 switch (Opc) {
1121 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1122 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1123 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1124 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1125 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1126 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1127 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1128 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1129 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1130 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1131 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1132 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1133 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1134 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1135 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1136 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1137 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1138 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1139 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1140 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1141 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1142 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1143 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1144 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1145 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1146 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1147 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1148 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1149 default:
1150 return -1;
1151 }
1152}
1153
1154static int getOffenMUBUFStore(unsigned Opc) {
1155 switch (Opc) {
1156 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1157 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1158 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1159 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1160 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1161 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1162 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1163 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1164 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1165 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1166 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1167 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1168 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1169 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1170 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1171 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1172 default:
1173 return -1;
1174 }
1175}
1176
1177static int getOffenMUBUFLoad(unsigned Opc) {
1178 switch (Opc) {
1179 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1180 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1181 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1182 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1183 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1184 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1185 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1186 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1187 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1188 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1189 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1190 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1191 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1192 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1193 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1194 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1195 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1196 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1197 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1198 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1199 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1200 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1201 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1202 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1203 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1204 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1205 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1206 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1207 default:
1208 return -1;
1209 }
1210}
1211
1212static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
1213 MachineBasicBlock &MBB,
1214 MachineBasicBlock::iterator MI,
1215 int Index, unsigned Lane,
1216 unsigned ValueReg, bool IsKill) {
1217 MachineFunction *MF = MBB.getParent();
1218 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1219 const SIInstrInfo *TII = ST.getInstrInfo();
1220
1221 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(FrameIndex: Index, Lane);
1222
1223 if (Reg == AMDGPU::NoRegister)
1224 return MachineInstrBuilder();
1225
1226 bool IsStore = MI->mayStore();
1227 MachineRegisterInfo &MRI = MF->getRegInfo();
1228 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1229
1230 unsigned Dst = IsStore ? Reg : ValueReg;
1231 unsigned Src = IsStore ? ValueReg : Reg;
1232 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1233 DebugLoc DL = MI->getDebugLoc();
1234 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1235 // Spiller during regalloc may restore a spilled register to its superclass.
1236 // It could result in AGPR spills restored to VGPRs or the other way around,
1237 // making the src and dst with identical regclasses at this point. It just
1238 // needs a copy in such cases.
1239 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1240 .addReg(Src, getKillRegState(IsKill));
1241 CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1242 return CopyMIB;
1243 }
1244 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1245 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1246
1247 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1248 .addReg(Src, getKillRegState(B: IsKill));
1249 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1250 return MIB;
1251}
1252
1253// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1254// need to handle the case where an SGPR may need to be spilled while spilling.
1255static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
1256 MachineFrameInfo &MFI,
1257 MachineBasicBlock::iterator MI,
1258 int Index,
1259 int64_t Offset) {
1260 const SIInstrInfo *TII = ST.getInstrInfo();
1261 MachineBasicBlock *MBB = MI->getParent();
1262 const DebugLoc &DL = MI->getDebugLoc();
1263 bool IsStore = MI->mayStore();
1264
1265 unsigned Opc = MI->getOpcode();
1266 int LoadStoreOp = IsStore ?
1267 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
1268 if (LoadStoreOp == -1)
1269 return false;
1270
1271 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1272 if (spillVGPRtoAGPR(ST, MBB&: *MBB, MI, Index, Lane: 0, ValueReg: Reg->getReg(), IsKill: false).getInstr())
1273 return true;
1274
1275 MachineInstrBuilder NewMI =
1276 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1277 .add(*Reg)
1278 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1279 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1280 .addImm(Offset)
1281 .addImm(0) // cpol
1282 .addImm(0) // swz
1283 .cloneMemRefs(*MI);
1284
1285 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1286 AMDGPU::OpName::vdata_in);
1287 if (VDataIn)
1288 NewMI.add(MO: *VDataIn);
1289 return true;
1290}
1291
1292static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
1293 unsigned LoadStoreOp,
1294 unsigned EltSize) {
1295 bool IsStore = TII->get(LoadStoreOp).mayStore();
1296 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1297 bool UseST =
1298 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1299
1300 switch (EltSize) {
1301 case 4:
1302 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1303 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1304 break;
1305 case 8:
1306 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1307 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1308 break;
1309 case 12:
1310 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1311 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1312 break;
1313 case 16:
1314 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1315 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1316 break;
1317 default:
1318 llvm_unreachable("Unexpected spill load/store size!");
1319 }
1320
1321 if (HasVAddr)
1322 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(Opcode: LoadStoreOp);
1323 else if (UseST)
1324 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(Opcode: LoadStoreOp);
1325
1326 return LoadStoreOp;
1327}
1328
1329void SIRegisterInfo::buildSpillLoadStore(
1330 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
1331 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1332 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1333 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1334 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1335
1336 MachineFunction *MF = MBB.getParent();
1337 const SIInstrInfo *TII = ST.getInstrInfo();
1338 const MachineFrameInfo &MFI = MF->getFrameInfo();
1339 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1340
1341 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1342 bool IsStore = Desc->mayStore();
1343 bool IsFlat = TII->isFLATScratch(Opcode: LoadStoreOp);
1344
1345 bool CanClobberSCC = false;
1346 bool Scavenged = false;
1347 MCRegister SOffset = ScratchOffsetReg;
1348
1349 const TargetRegisterClass *RC = getRegClassForReg(MRI: MF->getRegInfo(), Reg: ValueReg);
1350 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1351 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1352 const unsigned RegWidth = AMDGPU::getRegBitWidth(RC: *RC) / 8;
1353
1354 // Always use 4 byte operations for AGPRs because we need to scavenge
1355 // a temporary VGPR.
1356 unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(a: RegWidth, b: 16u) : 4u;
1357 unsigned NumSubRegs = RegWidth / EltSize;
1358 unsigned Size = NumSubRegs * EltSize;
1359 unsigned RemSize = RegWidth - Size;
1360 unsigned NumRemSubRegs = RemSize ? 1 : 0;
1361 int64_t Offset = InstOffset + MFI.getObjectOffset(ObjectIdx: Index);
1362 int64_t MaterializedOffset = Offset;
1363
1364 int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1365 int64_t ScratchOffsetRegDelta = 0;
1366
1367 if (IsFlat && EltSize > 4) {
1368 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1369 Desc = &TII->get(LoadStoreOp);
1370 }
1371
1372 Align Alignment = MFI.getObjectAlign(ObjectIdx: Index);
1373 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1374
1375 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1376 "unexpected VGPR spill offset");
1377
1378 // Track a VGPR to use for a constant offset we need to materialize.
1379 Register TmpOffsetVGPR;
1380
1381 // Track a VGPR to use as an intermediate value.
1382 Register TmpIntermediateVGPR;
1383 bool UseVGPROffset = false;
1384
1385 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1386 // combination.
1387 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1388 int64_t VOffset) {
1389 // We are using a VGPR offset
1390 if (IsFlat && SGPRBase) {
1391 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1392 // SGPR, so perform the add as vector.
1393 // We don't need a base SGPR in the kernel.
1394
1395 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1396 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1397 .addReg(SGPRBase)
1398 .addImm(VOffset)
1399 .addImm(0); // clamp
1400 } else {
1401 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1402 .addReg(SGPRBase);
1403 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1404 .addImm(VOffset)
1405 .addReg(TmpOffsetVGPR);
1406 }
1407 } else {
1408 assert(TmpOffsetVGPR);
1409 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1410 .addImm(VOffset);
1411 }
1412 };
1413
1414 bool IsOffsetLegal =
1415 IsFlat ? TII->isLegalFLATOffset(Offset: MaxOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
1416 FlatVariant: SIInstrFlags::FlatScratch)
1417 : TII->isLegalMUBUFImmOffset(Imm: MaxOffset);
1418 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1419 SOffset = MCRegister();
1420
1421 // We don't have access to the register scavenger if this function is called
1422 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1423 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1424 // entry.
1425 if (RS) {
1426 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1427
1428 // Piggy back on the liveness scan we just did see if SCC is dead.
1429 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1430 } else if (LiveUnits) {
1431 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1432 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1433 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1434 SOffset = Reg;
1435 break;
1436 }
1437 }
1438 }
1439
1440 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1441 SOffset = Register();
1442
1443 if (!SOffset) {
1444 UseVGPROffset = true;
1445
1446 if (RS) {
1447 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1448 } else {
1449 assert(LiveUnits);
1450 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1451 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1452 TmpOffsetVGPR = Reg;
1453 break;
1454 }
1455 }
1456 }
1457
1458 assert(TmpOffsetVGPR);
1459 } else if (!SOffset && CanClobberSCC) {
1460 // There are no free SGPRs, and since we are in the process of spilling
1461 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1462 // on SI/CI and on VI it is true until we implement spilling using scalar
1463 // stores), we have no way to free up an SGPR. Our solution here is to
1464 // add the offset directly to the ScratchOffset or StackPtrOffset
1465 // register, and then subtract the offset after the spill to return the
1466 // register to it's original value.
1467
1468 // TODO: If we don't have to do an emergency stack slot spill, converting
1469 // to use the VGPR offset is fewer instructions.
1470 if (!ScratchOffsetReg)
1471 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1472 SOffset = ScratchOffsetReg;
1473 ScratchOffsetRegDelta = Offset;
1474 } else {
1475 Scavenged = true;
1476 }
1477
1478 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1479 // we can simplify the adjustment of Offset here to just scale with
1480 // WavefrontSize.
1481 if (!IsFlat && !UseVGPROffset)
1482 Offset *= ST.getWavefrontSize();
1483
1484 if (!UseVGPROffset && !SOffset)
1485 report_fatal_error(reason: "could not scavenge SGPR to spill in entry function");
1486
1487 if (UseVGPROffset) {
1488 // We are using a VGPR offset
1489 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1490 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1491 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1492 } else {
1493 assert(Offset != 0);
1494 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1495 .addReg(ScratchOffsetReg)
1496 .addImm(Offset);
1497 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1498 }
1499
1500 Offset = 0;
1501 }
1502
1503 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1504 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1505 && "Unexpected vaddr for flat scratch with a FI operand");
1506
1507 if (UseVGPROffset) {
1508 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(Opcode: LoadStoreOp);
1509 } else {
1510 assert(ST.hasFlatScratchSTMode());
1511 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(Opcode: LoadStoreOp);
1512 }
1513
1514 Desc = &TII->get(LoadStoreOp);
1515 }
1516
1517 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1518 ++i, RegOffset += EltSize) {
1519 if (i == NumSubRegs) {
1520 EltSize = RemSize;
1521 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1522 }
1523 Desc = &TII->get(LoadStoreOp);
1524
1525 if (!IsFlat && UseVGPROffset) {
1526 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(Opc: LoadStoreOp)
1527 : getOffenMUBUFLoad(Opc: LoadStoreOp);
1528 Desc = &TII->get(NewLoadStoreOp);
1529 }
1530
1531 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1532 // If we are spilling an AGPR beyond the range of the memory instruction
1533 // offset and need to use a VGPR offset, we ideally have at least 2
1534 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1535 // recycle the VGPR used for the offset which requires resetting after
1536 // each subregister.
1537
1538 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1539 }
1540
1541 unsigned NumRegs = EltSize / 4;
1542 Register SubReg = e == 1
1543 ? ValueReg
1544 : Register(getSubReg(ValueReg,
1545 getSubRegFromChannel(Channel: RegOffset / 4, NumRegs)));
1546
1547 unsigned SOffsetRegState = 0;
1548 unsigned SrcDstRegState = getDefRegState(B: !IsStore);
1549 const bool IsLastSubReg = i + 1 == e;
1550 const bool IsFirstSubReg = i == 0;
1551 if (IsLastSubReg) {
1552 SOffsetRegState |= getKillRegState(B: Scavenged);
1553 // The last implicit use carries the "Kill" flag.
1554 SrcDstRegState |= getKillRegState(B: IsKill);
1555 }
1556
1557 // Make sure the whole register is defined if there are undef components by
1558 // adding an implicit def of the super-reg on the first instruction.
1559 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1560 bool NeedSuperRegImpOperand = e > 1;
1561
1562 // Remaining element size to spill into memory after some parts of it
1563 // spilled into either AGPRs or VGPRs.
1564 unsigned RemEltSize = EltSize;
1565
1566 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
1567 // starting from the last lane. In case if a register cannot be completely
1568 // spilled into another register that will ensure its alignment does not
1569 // change. For targets with VGPR alignment requirement this is important
1570 // in case of flat scratch usage as we might get a scratch_load or
1571 // scratch_store of an unaligned register otherwise.
1572 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1573 LaneE = RegOffset / 4;
1574 Lane >= LaneE; --Lane) {
1575 bool IsSubReg = e > 1 || EltSize > 4;
1576 Register Sub = IsSubReg
1577 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Channel: Lane)))
1578 : ValueReg;
1579 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, ValueReg: Sub, IsKill);
1580 if (!MIB.getInstr())
1581 break;
1582 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1583 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1584 NeedSuperRegDef = false;
1585 }
1586 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1587 NeedSuperRegImpOperand = true;
1588 unsigned State = SrcDstRegState;
1589 if (!IsLastSubReg || (Lane != LaneE))
1590 State &= ~RegState::Kill;
1591 if (!IsFirstSubReg || (Lane != LaneS))
1592 State &= ~RegState::Define;
1593 MIB.addReg(ValueReg, RegState::Implicit | State);
1594 }
1595 RemEltSize -= 4;
1596 }
1597
1598 if (!RemEltSize) // Fully spilled into AGPRs.
1599 continue;
1600
1601 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1602 assert(IsFlat && EltSize > 4);
1603
1604 unsigned NumRegs = RemEltSize / 4;
1605 SubReg = Register(getSubReg(ValueReg,
1606 getSubRegFromChannel(Channel: RegOffset / 4, NumRegs)));
1607 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize: RemEltSize);
1608 Desc = &TII->get(Opc);
1609 }
1610
1611 unsigned FinalReg = SubReg;
1612
1613 if (IsAGPR) {
1614 assert(EltSize == 4);
1615
1616 if (!TmpIntermediateVGPR) {
1617 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1618 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1619 }
1620 if (IsStore) {
1621 auto AccRead = BuildMI(MBB, MI, DL,
1622 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1623 TmpIntermediateVGPR)
1624 .addReg(SubReg, getKillRegState(IsKill));
1625 if (NeedSuperRegDef)
1626 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1627 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1628 }
1629 SubReg = TmpIntermediateVGPR;
1630 } else if (UseVGPROffset) {
1631 if (!TmpOffsetVGPR) {
1632 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1633 MI, false, 0);
1634 RS->setRegUsed(Reg: TmpOffsetVGPR);
1635 }
1636 }
1637
1638 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(O: RegOffset);
1639 MachineMemOperand *NewMMO =
1640 MF->getMachineMemOperand(PtrInfo: PInfo, F: MMO->getFlags(), Size: RemEltSize,
1641 BaseAlignment: commonAlignment(A: Alignment, Offset: RegOffset));
1642
1643 auto MIB =
1644 BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: *Desc)
1645 .addReg(RegNo: SubReg, flags: getDefRegState(B: !IsStore) | getKillRegState(B: IsKill));
1646
1647 if (UseVGPROffset) {
1648 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1649 // intermediate accvgpr_write.
1650 MIB.addReg(TmpOffsetVGPR, getKillRegState(B: IsLastSubReg && !IsAGPR));
1651 }
1652
1653 if (!IsFlat)
1654 MIB.addReg(FuncInfo->getScratchRSrcReg());
1655
1656 if (SOffset == AMDGPU::NoRegister) {
1657 if (!IsFlat) {
1658 if (UseVGPROffset && ScratchOffsetReg) {
1659 MIB.addReg(ScratchOffsetReg);
1660 } else {
1661 assert(FuncInfo->isBottomOfStack());
1662 MIB.addImm(0);
1663 }
1664 }
1665 } else {
1666 MIB.addReg(SOffset, SOffsetRegState);
1667 }
1668
1669 MIB.addImm(Offset + RegOffset);
1670
1671 bool LastUse = MMO->getFlags() & MOLastUse;
1672 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1673
1674 if (!IsFlat)
1675 MIB.addImm(0); // swz
1676 MIB.addMemOperand(NewMMO);
1677
1678 if (!IsAGPR && NeedSuperRegDef)
1679 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1680
1681 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1682 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1683 FinalReg)
1684 .addReg(TmpIntermediateVGPR, RegState::Kill);
1685 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1686 }
1687
1688 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1689 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1690
1691 // The epilog restore of a wwm-scratch register can cause undesired
1692 // optimization during machine-cp post PrologEpilogInserter if the same
1693 // register was assigned for return value ABI lowering with a COPY
1694 // instruction. As given below, with the epilog reload, the earlier COPY
1695 // appeared to be dead during machine-cp.
1696 // ...
1697 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1698 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1699 // ...
1700 // Epilog block:
1701 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1702 // ...
1703 // WWM spill restore to preserve the inactive lanes of v0.
1704 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1705 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1706 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1707 // ...
1708 // SI_RETURN implicit $vgpr0
1709 // ...
1710 // To fix it, mark the same reg as a tied op for such restore instructions
1711 // so that it marks a usage for the preceding COPY.
1712 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1713 MI->readsRegister(SubReg, this)) {
1714 MIB.addReg(SubReg, RegState::Implicit);
1715 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1716 }
1717 }
1718
1719 if (ScratchOffsetRegDelta != 0) {
1720 // Subtract the offset we added to the ScratchOffset register.
1721 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1722 .addReg(SOffset)
1723 .addImm(-ScratchOffsetRegDelta);
1724 }
1725}
1726
1727void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1728 int Offset, bool IsLoad,
1729 bool IsKill) const {
1730 // Load/store VGPR
1731 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1732 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1733
1734 Register FrameReg =
1735 FrameInfo.isFixedObjectIndex(ObjectIdx: Index) && hasBasePointer(MF: SB.MF)
1736 ? getBaseRegister()
1737 : getFrameRegister(MF: SB.MF);
1738
1739 Align Alignment = FrameInfo.getObjectAlign(ObjectIdx: Index);
1740 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF&: SB.MF, FI: Index);
1741 MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
1742 PtrInfo, F: IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1743 Size: SB.EltSize, BaseAlignment: Alignment);
1744
1745 if (IsLoad) {
1746 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1747 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1748 buildSpillLoadStore(MBB&: *SB.MBB, MI: SB.MI, DL: SB.DL, LoadStoreOp: Opc, Index, ValueReg: SB.TmpVGPR, IsKill: false,
1749 ScratchOffsetReg: FrameReg, InstOffset: (int64_t)Offset * SB.EltSize, MMO, RS: SB.RS);
1750 } else {
1751 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1752 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1753 buildSpillLoadStore(MBB&: *SB.MBB, MI: SB.MI, DL: SB.DL, LoadStoreOp: Opc, Index, ValueReg: SB.TmpVGPR, IsKill,
1754 ScratchOffsetReg: FrameReg, InstOffset: (int64_t)Offset * SB.EltSize, MMO, RS: SB.RS);
1755 // This only ever adds one VGPR spill
1756 SB.MFI.addToSpilledVGPRs(num: 1);
1757 }
1758}
1759
1760bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
1761 RegScavenger *RS, SlotIndexes *Indexes,
1762 LiveIntervals *LIS, bool OnlyToVGPR,
1763 bool SpillToPhysVGPRLane) const {
1764 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
1765
1766 ArrayRef<SpilledReg> VGPRSpills =
1767 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(FrameIndex: Index)
1768 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(FrameIndex: Index);
1769 bool SpillToVGPR = !VGPRSpills.empty();
1770 if (OnlyToVGPR && !SpillToVGPR)
1771 return false;
1772
1773 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
1774 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
1775
1776 if (SpillToVGPR) {
1777
1778 assert(SB.NumSubRegs == VGPRSpills.size() &&
1779 "Num of VGPR lanes should be equal to num of SGPRs spilled");
1780
1781 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
1782 Register SubReg =
1783 SB.NumSubRegs == 1
1784 ? SB.SuperReg
1785 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1786 SpilledReg Spill = VGPRSpills[i];
1787
1788 bool IsFirstSubreg = i == 0;
1789 bool IsLastSubreg = i == SB.NumSubRegs - 1;
1790 bool UseKill = SB.IsKill && IsLastSubreg;
1791
1792
1793 // Mark the "old value of vgpr" input undef only if this is the first sgpr
1794 // spill to this specific vgpr in the first basic block.
1795 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1796 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
1797 .addReg(SubReg, getKillRegState(UseKill))
1798 .addImm(Spill.Lane)
1799 .addReg(Spill.VGPR);
1800 if (Indexes) {
1801 if (IsFirstSubreg)
1802 Indexes->replaceMachineInstrInMaps(MI&: *MI, NewMI&: *MIB);
1803 else
1804 Indexes->insertMachineInstrInMaps(MI&: *MIB);
1805 }
1806
1807 if (IsFirstSubreg && SB.NumSubRegs > 1) {
1808 // We may be spilling a super-register which is only partially defined,
1809 // and need to ensure later spills think the value is defined.
1810 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
1811 }
1812
1813 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
1814 MIB.addReg(SB.SuperReg, getKillRegState(B: UseKill) | RegState::Implicit);
1815
1816 // FIXME: Since this spills to another register instead of an actual
1817 // frame index, we should delete the frame index when all references to
1818 // it are fixed.
1819 }
1820 } else {
1821 SB.prepare();
1822
1823 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
1824 unsigned SubKillState = getKillRegState(B: (SB.NumSubRegs == 1) && SB.IsKill);
1825
1826 // Per VGPR helper data
1827 auto PVD = SB.getPerVGPRData();
1828
1829 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
1830 unsigned TmpVGPRFlags = RegState::Undef;
1831
1832 // Write sub registers into the VGPR
1833 for (unsigned i = Offset * PVD.PerVGPR,
1834 e = std::min(a: (Offset + 1) * PVD.PerVGPR, b: SB.NumSubRegs);
1835 i < e; ++i) {
1836 Register SubReg =
1837 SB.NumSubRegs == 1
1838 ? SB.SuperReg
1839 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1840
1841 MachineInstrBuilder WriteLane =
1842 BuildMI(*SB.MBB, MI, SB.DL,
1843 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
1844 .addReg(SubReg, SubKillState)
1845 .addImm(i % PVD.PerVGPR)
1846 .addReg(SB.TmpVGPR, TmpVGPRFlags);
1847 TmpVGPRFlags = 0;
1848
1849 if (Indexes) {
1850 if (i == 0)
1851 Indexes->replaceMachineInstrInMaps(MI&: *MI, NewMI&: *WriteLane);
1852 else
1853 Indexes->insertMachineInstrInMaps(MI&: *WriteLane);
1854 }
1855
1856 // There could be undef components of a spilled super register.
1857 // TODO: Can we detect this and skip the spill?
1858 if (SB.NumSubRegs > 1) {
1859 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
1860 unsigned SuperKillState = 0;
1861 if (i + 1 == SB.NumSubRegs)
1862 SuperKillState |= getKillRegState(B: SB.IsKill);
1863 WriteLane.addReg(RegNo: SB.SuperReg, flags: RegState::Implicit | SuperKillState);
1864 }
1865 }
1866
1867 // Write out VGPR
1868 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
1869 }
1870
1871 SB.restore();
1872 }
1873
1874 MI->eraseFromParent();
1875 SB.MFI.addToSpilledSGPRs(num: SB.NumSubRegs);
1876
1877 if (LIS)
1878 LIS->removeAllRegUnitsForPhysReg(Reg: SB.SuperReg);
1879
1880 return true;
1881}
1882
1883bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
1884 RegScavenger *RS, SlotIndexes *Indexes,
1885 LiveIntervals *LIS, bool OnlyToVGPR,
1886 bool SpillToPhysVGPRLane) const {
1887 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
1888
1889 ArrayRef<SpilledReg> VGPRSpills =
1890 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(FrameIndex: Index)
1891 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(FrameIndex: Index);
1892 bool SpillToVGPR = !VGPRSpills.empty();
1893 if (OnlyToVGPR && !SpillToVGPR)
1894 return false;
1895
1896 if (SpillToVGPR) {
1897 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
1898 Register SubReg =
1899 SB.NumSubRegs == 1
1900 ? SB.SuperReg
1901 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1902
1903 SpilledReg Spill = VGPRSpills[i];
1904 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1905 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
1906 .addReg(Spill.VGPR)
1907 .addImm(Spill.Lane);
1908 if (SB.NumSubRegs > 1 && i == 0)
1909 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
1910 if (Indexes) {
1911 if (i == e - 1)
1912 Indexes->replaceMachineInstrInMaps(MI&: *MI, NewMI&: *MIB);
1913 else
1914 Indexes->insertMachineInstrInMaps(MI&: *MIB);
1915 }
1916 }
1917 } else {
1918 SB.prepare();
1919
1920 // Per VGPR helper data
1921 auto PVD = SB.getPerVGPRData();
1922
1923 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
1924 // Load in VGPR data
1925 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
1926
1927 // Unpack lanes
1928 for (unsigned i = Offset * PVD.PerVGPR,
1929 e = std::min(a: (Offset + 1) * PVD.PerVGPR, b: SB.NumSubRegs);
1930 i < e; ++i) {
1931 Register SubReg =
1932 SB.NumSubRegs == 1
1933 ? SB.SuperReg
1934 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1935
1936 bool LastSubReg = (i + 1 == e);
1937 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1938 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
1939 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
1940 .addImm(i);
1941 if (SB.NumSubRegs > 1 && i == 0)
1942 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
1943 if (Indexes) {
1944 if (i == e - 1)
1945 Indexes->replaceMachineInstrInMaps(MI&: *MI, NewMI&: *MIB);
1946 else
1947 Indexes->insertMachineInstrInMaps(MI&: *MIB);
1948 }
1949 }
1950 }
1951
1952 SB.restore();
1953 }
1954
1955 MI->eraseFromParent();
1956
1957 if (LIS)
1958 LIS->removeAllRegUnitsForPhysReg(Reg: SB.SuperReg);
1959
1960 return true;
1961}
1962
1963bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
1964 MachineBasicBlock &RestoreMBB,
1965 Register SGPR, RegScavenger *RS) const {
1966 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
1967 RS);
1968 SB.prepare();
1969 // Generate the spill of SGPR to SB.TmpVGPR.
1970 unsigned SubKillState = getKillRegState(B: (SB.NumSubRegs == 1) && SB.IsKill);
1971 auto PVD = SB.getPerVGPRData();
1972 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
1973 unsigned TmpVGPRFlags = RegState::Undef;
1974 // Write sub registers into the VGPR
1975 for (unsigned i = Offset * PVD.PerVGPR,
1976 e = std::min(a: (Offset + 1) * PVD.PerVGPR, b: SB.NumSubRegs);
1977 i < e; ++i) {
1978 Register SubReg =
1979 SB.NumSubRegs == 1
1980 ? SB.SuperReg
1981 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1982
1983 MachineInstrBuilder WriteLane =
1984 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
1985 SB.TmpVGPR)
1986 .addReg(SubReg, SubKillState)
1987 .addImm(i % PVD.PerVGPR)
1988 .addReg(SB.TmpVGPR, TmpVGPRFlags);
1989 TmpVGPRFlags = 0;
1990 // There could be undef components of a spilled super register.
1991 // TODO: Can we detect this and skip the spill?
1992 if (SB.NumSubRegs > 1) {
1993 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
1994 unsigned SuperKillState = 0;
1995 if (i + 1 == SB.NumSubRegs)
1996 SuperKillState |= getKillRegState(B: SB.IsKill);
1997 WriteLane.addReg(RegNo: SB.SuperReg, flags: RegState::Implicit | SuperKillState);
1998 }
1999 }
2000 // Don't need to write VGPR out.
2001 }
2002
2003 // Restore clobbered registers in the specified restore block.
2004 MI = RestoreMBB.end();
2005 SB.setMI(NewMBB: &RestoreMBB, NewMI: MI);
2006 // Generate the restore of SGPR from SB.TmpVGPR.
2007 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2008 // Don't need to load VGPR in.
2009 // Unpack lanes
2010 for (unsigned i = Offset * PVD.PerVGPR,
2011 e = std::min(a: (Offset + 1) * PVD.PerVGPR, b: SB.NumSubRegs);
2012 i < e; ++i) {
2013 Register SubReg =
2014 SB.NumSubRegs == 1
2015 ? SB.SuperReg
2016 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2017 bool LastSubReg = (i + 1 == e);
2018 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2019 SubReg)
2020 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2021 .addImm(i);
2022 if (SB.NumSubRegs > 1 && i == 0)
2023 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2024 }
2025 }
2026 SB.restore();
2027
2028 SB.MFI.addToSpilledSGPRs(num: SB.NumSubRegs);
2029 return false;
2030}
2031
2032/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2033/// a VGPR and the stack slot can be safely eliminated when all other users are
2034/// handled.
2035bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2036 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2037 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2038 switch (MI->getOpcode()) {
2039 case AMDGPU::SI_SPILL_S1024_SAVE:
2040 case AMDGPU::SI_SPILL_S512_SAVE:
2041 case AMDGPU::SI_SPILL_S384_SAVE:
2042 case AMDGPU::SI_SPILL_S352_SAVE:
2043 case AMDGPU::SI_SPILL_S320_SAVE:
2044 case AMDGPU::SI_SPILL_S288_SAVE:
2045 case AMDGPU::SI_SPILL_S256_SAVE:
2046 case AMDGPU::SI_SPILL_S224_SAVE:
2047 case AMDGPU::SI_SPILL_S192_SAVE:
2048 case AMDGPU::SI_SPILL_S160_SAVE:
2049 case AMDGPU::SI_SPILL_S128_SAVE:
2050 case AMDGPU::SI_SPILL_S96_SAVE:
2051 case AMDGPU::SI_SPILL_S64_SAVE:
2052 case AMDGPU::SI_SPILL_S32_SAVE:
2053 return spillSGPR(MI, Index: FI, RS, Indexes, LIS, OnlyToVGPR: true, SpillToPhysVGPRLane);
2054 case AMDGPU::SI_SPILL_S1024_RESTORE:
2055 case AMDGPU::SI_SPILL_S512_RESTORE:
2056 case AMDGPU::SI_SPILL_S384_RESTORE:
2057 case AMDGPU::SI_SPILL_S352_RESTORE:
2058 case AMDGPU::SI_SPILL_S320_RESTORE:
2059 case AMDGPU::SI_SPILL_S288_RESTORE:
2060 case AMDGPU::SI_SPILL_S256_RESTORE:
2061 case AMDGPU::SI_SPILL_S224_RESTORE:
2062 case AMDGPU::SI_SPILL_S192_RESTORE:
2063 case AMDGPU::SI_SPILL_S160_RESTORE:
2064 case AMDGPU::SI_SPILL_S128_RESTORE:
2065 case AMDGPU::SI_SPILL_S96_RESTORE:
2066 case AMDGPU::SI_SPILL_S64_RESTORE:
2067 case AMDGPU::SI_SPILL_S32_RESTORE:
2068 return restoreSGPR(MI, Index: FI, RS, Indexes, LIS, OnlyToVGPR: true, SpillToPhysVGPRLane);
2069 default:
2070 llvm_unreachable("not an SGPR spill instruction");
2071 }
2072}
2073
2074bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2075 int SPAdj, unsigned FIOperandNum,
2076 RegScavenger *RS) const {
2077 MachineFunction *MF = MI->getParent()->getParent();
2078 MachineBasicBlock *MBB = MI->getParent();
2079 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2080 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2081 const SIInstrInfo *TII = ST.getInstrInfo();
2082 DebugLoc DL = MI->getDebugLoc();
2083
2084 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2085
2086 MachineOperand &FIOp = MI->getOperand(i: FIOperandNum);
2087 int Index = MI->getOperand(i: FIOperandNum).getIndex();
2088
2089 Register FrameReg = FrameInfo.isFixedObjectIndex(ObjectIdx: Index) && hasBasePointer(MF: *MF)
2090 ? getBaseRegister()
2091 : getFrameRegister(MF: *MF);
2092
2093 switch (MI->getOpcode()) {
2094 // SGPR register spill
2095 case AMDGPU::SI_SPILL_S1024_SAVE:
2096 case AMDGPU::SI_SPILL_S512_SAVE:
2097 case AMDGPU::SI_SPILL_S384_SAVE:
2098 case AMDGPU::SI_SPILL_S352_SAVE:
2099 case AMDGPU::SI_SPILL_S320_SAVE:
2100 case AMDGPU::SI_SPILL_S288_SAVE:
2101 case AMDGPU::SI_SPILL_S256_SAVE:
2102 case AMDGPU::SI_SPILL_S224_SAVE:
2103 case AMDGPU::SI_SPILL_S192_SAVE:
2104 case AMDGPU::SI_SPILL_S160_SAVE:
2105 case AMDGPU::SI_SPILL_S128_SAVE:
2106 case AMDGPU::SI_SPILL_S96_SAVE:
2107 case AMDGPU::SI_SPILL_S64_SAVE:
2108 case AMDGPU::SI_SPILL_S32_SAVE: {
2109 return spillSGPR(MI, Index, RS);
2110 }
2111
2112 // SGPR register restore
2113 case AMDGPU::SI_SPILL_S1024_RESTORE:
2114 case AMDGPU::SI_SPILL_S512_RESTORE:
2115 case AMDGPU::SI_SPILL_S384_RESTORE:
2116 case AMDGPU::SI_SPILL_S352_RESTORE:
2117 case AMDGPU::SI_SPILL_S320_RESTORE:
2118 case AMDGPU::SI_SPILL_S288_RESTORE:
2119 case AMDGPU::SI_SPILL_S256_RESTORE:
2120 case AMDGPU::SI_SPILL_S224_RESTORE:
2121 case AMDGPU::SI_SPILL_S192_RESTORE:
2122 case AMDGPU::SI_SPILL_S160_RESTORE:
2123 case AMDGPU::SI_SPILL_S128_RESTORE:
2124 case AMDGPU::SI_SPILL_S96_RESTORE:
2125 case AMDGPU::SI_SPILL_S64_RESTORE:
2126 case AMDGPU::SI_SPILL_S32_RESTORE: {
2127 return restoreSGPR(MI, Index, RS);
2128 }
2129
2130 // VGPR register spill
2131 case AMDGPU::SI_SPILL_V1024_SAVE:
2132 case AMDGPU::SI_SPILL_V512_SAVE:
2133 case AMDGPU::SI_SPILL_V384_SAVE:
2134 case AMDGPU::SI_SPILL_V352_SAVE:
2135 case AMDGPU::SI_SPILL_V320_SAVE:
2136 case AMDGPU::SI_SPILL_V288_SAVE:
2137 case AMDGPU::SI_SPILL_V256_SAVE:
2138 case AMDGPU::SI_SPILL_V224_SAVE:
2139 case AMDGPU::SI_SPILL_V192_SAVE:
2140 case AMDGPU::SI_SPILL_V160_SAVE:
2141 case AMDGPU::SI_SPILL_V128_SAVE:
2142 case AMDGPU::SI_SPILL_V96_SAVE:
2143 case AMDGPU::SI_SPILL_V64_SAVE:
2144 case AMDGPU::SI_SPILL_V32_SAVE:
2145 case AMDGPU::SI_SPILL_A1024_SAVE:
2146 case AMDGPU::SI_SPILL_A512_SAVE:
2147 case AMDGPU::SI_SPILL_A384_SAVE:
2148 case AMDGPU::SI_SPILL_A352_SAVE:
2149 case AMDGPU::SI_SPILL_A320_SAVE:
2150 case AMDGPU::SI_SPILL_A288_SAVE:
2151 case AMDGPU::SI_SPILL_A256_SAVE:
2152 case AMDGPU::SI_SPILL_A224_SAVE:
2153 case AMDGPU::SI_SPILL_A192_SAVE:
2154 case AMDGPU::SI_SPILL_A160_SAVE:
2155 case AMDGPU::SI_SPILL_A128_SAVE:
2156 case AMDGPU::SI_SPILL_A96_SAVE:
2157 case AMDGPU::SI_SPILL_A64_SAVE:
2158 case AMDGPU::SI_SPILL_A32_SAVE:
2159 case AMDGPU::SI_SPILL_AV1024_SAVE:
2160 case AMDGPU::SI_SPILL_AV512_SAVE:
2161 case AMDGPU::SI_SPILL_AV384_SAVE:
2162 case AMDGPU::SI_SPILL_AV352_SAVE:
2163 case AMDGPU::SI_SPILL_AV320_SAVE:
2164 case AMDGPU::SI_SPILL_AV288_SAVE:
2165 case AMDGPU::SI_SPILL_AV256_SAVE:
2166 case AMDGPU::SI_SPILL_AV224_SAVE:
2167 case AMDGPU::SI_SPILL_AV192_SAVE:
2168 case AMDGPU::SI_SPILL_AV160_SAVE:
2169 case AMDGPU::SI_SPILL_AV128_SAVE:
2170 case AMDGPU::SI_SPILL_AV96_SAVE:
2171 case AMDGPU::SI_SPILL_AV64_SAVE:
2172 case AMDGPU::SI_SPILL_AV32_SAVE:
2173 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2174 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2175 const MachineOperand *VData = TII->getNamedOperand(*MI,
2176 AMDGPU::OpName::vdata);
2177 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2178 MFI->getStackPtrOffsetReg());
2179
2180 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2181 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2182 auto *MBB = MI->getParent();
2183 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(Opcode: MI->getOpcode());
2184 if (IsWWMRegSpill) {
2185 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2186 RS->isRegUsed(AMDGPU::SCC));
2187 }
2188 buildSpillLoadStore(
2189 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2190 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2191 *MI->memoperands_begin(), RS);
2192 MFI->addToSpilledVGPRs(num: getNumSubRegsForSpillOp(Op: MI->getOpcode()));
2193 if (IsWWMRegSpill)
2194 TII->restoreExec(MF&: *MF, MBB&: *MBB, MBBI: MI, DL, Reg: MFI->getSGPRForEXECCopy());
2195
2196 MI->eraseFromParent();
2197 return true;
2198 }
2199 case AMDGPU::SI_SPILL_V32_RESTORE:
2200 case AMDGPU::SI_SPILL_V64_RESTORE:
2201 case AMDGPU::SI_SPILL_V96_RESTORE:
2202 case AMDGPU::SI_SPILL_V128_RESTORE:
2203 case AMDGPU::SI_SPILL_V160_RESTORE:
2204 case AMDGPU::SI_SPILL_V192_RESTORE:
2205 case AMDGPU::SI_SPILL_V224_RESTORE:
2206 case AMDGPU::SI_SPILL_V256_RESTORE:
2207 case AMDGPU::SI_SPILL_V288_RESTORE:
2208 case AMDGPU::SI_SPILL_V320_RESTORE:
2209 case AMDGPU::SI_SPILL_V352_RESTORE:
2210 case AMDGPU::SI_SPILL_V384_RESTORE:
2211 case AMDGPU::SI_SPILL_V512_RESTORE:
2212 case AMDGPU::SI_SPILL_V1024_RESTORE:
2213 case AMDGPU::SI_SPILL_A32_RESTORE:
2214 case AMDGPU::SI_SPILL_A64_RESTORE:
2215 case AMDGPU::SI_SPILL_A96_RESTORE:
2216 case AMDGPU::SI_SPILL_A128_RESTORE:
2217 case AMDGPU::SI_SPILL_A160_RESTORE:
2218 case AMDGPU::SI_SPILL_A192_RESTORE:
2219 case AMDGPU::SI_SPILL_A224_RESTORE:
2220 case AMDGPU::SI_SPILL_A256_RESTORE:
2221 case AMDGPU::SI_SPILL_A288_RESTORE:
2222 case AMDGPU::SI_SPILL_A320_RESTORE:
2223 case AMDGPU::SI_SPILL_A352_RESTORE:
2224 case AMDGPU::SI_SPILL_A384_RESTORE:
2225 case AMDGPU::SI_SPILL_A512_RESTORE:
2226 case AMDGPU::SI_SPILL_A1024_RESTORE:
2227 case AMDGPU::SI_SPILL_AV32_RESTORE:
2228 case AMDGPU::SI_SPILL_AV64_RESTORE:
2229 case AMDGPU::SI_SPILL_AV96_RESTORE:
2230 case AMDGPU::SI_SPILL_AV128_RESTORE:
2231 case AMDGPU::SI_SPILL_AV160_RESTORE:
2232 case AMDGPU::SI_SPILL_AV192_RESTORE:
2233 case AMDGPU::SI_SPILL_AV224_RESTORE:
2234 case AMDGPU::SI_SPILL_AV256_RESTORE:
2235 case AMDGPU::SI_SPILL_AV288_RESTORE:
2236 case AMDGPU::SI_SPILL_AV320_RESTORE:
2237 case AMDGPU::SI_SPILL_AV352_RESTORE:
2238 case AMDGPU::SI_SPILL_AV384_RESTORE:
2239 case AMDGPU::SI_SPILL_AV512_RESTORE:
2240 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2241 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2242 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2243 const MachineOperand *VData = TII->getNamedOperand(*MI,
2244 AMDGPU::OpName::vdata);
2245 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2246 MFI->getStackPtrOffsetReg());
2247
2248 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2249 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2250 auto *MBB = MI->getParent();
2251 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(Opcode: MI->getOpcode());
2252 if (IsWWMRegSpill) {
2253 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2254 RS->isRegUsed(AMDGPU::SCC));
2255 }
2256
2257 buildSpillLoadStore(
2258 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2259 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2260 *MI->memoperands_begin(), RS);
2261
2262 if (IsWWMRegSpill)
2263 TII->restoreExec(MF&: *MF, MBB&: *MBB, MBBI: MI, DL, Reg: MFI->getSGPRForEXECCopy());
2264
2265 MI->eraseFromParent();
2266 return true;
2267 }
2268
2269 default: {
2270 // Other access to frame index
2271 const DebugLoc &DL = MI->getDebugLoc();
2272
2273 int64_t Offset = FrameInfo.getObjectOffset(ObjectIdx: Index);
2274 if (ST.enableFlatScratch()) {
2275 if (TII->isFLATScratch(MI: *MI)) {
2276 assert((int16_t)FIOperandNum ==
2277 AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2278 AMDGPU::OpName::saddr));
2279
2280 // The offset is always swizzled, just replace it
2281 if (FrameReg)
2282 FIOp.ChangeToRegister(Reg: FrameReg, isDef: false);
2283
2284 MachineOperand *OffsetOp =
2285 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2286 int64_t NewOffset = Offset + OffsetOp->getImm();
2287 if (TII->isLegalFLATOffset(Offset: NewOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
2288 FlatVariant: SIInstrFlags::FlatScratch)) {
2289 OffsetOp->setImm(NewOffset);
2290 if (FrameReg)
2291 return false;
2292 Offset = 0;
2293 }
2294
2295 if (!Offset) {
2296 unsigned Opc = MI->getOpcode();
2297 int NewOpc = -1;
2298 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2299 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opcode: Opc);
2300 } else if (ST.hasFlatScratchSTMode()) {
2301 // On GFX10 we have ST mode to use no registers for an address.
2302 // Otherwise we need to materialize 0 into an SGPR.
2303 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opcode: Opc);
2304 }
2305
2306 if (NewOpc != -1) {
2307 // removeOperand doesn't fixup tied operand indexes as it goes, so
2308 // it asserts. Untie vdst_in for now and retie them afterwards.
2309 int VDstIn = AMDGPU::getNamedOperandIdx(Opc,
2310 AMDGPU::OpName::vdst_in);
2311 bool TiedVDst = VDstIn != -1 &&
2312 MI->getOperand(i: VDstIn).isReg() &&
2313 MI->getOperand(i: VDstIn).isTied();
2314 if (TiedVDst)
2315 MI->untieRegOperand(OpIdx: VDstIn);
2316
2317 MI->removeOperand(
2318 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2319
2320 if (TiedVDst) {
2321 int NewVDst =
2322 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2323 int NewVDstIn =
2324 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2325 assert (NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2326 MI->tieOperands(DefIdx: NewVDst, UseIdx: NewVDstIn);
2327 }
2328 MI->setDesc(TII->get(NewOpc));
2329 return false;
2330 }
2331 }
2332 }
2333
2334 if (!FrameReg) {
2335 FIOp.ChangeToImmediate(ImmVal: Offset);
2336 if (TII->isImmOperandLegal(MI: *MI, OpNo: FIOperandNum, MO: FIOp))
2337 return false;
2338 }
2339
2340 // We need to use register here. Check if we can use an SGPR or need
2341 // a VGPR.
2342 FIOp.ChangeToRegister(AMDGPU::M0, false);
2343 bool UseSGPR = TII->isOperandLegal(MI: *MI, OpIdx: FIOperandNum, MO: &FIOp);
2344
2345 if (!Offset && FrameReg && UseSGPR) {
2346 FIOp.setReg(FrameReg);
2347 return false;
2348 }
2349
2350 const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
2351 : &AMDGPU::VGPR_32RegClass;
2352
2353 Register TmpReg =
2354 RS->scavengeRegisterBackwards(RC: *RC, To: MI, RestoreAfter: false, SPAdj: 0, AllowSpill: !UseSGPR);
2355 FIOp.setReg(TmpReg);
2356 FIOp.setIsKill();
2357
2358 if ((!FrameReg || !Offset) && TmpReg) {
2359 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2360 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2361 if (FrameReg)
2362 MIB.addReg(FrameReg);
2363 else
2364 MIB.addImm(Offset);
2365
2366 return false;
2367 }
2368
2369 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2370 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2371
2372 Register TmpSReg =
2373 UseSGPR ? TmpReg
2374 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2375 MI, false, 0, !UseSGPR);
2376
2377 // TODO: for flat scratch another attempt can be made with a VGPR index
2378 // if no SGPRs can be scavenged.
2379 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
2380 report_fatal_error(reason: "Cannot scavenge register in FI elimination!");
2381
2382 if (!TmpSReg) {
2383 // Use frame register and restore it after.
2384 TmpSReg = FrameReg;
2385 FIOp.setReg(FrameReg);
2386 FIOp.setIsKill(false);
2387 }
2388
2389 if (NeedSaveSCC) {
2390 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
2391 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
2392 .addReg(FrameReg)
2393 .addImm(Offset);
2394 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
2395 .addReg(TmpSReg)
2396 .addImm(0);
2397 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
2398 .addImm(0)
2399 .addReg(TmpSReg);
2400 } else {
2401 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
2402 .addReg(FrameReg)
2403 .addImm(Offset);
2404 }
2405
2406 if (!UseSGPR)
2407 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2408 .addReg(TmpSReg, RegState::Kill);
2409
2410 if (TmpSReg == FrameReg) {
2411 // Undo frame register modification.
2412 if (NeedSaveSCC &&
2413 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
2414 MachineBasicBlock::iterator I =
2415 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
2416 TmpSReg)
2417 .addReg(FrameReg)
2418 .addImm(-Offset);
2419 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
2420 .addReg(TmpSReg)
2421 .addImm(0);
2422 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
2423 TmpSReg)
2424 .addImm(0)
2425 .addReg(TmpSReg);
2426 } else {
2427 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
2428 FrameReg)
2429 .addReg(FrameReg)
2430 .addImm(-Offset);
2431 }
2432 }
2433
2434 return false;
2435 }
2436
2437 bool IsMUBUF = TII->isMUBUF(MI: *MI);
2438
2439 if (!IsMUBUF && !MFI->isBottomOfStack()) {
2440 // Convert to a swizzled stack address by scaling by the wave size.
2441 // In an entry function/kernel the offset is already swizzled.
2442 bool IsSALU = isSGPRClass(RC: TII->getOpRegClass(MI: *MI, OpNo: FIOperandNum));
2443 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2444 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2445 const TargetRegisterClass *RC = IsSALU && !LiveSCC
2446 ? &AMDGPU::SReg_32RegClass
2447 : &AMDGPU::VGPR_32RegClass;
2448 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
2449 MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
2450 Register ResultReg =
2451 IsCopy ? MI->getOperand(i: 0).getReg()
2452 : RS->scavengeRegisterBackwards(RC: *RC, To: MI, RestoreAfter: false, SPAdj: 0);
2453
2454 int64_t Offset = FrameInfo.getObjectOffset(ObjectIdx: Index);
2455 if (Offset == 0) {
2456 unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
2457 : AMDGPU::V_LSHRREV_B32_e64;
2458 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
2459 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
2460 // For V_LSHRREV, the operands are reversed (the shift count goes
2461 // first).
2462 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
2463 else
2464 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
2465 if (IsSALU && !LiveSCC)
2466 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
2467 if (IsSALU && LiveSCC) {
2468 Register NewDest = RS->scavengeRegisterBackwards(
2469 AMDGPU::SReg_32RegClass, Shift, false, 0);
2470 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
2471 NewDest)
2472 .addReg(ResultReg);
2473 ResultReg = NewDest;
2474 }
2475 } else {
2476 MachineInstrBuilder MIB;
2477 if (!IsSALU) {
2478 if ((MIB = TII->getAddNoCarry(MBB&: *MBB, I: MI, DL, DestReg: ResultReg, RS&: *RS)) !=
2479 nullptr) {
2480 // Reuse ResultReg in intermediate step.
2481 Register ScaledReg = ResultReg;
2482
2483 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
2484 ScaledReg)
2485 .addImm(ST.getWavefrontSizeLog2())
2486 .addReg(FrameReg);
2487
2488 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
2489
2490 // TODO: Fold if use instruction is another add of a constant.
2491 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Literal: Offset, HasInv2Pi: ST.hasInv2PiInlineImm())) {
2492 // FIXME: This can fail
2493 MIB.addImm(Val: Offset);
2494 MIB.addReg(RegNo: ScaledReg, flags: RegState::Kill);
2495 if (!IsVOP2)
2496 MIB.addImm(Val: 0); // clamp bit
2497 } else {
2498 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
2499 "Need to reuse carry out register");
2500
2501 // Use scavenged unused carry out as offset register.
2502 Register ConstOffsetReg;
2503 if (!isWave32)
2504 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
2505 else
2506 ConstOffsetReg = MIB.getReg(Idx: 1);
2507
2508 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
2509 .addImm(Offset);
2510 MIB.addReg(RegNo: ConstOffsetReg, flags: RegState::Kill);
2511 MIB.addReg(RegNo: ScaledReg, flags: RegState::Kill);
2512 MIB.addImm(Val: 0); // clamp bit
2513 }
2514 }
2515 }
2516 if (!MIB || IsSALU) {
2517 // We have to produce a carry out, and there isn't a free SGPR pair
2518 // for it. We can keep the whole computation on the SALU to avoid
2519 // clobbering an additional register at the cost of an extra mov.
2520
2521 // We may have 1 free scratch SGPR even though a carry out is
2522 // unavailable. Only one additional mov is needed.
2523 Register TmpScaledReg = RS->scavengeRegisterBackwards(
2524 AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
2525 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
2526
2527 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
2528 .addReg(FrameReg)
2529 .addImm(ST.getWavefrontSizeLog2());
2530 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
2531 .addReg(ScaledReg, RegState::Kill)
2532 .addImm(Offset);
2533 if (!IsSALU)
2534 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
2535 .addReg(ScaledReg, RegState::Kill);
2536 else
2537 ResultReg = ScaledReg;
2538
2539 // If there were truly no free SGPRs, we need to undo everything.
2540 if (!TmpScaledReg.isValid()) {
2541 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
2542 .addReg(ScaledReg, RegState::Kill)
2543 .addImm(-Offset);
2544 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
2545 .addReg(FrameReg)
2546 .addImm(ST.getWavefrontSizeLog2());
2547 }
2548 }
2549 }
2550
2551 // Don't introduce an extra copy if we're just materializing in a mov.
2552 if (IsCopy) {
2553 MI->eraseFromParent();
2554 return true;
2555 }
2556 FIOp.ChangeToRegister(Reg: ResultReg, isDef: false, isImp: false, isKill: true);
2557 return false;
2558 }
2559
2560 if (IsMUBUF) {
2561 // Disable offen so we don't need a 0 vgpr base.
2562 assert(static_cast<int>(FIOperandNum) ==
2563 AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2564 AMDGPU::OpName::vaddr));
2565
2566 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
2567 assert((SOffset.isImm() && SOffset.getImm() == 0));
2568
2569 if (FrameReg != AMDGPU::NoRegister)
2570 SOffset.ChangeToRegister(FrameReg, false);
2571
2572 int64_t Offset = FrameInfo.getObjectOffset(ObjectIdx: Index);
2573 int64_t OldImm
2574 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
2575 int64_t NewOffset = OldImm + Offset;
2576
2577 if (TII->isLegalMUBUFImmOffset(Imm: NewOffset) &&
2578 buildMUBUFOffsetLoadStore(ST, MFI&: FrameInfo, MI, Index, Offset: NewOffset)) {
2579 MI->eraseFromParent();
2580 return true;
2581 }
2582 }
2583
2584 // If the offset is simply too big, don't convert to a scratch wave offset
2585 // relative index.
2586
2587 FIOp.ChangeToImmediate(ImmVal: Offset);
2588 if (!TII->isImmOperandLegal(MI: *MI, OpNo: FIOperandNum, MO: FIOp)) {
2589 Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
2590 MI, false, 0);
2591 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2592 .addImm(Offset);
2593 FIOp.ChangeToRegister(Reg: TmpReg, isDef: false, isImp: false, isKill: true);
2594 }
2595 }
2596 }
2597 return false;
2598}
2599
2600StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
2601 return AMDGPUInstPrinter::getRegisterName(Reg);
2602}
2603
2604unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) {
2605 return getRegBitWidth(RCID: RC.getID());
2606}
2607
2608static const TargetRegisterClass *
2609getAnyVGPRClassForBitWidth(unsigned BitWidth) {
2610 if (BitWidth == 64)
2611 return &AMDGPU::VReg_64RegClass;
2612 if (BitWidth == 96)
2613 return &AMDGPU::VReg_96RegClass;
2614 if (BitWidth == 128)
2615 return &AMDGPU::VReg_128RegClass;
2616 if (BitWidth == 160)
2617 return &AMDGPU::VReg_160RegClass;
2618 if (BitWidth == 192)
2619 return &AMDGPU::VReg_192RegClass;
2620 if (BitWidth == 224)
2621 return &AMDGPU::VReg_224RegClass;
2622 if (BitWidth == 256)
2623 return &AMDGPU::VReg_256RegClass;
2624 if (BitWidth == 288)
2625 return &AMDGPU::VReg_288RegClass;
2626 if (BitWidth == 320)
2627 return &AMDGPU::VReg_320RegClass;
2628 if (BitWidth == 352)
2629 return &AMDGPU::VReg_352RegClass;
2630 if (BitWidth == 384)
2631 return &AMDGPU::VReg_384RegClass;
2632 if (BitWidth == 512)
2633 return &AMDGPU::VReg_512RegClass;
2634 if (BitWidth == 1024)
2635 return &AMDGPU::VReg_1024RegClass;
2636
2637 return nullptr;
2638}
2639
2640static const TargetRegisterClass *
2641getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
2642 if (BitWidth == 64)
2643 return &AMDGPU::VReg_64_Align2RegClass;
2644 if (BitWidth == 96)
2645 return &AMDGPU::VReg_96_Align2RegClass;
2646 if (BitWidth == 128)
2647 return &AMDGPU::VReg_128_Align2RegClass;
2648 if (BitWidth == 160)
2649 return &AMDGPU::VReg_160_Align2RegClass;
2650 if (BitWidth == 192)
2651 return &AMDGPU::VReg_192_Align2RegClass;
2652 if (BitWidth == 224)
2653 return &AMDGPU::VReg_224_Align2RegClass;
2654 if (BitWidth == 256)
2655 return &AMDGPU::VReg_256_Align2RegClass;
2656 if (BitWidth == 288)
2657 return &AMDGPU::VReg_288_Align2RegClass;
2658 if (BitWidth == 320)
2659 return &AMDGPU::VReg_320_Align2RegClass;
2660 if (BitWidth == 352)
2661 return &AMDGPU::VReg_352_Align2RegClass;
2662 if (BitWidth == 384)
2663 return &AMDGPU::VReg_384_Align2RegClass;
2664 if (BitWidth == 512)
2665 return &AMDGPU::VReg_512_Align2RegClass;
2666 if (BitWidth == 1024)
2667 return &AMDGPU::VReg_1024_Align2RegClass;
2668
2669 return nullptr;
2670}
2671
2672const TargetRegisterClass *
2673SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
2674 if (BitWidth == 1)
2675 return &AMDGPU::VReg_1RegClass;
2676 if (BitWidth == 16)
2677 return &AMDGPU::VGPR_16RegClass;
2678 if (BitWidth == 32)
2679 return &AMDGPU::VGPR_32RegClass;
2680 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
2681 : getAnyVGPRClassForBitWidth(BitWidth);
2682}
2683
2684static const TargetRegisterClass *
2685getAnyAGPRClassForBitWidth(unsigned BitWidth) {
2686 if (BitWidth == 64)
2687 return &AMDGPU::AReg_64RegClass;
2688 if (BitWidth == 96)
2689 return &AMDGPU::AReg_96RegClass;
2690 if (BitWidth == 128)
2691 return &AMDGPU::AReg_128RegClass;
2692 if (BitWidth == 160)
2693 return &AMDGPU::AReg_160RegClass;
2694 if (BitWidth == 192)
2695 return &AMDGPU::AReg_192RegClass;
2696 if (BitWidth == 224)
2697 return &AMDGPU::AReg_224RegClass;
2698 if (BitWidth == 256)
2699 return &AMDGPU::AReg_256RegClass;
2700 if (BitWidth == 288)
2701 return &AMDGPU::AReg_288RegClass;
2702 if (BitWidth == 320)
2703 return &AMDGPU::AReg_320RegClass;
2704 if (BitWidth == 352)
2705 return &AMDGPU::AReg_352RegClass;
2706 if (BitWidth == 384)
2707 return &AMDGPU::AReg_384RegClass;
2708 if (BitWidth == 512)
2709 return &AMDGPU::AReg_512RegClass;
2710 if (BitWidth == 1024)
2711 return &AMDGPU::AReg_1024RegClass;
2712
2713 return nullptr;
2714}
2715
2716static const TargetRegisterClass *
2717getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
2718 if (BitWidth == 64)
2719 return &AMDGPU::AReg_64_Align2RegClass;
2720 if (BitWidth == 96)
2721 return &AMDGPU::AReg_96_Align2RegClass;
2722 if (BitWidth == 128)
2723 return &AMDGPU::AReg_128_Align2RegClass;
2724 if (BitWidth == 160)
2725 return &AMDGPU::AReg_160_Align2RegClass;
2726 if (BitWidth == 192)
2727 return &AMDGPU::AReg_192_Align2RegClass;
2728 if (BitWidth == 224)
2729 return &AMDGPU::AReg_224_Align2RegClass;
2730 if (BitWidth == 256)
2731 return &AMDGPU::AReg_256_Align2RegClass;
2732 if (BitWidth == 288)
2733 return &AMDGPU::AReg_288_Align2RegClass;
2734 if (BitWidth == 320)
2735 return &AMDGPU::AReg_320_Align2RegClass;
2736 if (BitWidth == 352)
2737 return &AMDGPU::AReg_352_Align2RegClass;
2738 if (BitWidth == 384)
2739 return &AMDGPU::AReg_384_Align2RegClass;
2740 if (BitWidth == 512)
2741 return &AMDGPU::AReg_512_Align2RegClass;
2742 if (BitWidth == 1024)
2743 return &AMDGPU::AReg_1024_Align2RegClass;
2744
2745 return nullptr;
2746}
2747
2748const TargetRegisterClass *
2749SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
2750 if (BitWidth == 16)
2751 return &AMDGPU::AGPR_LO16RegClass;
2752 if (BitWidth == 32)
2753 return &AMDGPU::AGPR_32RegClass;
2754 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
2755 : getAnyAGPRClassForBitWidth(BitWidth);
2756}
2757
2758static const TargetRegisterClass *
2759getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
2760 if (BitWidth == 64)
2761 return &AMDGPU::AV_64RegClass;
2762 if (BitWidth == 96)
2763 return &AMDGPU::AV_96RegClass;
2764 if (BitWidth == 128)
2765 return &AMDGPU::AV_128RegClass;
2766 if (BitWidth == 160)
2767 return &AMDGPU::AV_160RegClass;
2768 if (BitWidth == 192)
2769 return &AMDGPU::AV_192RegClass;
2770 if (BitWidth == 224)
2771 return &AMDGPU::AV_224RegClass;
2772 if (BitWidth == 256)
2773 return &AMDGPU::AV_256RegClass;
2774 if (BitWidth == 288)
2775 return &AMDGPU::AV_288RegClass;
2776 if (BitWidth == 320)
2777 return &AMDGPU::AV_320RegClass;
2778 if (BitWidth == 352)
2779 return &AMDGPU::AV_352RegClass;
2780 if (BitWidth == 384)
2781 return &AMDGPU::AV_384RegClass;
2782 if (BitWidth == 512)
2783 return &AMDGPU::AV_512RegClass;
2784 if (BitWidth == 1024)
2785 return &AMDGPU::AV_1024RegClass;
2786
2787 return nullptr;
2788}
2789
2790static const TargetRegisterClass *
2791getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
2792 if (BitWidth == 64)
2793 return &AMDGPU::AV_64_Align2RegClass;
2794 if (BitWidth == 96)
2795 return &AMDGPU::AV_96_Align2RegClass;
2796 if (BitWidth == 128)
2797 return &AMDGPU::AV_128_Align2RegClass;
2798 if (BitWidth == 160)
2799 return &AMDGPU::AV_160_Align2RegClass;
2800 if (BitWidth == 192)
2801 return &AMDGPU::AV_192_Align2RegClass;
2802 if (BitWidth == 224)
2803 return &AMDGPU::AV_224_Align2RegClass;
2804 if (BitWidth == 256)
2805 return &AMDGPU::AV_256_Align2RegClass;
2806 if (BitWidth == 288)
2807 return &AMDGPU::AV_288_Align2RegClass;
2808 if (BitWidth == 320)
2809 return &AMDGPU::AV_320_Align2RegClass;
2810 if (BitWidth == 352)
2811 return &AMDGPU::AV_352_Align2RegClass;
2812 if (BitWidth == 384)
2813 return &AMDGPU::AV_384_Align2RegClass;
2814 if (BitWidth == 512)
2815 return &AMDGPU::AV_512_Align2RegClass;
2816 if (BitWidth == 1024)
2817 return &AMDGPU::AV_1024_Align2RegClass;
2818
2819 return nullptr;
2820}
2821
2822const TargetRegisterClass *
2823SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
2824 if (BitWidth == 32)
2825 return &AMDGPU::AV_32RegClass;
2826 return ST.needsAlignedVGPRs()
2827 ? getAlignedVectorSuperClassForBitWidth(BitWidth)
2828 : getAnyVectorSuperClassForBitWidth(BitWidth);
2829}
2830
2831const TargetRegisterClass *
2832SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
2833 if (BitWidth == 16)
2834 return &AMDGPU::SGPR_LO16RegClass;
2835 if (BitWidth == 32)
2836 return &AMDGPU::SReg_32RegClass;
2837 if (BitWidth == 64)
2838 return &AMDGPU::SReg_64RegClass;
2839 if (BitWidth == 96)
2840 return &AMDGPU::SGPR_96RegClass;
2841 if (BitWidth == 128)
2842 return &AMDGPU::SGPR_128RegClass;
2843 if (BitWidth == 160)
2844 return &AMDGPU::SGPR_160RegClass;
2845 if (BitWidth == 192)
2846 return &AMDGPU::SGPR_192RegClass;
2847 if (BitWidth == 224)
2848 return &AMDGPU::SGPR_224RegClass;
2849 if (BitWidth == 256)
2850 return &AMDGPU::SGPR_256RegClass;
2851 if (BitWidth == 288)
2852 return &AMDGPU::SGPR_288RegClass;
2853 if (BitWidth == 320)
2854 return &AMDGPU::SGPR_320RegClass;
2855 if (BitWidth == 352)
2856 return &AMDGPU::SGPR_352RegClass;
2857 if (BitWidth == 384)
2858 return &AMDGPU::SGPR_384RegClass;
2859 if (BitWidth == 512)
2860 return &AMDGPU::SGPR_512RegClass;
2861 if (BitWidth == 1024)
2862 return &AMDGPU::SGPR_1024RegClass;
2863
2864 return nullptr;
2865}
2866
2867bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
2868 Register Reg) const {
2869 const TargetRegisterClass *RC;
2870 if (Reg.isVirtual())
2871 RC = MRI.getRegClass(Reg);
2872 else
2873 RC = getPhysRegBaseClass(Reg);
2874 return RC ? isSGPRClass(RC) : false;
2875}
2876
2877const TargetRegisterClass *
2878SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
2879 unsigned Size = getRegSizeInBits(*SRC);
2880 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(BitWidth: Size);
2881 assert(VRC && "Invalid register class size");
2882 return VRC;
2883}
2884
2885const TargetRegisterClass *
2886SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
2887 unsigned Size = getRegSizeInBits(*SRC);
2888 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(BitWidth: Size);
2889 assert(ARC && "Invalid register class size");
2890 return ARC;
2891}
2892
2893const TargetRegisterClass *
2894SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
2895 unsigned Size = getRegSizeInBits(*VRC);
2896 if (Size == 32)
2897 return &AMDGPU::SGPR_32RegClass;
2898 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(BitWidth: Size);
2899 assert(SRC && "Invalid register class size");
2900 return SRC;
2901}
2902
2903const TargetRegisterClass *
2904SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
2905 const TargetRegisterClass *SubRC,
2906 unsigned SubIdx) const {
2907 // Ensure this subregister index is aligned in the super register.
2908 const TargetRegisterClass *MatchRC =
2909 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
2910 return MatchRC && MatchRC->hasSubClassEq(RC: SuperRC) ? MatchRC : nullptr;
2911}
2912
2913bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
2914 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
2915 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
2916 return !ST.hasMFMAInlineLiteralBug();
2917
2918 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
2919 OpType <= AMDGPU::OPERAND_SRC_LAST;
2920}
2921
2922bool SIRegisterInfo::shouldRewriteCopySrc(
2923 const TargetRegisterClass *DefRC,
2924 unsigned DefSubReg,
2925 const TargetRegisterClass *SrcRC,
2926 unsigned SrcSubReg) const {
2927 // We want to prefer the smallest register class possible, so we don't want to
2928 // stop and rewrite on anything that looks like a subregister
2929 // extract. Operations mostly don't care about the super register class, so we
2930 // only want to stop on the most basic of copies between the same register
2931 // class.
2932 //
2933 // e.g. if we have something like
2934 // %0 = ...
2935 // %1 = ...
2936 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
2937 // %3 = COPY %2, sub0
2938 //
2939 // We want to look through the COPY to find:
2940 // => %3 = COPY %0
2941
2942 // Plain copy.
2943 return getCommonSubClass(DefRC, SrcRC) != nullptr;
2944}
2945
2946bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
2947 // TODO: 64-bit operands have extending behavior from 32-bit literal.
2948 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
2949 OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
2950}
2951
2952/// Returns a lowest register that is not used at any point in the function.
2953/// If all registers are used, then this function will return
2954/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
2955/// highest unused register.
2956MCRegister SIRegisterInfo::findUnusedRegister(
2957 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
2958 const MachineFunction &MF, bool ReserveHighestRegister) const {
2959 if (ReserveHighestRegister) {
2960 for (MCRegister Reg : reverse(C: *RC))
2961 if (MRI.isAllocatable(PhysReg: Reg) && !MRI.isPhysRegUsed(PhysReg: Reg))
2962 return Reg;
2963 } else {
2964 for (MCRegister Reg : *RC)
2965 if (MRI.isAllocatable(PhysReg: Reg) && !MRI.isPhysRegUsed(PhysReg: Reg))
2966 return Reg;
2967 }
2968 return MCRegister();
2969}
2970
2971bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
2972 const RegisterBankInfo &RBI,
2973 Register Reg) const {
2974 auto *RB = RBI.getRegBank(Reg, MRI, TRI: *MRI.getTargetRegisterInfo());
2975 if (!RB)
2976 return false;
2977
2978 return !RBI.isDivergentRegBank(RB);
2979}
2980
2981ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
2982 unsigned EltSize) const {
2983 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(RC: *RC);
2984 assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
2985
2986 const unsigned RegDWORDs = RegBitWidth / 32;
2987 const unsigned EltDWORDs = EltSize / 4;
2988 assert(RegSplitParts.size() + 1 >= EltDWORDs);
2989
2990 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
2991 const unsigned NumParts = RegDWORDs / EltDWORDs;
2992
2993 return ArrayRef(Parts.data(), NumParts);
2994}
2995
2996const TargetRegisterClass*
2997SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
2998 Register Reg) const {
2999 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3000}
3001
3002const TargetRegisterClass *
3003SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3004 const MachineOperand &MO) const {
3005 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, Reg: MO.getReg());
3006 return getSubRegisterClass(SrcRC, MO.getSubReg());
3007}
3008
3009bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3010 Register Reg) const {
3011 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3012 // Registers without classes are unaddressable, SGPR-like registers.
3013 return RC && isVGPRClass(RC);
3014}
3015
3016bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3017 Register Reg) const {
3018 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3019
3020 // Registers without classes are unaddressable, SGPR-like registers.
3021 return RC && isAGPRClass(RC);
3022}
3023
3024bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
3025 const TargetRegisterClass *SrcRC,
3026 unsigned SubReg,
3027 const TargetRegisterClass *DstRC,
3028 unsigned DstSubReg,
3029 const TargetRegisterClass *NewRC,
3030 LiveIntervals &LIS) const {
3031 unsigned SrcSize = getRegSizeInBits(*SrcRC);
3032 unsigned DstSize = getRegSizeInBits(*DstRC);
3033 unsigned NewSize = getRegSizeInBits(*NewRC);
3034
3035 // Do not increase size of registers beyond dword, we would need to allocate
3036 // adjacent registers and constraint regalloc more than needed.
3037
3038 // Always allow dword coalescing.
3039 if (SrcSize <= 32 || DstSize <= 32)
3040 return true;
3041
3042 return NewSize <= DstSize || NewSize <= SrcSize;
3043}
3044
3045unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3046 MachineFunction &MF) const {
3047 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3048
3049 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
3050 MF.getFunction());
3051 switch (RC->getID()) {
3052 default:
3053 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3054 case AMDGPU::VGPR_32RegClassID:
3055 return std::min(a: ST.getMaxNumVGPRs(WavesPerEU: Occupancy), b: ST.getMaxNumVGPRs(MF));
3056 case AMDGPU::SGPR_32RegClassID:
3057 case AMDGPU::SGPR_LO16RegClassID:
3058 return std::min(a: ST.getMaxNumSGPRs(WavesPerEU: Occupancy, Addressable: true), b: ST.getMaxNumSGPRs(MF));
3059 }
3060}
3061
3062unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3063 unsigned Idx) const {
3064 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
3065 Idx == AMDGPU::RegisterPressureSets::AGPR_32)
3066 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3067 const_cast<MachineFunction &>(MF));
3068
3069 if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
3070 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3071 const_cast<MachineFunction &>(MF));
3072
3073 llvm_unreachable("Unexpected register pressure set!");
3074}
3075
3076const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
3077 static const int Empty[] = { -1 };
3078
3079 if (RegPressureIgnoredUnits[RegUnit])
3080 return Empty;
3081
3082 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3083}
3084
3085MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
3086 // Not a callee saved register.
3087 return AMDGPU::SGPR30_SGPR31;
3088}
3089
3090const TargetRegisterClass *
3091SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
3092 const RegisterBank &RB) const {
3093 switch (RB.getID()) {
3094 case AMDGPU::VGPRRegBankID:
3095 return getVGPRClassForBitWidth(
3096 BitWidth: std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3097 case AMDGPU::VCCRegBankID:
3098 assert(Size == 1);
3099 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
3100 : &AMDGPU::SReg_64_XEXECRegClass;
3101 case AMDGPU::SGPRRegBankID:
3102 return getSGPRClassForBitWidth(BitWidth: std::max(a: 32u, b: Size));
3103 case AMDGPU::AGPRRegBankID:
3104 return getAGPRClassForBitWidth(BitWidth: std::max(a: 32u, b: Size));
3105 default:
3106 llvm_unreachable("unknown register bank");
3107 }
3108}
3109
3110const TargetRegisterClass *
3111SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
3112 const MachineRegisterInfo &MRI) const {
3113 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(Reg: MO.getReg());
3114 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
3115 return getRegClassForTypeOnBank(Ty: MRI.getType(Reg: MO.getReg()), Bank: *RB);
3116
3117 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
3118 return getAllocatableClass(RC);
3119
3120 return nullptr;
3121}
3122
3123MCRegister SIRegisterInfo::getVCC() const {
3124 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3125}
3126
3127MCRegister SIRegisterInfo::getExec() const {
3128 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3129}
3130
3131const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
3132 // VGPR tuples have an alignment requirement on gfx90a variants.
3133 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3134 : &AMDGPU::VReg_64RegClass;
3135}
3136
3137const TargetRegisterClass *
3138SIRegisterInfo::getRegClass(unsigned RCID) const {
3139 switch ((int)RCID) {
3140 case AMDGPU::SReg_1RegClassID:
3141 return getBoolRC();
3142 case AMDGPU::SReg_1_XEXECRegClassID:
3143 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
3144 : &AMDGPU::SReg_64_XEXECRegClass;
3145 case -1:
3146 return nullptr;
3147 default:
3148 return AMDGPUGenRegisterInfo::getRegClass(RCID);
3149 }
3150}
3151
3152// Find reaching register definition
3153MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
3154 MachineInstr &Use,
3155 MachineRegisterInfo &MRI,
3156 LiveIntervals *LIS) const {
3157 auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
3158 SlotIndex UseIdx = LIS->getInstructionIndex(Instr: Use);
3159 SlotIndex DefIdx;
3160
3161 if (Reg.isVirtual()) {
3162 if (!LIS->hasInterval(Reg))
3163 return nullptr;
3164 LiveInterval &LI = LIS->getInterval(Reg);
3165 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3166 : MRI.getMaxLaneMaskForVReg(Reg);
3167 VNInfo *V = nullptr;
3168 if (LI.hasSubRanges()) {
3169 for (auto &S : LI.subranges()) {
3170 if ((S.LaneMask & SubLanes) == SubLanes) {
3171 V = S.getVNInfoAt(Idx: UseIdx);
3172 break;
3173 }
3174 }
3175 } else {
3176 V = LI.getVNInfoAt(Idx: UseIdx);
3177 }
3178 if (!V)
3179 return nullptr;
3180 DefIdx = V->def;
3181 } else {
3182 // Find last def.
3183 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3184 LiveRange &LR = LIS->getRegUnit(Unit);
3185 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3186 if (!DefIdx.isValid() ||
3187 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3188 LIS->getInstructionFromIndex(V->def)))
3189 DefIdx = V->def;
3190 } else {
3191 return nullptr;
3192 }
3193 }
3194 }
3195
3196 MachineInstr *Def = LIS->getInstructionFromIndex(index: DefIdx);
3197
3198 if (!Def || !MDT.dominates(A: Def, B: &Use))
3199 return nullptr;
3200
3201 assert(Def->modifiesRegister(Reg, this));
3202
3203 return Def;
3204}
3205
3206MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
3207 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
3208
3209 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
3210 AMDGPU::SReg_32RegClass,
3211 AMDGPU::AGPR_32RegClass } ) {
3212 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
3213 return Super;
3214 }
3215 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3216 &AMDGPU::VGPR_32RegClass)) {
3217 return Super;
3218 }
3219
3220 return AMDGPU::NoRegister;
3221}
3222
3223bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
3224 if (!ST.needsAlignedVGPRs())
3225 return true;
3226
3227 if (isVGPRClass(RC: &RC))
3228 return RC.hasSuperClassEq(RC: getVGPRClassForBitWidth(BitWidth: getRegSizeInBits(RC)));
3229 if (isAGPRClass(RC: &RC))
3230 return RC.hasSuperClassEq(RC: getAGPRClassForBitWidth(BitWidth: getRegSizeInBits(RC)));
3231 if (isVectorSuperClass(RC: &RC))
3232 return RC.hasSuperClassEq(
3233 RC: getVectorSuperClassForBitWidth(BitWidth: getRegSizeInBits(RC)));
3234
3235 return true;
3236}
3237
3238const TargetRegisterClass *
3239SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
3240 if (!RC || !ST.needsAlignedVGPRs())
3241 return RC;
3242
3243 unsigned Size = getRegSizeInBits(*RC);
3244 if (Size <= 32)
3245 return RC;
3246
3247 if (isVGPRClass(RC))
3248 return getAlignedVGPRClassForBitWidth(BitWidth: Size);
3249 if (isAGPRClass(RC))
3250 return getAlignedAGPRClassForBitWidth(BitWidth: Size);
3251 if (isVectorSuperClass(RC))
3252 return getAlignedVectorSuperClassForBitWidth(BitWidth: Size);
3253
3254 return RC;
3255}
3256
3257ArrayRef<MCPhysReg>
3258SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
3259 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
3260}
3261
3262ArrayRef<MCPhysReg>
3263SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
3264 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
3265}
3266
3267ArrayRef<MCPhysReg>
3268SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
3269 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
3270}
3271
3272unsigned
3273SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
3274 unsigned SubReg) const {
3275 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
3276 case SIRCFlags::HasSGPR:
3277 return std::min(128u, getSubRegIdxSize(SubReg));
3278 case SIRCFlags::HasAGPR:
3279 case SIRCFlags::HasVGPR:
3280 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
3281 return std::min(32u, getSubRegIdxSize(SubReg));
3282 default:
3283 break;
3284 }
3285 return 0;
3286}
3287

source code of llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp