GCNNSAReassign.cpp source code [llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp]

1	//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
11	/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
12	/// with sequential versions where possible.
13	///
14	//===----------------------------------------------------------------------===//
15
16	#include "AMDGPU.h"
17	#include "GCNSubtarget.h"
18	#include "SIMachineFunctionInfo.h"
19	#include "SIRegisterInfo.h"
20	#include "llvm/ADT/Statistic.h"
21	#include "llvm/CodeGen/LiveIntervals.h"
22	#include "llvm/CodeGen/LiveRegMatrix.h"
23	#include "llvm/CodeGen/MachineFunctionPass.h"
24	#include "llvm/CodeGen/VirtRegMap.h"
25	#include "llvm/InitializePasses.h"
26
27	using namespace llvm;
28
29	#define DEBUG_TYPE "amdgpu-nsa-reassign"
30
31	STATISTIC(NumNSAInstructions,
32	"Number of NSA instructions with non-sequential address found");
33	STATISTIC(NumNSAConverted,
34	"Number of NSA instructions changed to sequential");
35
36	namespace {
37
38	class GCNNSAReassign : public MachineFunctionPass {
39	public:
40	static char ID;
41
42	GCNNSAReassign() : MachineFunctionPass (ID) {
43	initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
44	}
45
46	bool runOnMachineFunction(MachineFunction &MF) override;
47
48	StringRef getPassName() const override { return "GCN NSA Reassign"; }
49
50	void getAnalysisUsage(AnalysisUsage &AU) const override {
51	AU.addRequired<LiveIntervals>();
52	AU.addRequired<VirtRegMap>();
53	AU.addRequired<LiveRegMatrix>();
54	AU.setPreservesAll();
55	MachineFunctionPass::getAnalysisUsage(AU);
56	}
57
58	private:
59	typedef enum {
60	NOT_NSA, // Not an NSA instruction
61	FIXED, // NSA which we cannot modify
62	NON_CONTIGUOUS, // NSA with non-sequential address which we can try
63	// to optimize.
64	CONTIGUOUS // NSA with all sequential address registers
65	} NSA_Status;
66
67	const GCNSubtarget *ST;
68
69	const MachineRegisterInfo *MRI;
70
71	const SIRegisterInfo *TRI;
72
73	VirtRegMap *VRM;
74
75	LiveRegMatrix *LRM;
76
77	LiveIntervals *LIS;
78
79	unsigned MaxNumVGPRs;
80
81	const MCPhysReg *CSRegs;
82
83	NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
84
85	bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
86	unsigned StartReg) const;
87
88	bool canAssign(unsigned StartReg, unsigned NumRegs) const;
89
90	bool scavengeRegs(SmallVectorImpl<LiveInterval > &Intervals) const*;
91	};
92
93	} // End anonymous namespace.
94
95	INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
96	false, false)
97	INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
98	INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
99	INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
100	INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
101	false, false)
102
103
104	char GCNNSAReassign::ID = `0`;
105
106	char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
107
108	bool
109	GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
110	unsigned StartReg) const {
111	unsigned NumRegs = Intervals.size();
112
113	for (unsigned N = `0`; N < NumRegs; ++N)
114	if (VRM->hasPhys(virtReg: Intervals [N]->reg()))
115	LRM->unassign(VirtReg: *Intervals [N]);
116
117	for (unsigned N = `0`; N < NumRegs; ++N)
118	if (LRM->checkInterference(VirtReg: *Intervals [N], PhysReg: MCRegister::from(Val: StartReg + N)))
119	return false;
120
121	for (unsigned N = `0`; N < NumRegs; ++N)
122	LRM->assign(VirtReg: *Intervals [N], PhysReg: MCRegister::from(Val: StartReg + N));
123
124	return true;
125	}
126
127	bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
128	for (unsigned N = `0`; N < NumRegs; ++N) {
129	unsigned Reg = StartReg + N;
130	if (!MRI->isAllocatable(PhysReg: Reg))
131	return false;
132
133	for (unsigned I = `0`; CSRegs[I]; ++I)
134	if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
135	!LRM->isPhysRegUsed(PhysReg: CSRegs[I]))
136	return false;
137	}
138
139	return true;
140	}
141
142	bool
143	GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval > &Intervals) const* {
144	unsigned NumRegs = Intervals.size();
145
146	if (NumRegs > MaxNumVGPRs)
147	return false;
148	unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
149
150	for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
151	if (!canAssign(StartReg: Reg, NumRegs))
152	continue;
153
154	if (tryAssignRegisters(Intervals, StartReg: Reg))
155	return true;
156	}
157
158	return false;
159	}
160
161	GCNNSAReassign::NSA_Status
162	GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
163	const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI.getOpcode());
164	if (!Info)
165	return NSA_Status::NOT_NSA;
166
167	switch (Info->MIMGEncoding) {
168	case AMDGPU::MIMGEncGfx10NSA:
169	case AMDGPU::MIMGEncGfx11NSA:
170	break;
171	default:
172	return NSA_Status::NOT_NSA;
173	}
174
175	int VAddr0Idx =
176	AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
177
178	unsigned VgprBase = `0`;
179	bool NSA = false;
180	for (unsigned I = `0`; I < Info->VAddrOperands; ++I) {
181	const MachineOperand &Op = MI.getOperand(i: VAddr0Idx + I);
182	Register Reg = Op.getReg();
183	if (Reg.isPhysical() \|\| !VRM->isAssignedReg(virtReg: Reg))
184	return NSA_Status::FIXED;
185
186	Register PhysReg = VRM->getPhys(virtReg: Reg);
187
188	if (!Fast) {
189	if (!PhysReg)
190	return NSA_Status::FIXED;
191
192	// TODO: address the below limitation to handle GFX11 BVH instructions
193	// Bail if address is not a VGPR32. That should be possible to extend the
194	// optimization to work with subregs of a wider register tuples, but the
195	// logic to find free registers will be much more complicated with much
196	// less chances for success. That seems reasonable to assume that in most
197	// cases a tuple is used because a vector variable contains different
198	// parts of an address and it is either already consecutive or cannot
199	// be reassigned if not. If needed it is better to rely on register
200	// coalescer to process such address tuples.
201	if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != `32` \|\| Op.getSubReg())
202	return NSA_Status::FIXED;
203
204	// InlineSpiller does not call LRM::assign() after an LI split leaving
205	// it in an inconsistent state, so we cannot call LRM::unassign().
206	// See llvm bug #48911.
207	// Skip reassign if a register has originated from such split.
208	// FIXME: Remove the workaround when bug #48911 is fixed.
209	if (VRM->getPreSplitReg(virtReg: Reg))
210	return NSA_Status::FIXED;
211
212	const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
213
214	if (Def && Def->isCopy() && Def->getOperand(i: `1`).getReg() == PhysReg)
215	return NSA_Status::FIXED;
216
217	for (auto U : MRI->use_nodbg_operands(Reg)) {
218	if (U.isImplicit())
219	return NSA_Status::FIXED;
220	const MachineInstr *UseInst = U.getParent();
221	if (UseInst->isCopy() && UseInst->getOperand(i: `0`).getReg() == PhysReg)
222	return NSA_Status::FIXED;
223	}
224
225	if (!LIS->hasInterval(Reg))
226	return NSA_Status::FIXED;
227	}
228
229	if (I == `0`)
230	VgprBase = PhysReg;
231	else if (VgprBase + I != PhysReg)
232	NSA = true;
233	}
234
235	return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
236	}
237
238	bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
239	ST = &MF.getSubtarget<GCNSubtarget>();
240	if (!ST->hasNSAEncoding() \|\| !ST->hasNonNSAEncoding())
241	return false;
242
243	MRI = &MF.getRegInfo();
244	TRI = ST->getRegisterInfo();
245	VRM = &getAnalysis<VirtRegMap>();
246	LRM = &getAnalysis<LiveRegMatrix>();
247	LIS = &getAnalysis<LiveIntervals>();
248
249	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
250	MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
251	MaxNumVGPRs = std::min(a: ST->getMaxNumVGPRs(WavesPerEU: MFI->getOccupancy()), b: MaxNumVGPRs);
252	CSRegs = MRI->getCalleeSavedRegs();
253
254	using Candidate = std::pair<const MachineInstr, bool*>;
255	SmallVector<Candidate, `32`> Candidates;
256	for (const MachineBasicBlock &MBB : MF) {
257	for (const MachineInstr &MI : MBB) {
258	switch (CheckNSA(MI)) {
259	default:
260	continue;
261	case NSA_Status::CONTIGUOUS:
262	Candidates.push_back(Elt: std::pair(&MI, true));
263	break;
264	case NSA_Status::NON_CONTIGUOUS:
265	Candidates.push_back(Elt: std::pair(&MI, false));
266	++NumNSAInstructions;
267	break;
268	}
269	}
270	}
271
272	bool Changed = false;
273	for (auto &C : Candidates) {
274	if (C.second)
275	continue;
276
277	const MachineInstr *MI = C.first;
278	if (CheckNSA(MI: MI, Fast: true*) == NSA_Status::CONTIGUOUS) {
279	// Already happen to be fixed.
280	C.second = true;
281	++NumNSAConverted;
282	continue;
283	}
284
285	const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: MI->getOpcode());
286	int VAddr0Idx =
287	AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
288
289	SmallVector<LiveInterval *, `16`> Intervals;
290	SmallVector<MCRegister, `16`> OrigRegs;
291	SlotIndex MinInd, MaxInd;
292	for (unsigned I = `0`; I < Info->VAddrOperands; ++I) {
293	const MachineOperand &Op = MI->getOperand(i: VAddr0Idx + I);
294	Register Reg = Op.getReg();
295	LiveInterval *LI = &LIS->getInterval(Reg);
296	if (llvm::is_contained(Range&: Intervals, Element: LI)) {
297	// Same register used, unable to make sequential
298	Intervals.clear();
299	break;
300	}
301	Intervals.push_back(Elt: LI);
302	OrigRegs.push_back(Elt: VRM->getPhys(virtReg: Reg));
303	if (LI->empty()) {
304	// The address input is undef, so it doesn't contribute to the relevant
305	// range. Seed a reasonable index range if required.
306	if (I == `0`)
307	MinInd = MaxInd = LIS->getInstructionIndex(Instr: *MI);
308	continue;
309	}
310	MinInd = I != `0` ? std::min(a: MinInd, b: LI->beginIndex()) : LI->beginIndex();
311	MaxInd = I != `0` ? std::max(a: MaxInd, b: LI->endIndex()) : LI->endIndex();
312	}
313
314	if (Intervals.empty())
315	continue;
316
317	LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
318	<< "\tOriginal allocation:\t";
319	for (auto *LI
320	: Intervals) dbgs()
321	<< " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
322	dbgs() << `'\n'`);
323
324	bool Success = scavengeRegs(Intervals);
325	if (!Success) {
326	LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
327	if (VRM->hasPhys(virtReg: Intervals.back()->reg())) // Did not change allocation.
328	continue;
329	} else {
330	// Check we did not make it worse for other instructions.
331	auto I = std::lower_bound(first: Candidates.begin(), last: &C, val: MinInd,
332	comp: [this](const Candidate &C, SlotIndex I) {
333	return LIS->getInstructionIndex(Instr: *C.first) < I;
334	});
335	for (auto E = Candidates.end(); Success && I != E &&
336	LIS->getInstructionIndex(Instr: *I->first) < MaxInd; ++I) {
337	if (I->second && CheckNSA(MI: I->first, Fast: true*) < NSA_Status::CONTIGUOUS) {
338	Success = false;
339	LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
340	}
341	}
342	}
343
344	if (!Success) {
345	for (unsigned I = `0`; I < Info->VAddrOperands; ++I)
346	if (VRM->hasPhys(virtReg: Intervals [I]->reg()))
347	LRM->unassign(VirtReg: *Intervals [I]);
348
349	for (unsigned I = `0`; I < Info->VAddrOperands; ++I)
350	LRM->assign(VirtReg: *Intervals [I], PhysReg: OrigRegs [I]);
351
352	continue;
353	}
354
355	C.second = true;
356	++NumNSAConverted;
357	LLVM_DEBUG(
358	dbgs() << "\tNew allocation:\t\t ["
359	<< llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
360	<< " : "
361	<< llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
362	<< "]\n");
363	Changed = true;
364	}
365
366	return Changed;
367	}
368

source code of llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp