AMDGPUInsertDelayAlu.cpp source code [llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp]

1	//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "AMDGPU.h"
15	#include "GCNSubtarget.h"
16	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17	#include "SIInstrInfo.h"
18	#include "llvm/ADT/SetVector.h"
19
20	using namespace llvm;
21
22	#define DEBUG_TYPE "amdgpu-insert-delay-alu"
23
24	namespace {
25
26	class AMDGPUInsertDelayAlu : public MachineFunctionPass {
27	public:
28	static char ID;
29
30	const SIInstrInfo *SII;
31	const TargetRegisterInfo *TRI;
32
33	TargetSchedModel SchedModel;
34
35	AMDGPUInsertDelayAlu() : MachineFunctionPass (ID) {}
36
37	void getAnalysisUsage(AnalysisUsage &AU) const override {
38	AU.setPreservesCFG();
39	MachineFunctionPass::getAnalysisUsage(AU);
40	}
41
42	// Return true if MI waits for all outstanding VALU instructions to complete.
43	static bool instructionWaitsForVALU(const MachineInstr &MI) {
44	// These instruction types wait for VA_VDST==0 before issuing.
45	const uint64_t VA_VDST_0 = SIInstrFlags::DS \| SIInstrFlags::EXP \|
46	SIInstrFlags::FLAT \| SIInstrFlags::MIMG \|
47	SIInstrFlags::MTBUF \| SIInstrFlags::MUBUF;
48	if (MI.getDesc().TSFlags & VA_VDST_0)
49	return true;
50	if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 \|\|
51	MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
52	return true;
53	if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
54	AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: MI.getOperand(i: `0`).getImm()) == `0`)
55	return true;
56	return false;
57	}
58
59	// Types of delay that can be encoded in an s_delay_alu instruction.
60	enum DelayType { VALU, TRANS, SALU, OTHER };
61
62	// Get the delay type for an instruction with the specified TSFlags.
63	static DelayType getDelayType(uint64_t TSFlags) {
64	if (TSFlags & SIInstrFlags::TRANS)
65	return TRANS;
66	if (TSFlags & SIInstrFlags::VALU)
67	return VALU;
68	if (TSFlags & SIInstrFlags::SALU)
69	return SALU;
70	return OTHER;
71	}
72
73	// Information about the last instruction(s) that wrote to a particular
74	// regunit. In straight-line code there will only be one such instruction, but
75	// when control flow converges we merge the delay information from each path
76	// to represent the union of the worst-case delays of each type.
77	struct DelayInfo {
78	// One larger than the maximum number of (non-TRANS) VALU instructions we
79	// can encode in an s_delay_alu instruction.
80	static constexpr unsigned VALU_MAX = `5`;
81
82	// One larger than the maximum number of TRANS instructions we can encode in
83	// an s_delay_alu instruction.
84	static constexpr unsigned TRANS_MAX = `4`;
85
86	// One larger than the maximum number of SALU cycles we can encode in an
87	// s_delay_alu instruction.
88	static constexpr unsigned SALU_CYCLES_MAX = `4`;
89
90	// If it was written by a (non-TRANS) VALU, remember how many clock cycles
91	// are left until it completes, and how many other (non-TRANS) VALU we have
92	// seen since it was issued.
93	uint8_t VALUCycles = `0`;
94	uint8_t VALUNum = VALU_MAX;
95
96	// If it was written by a TRANS, remember how many clock cycles are left
97	// until it completes, and how many other TRANS we have seen since it was
98	// issued.
99	uint8_t TRANSCycles = `0`;
100	uint8_t TRANSNum = TRANS_MAX;
101	// Also remember how many other (non-TRANS) VALU we have seen since it was
102	// issued. When an instruction depends on both a prior TRANS and a prior
103	// non-TRANS VALU, this is used to decide whether to encode a wait for just
104	// one or both of them.
105	uint8_t TRANSNumVALU = VALU_MAX;
106
107	// If it was written by an SALU, remember how many clock cycles are left
108	// until it completes.
109	uint8_t SALUCycles = `0`;
110
111	DelayInfo() = default;
112
113	DelayInfo(DelayType Type, unsigned Cycles) {
114	switch (Type) {
115	default:
116	llvm_unreachable("unexpected type");
117	case VALU:
118	VALUCycles = Cycles;
119	VALUNum = `0`;
120	break;
121	case TRANS:
122	TRANSCycles = Cycles;
123	TRANSNum = `0`;
124	TRANSNumVALU = `0`;
125	break;
126	case SALU:
127	// Guard against pseudo-instructions like SI_CALL which are marked as
128	// SALU but with a very high latency.
129	SALUCycles = std::min(a: Cycles, b: SALU_CYCLES_MAX);
130	break;
131	}
132	}
133
134	bool operator==(const DelayInfo &RHS) const {
135	return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
136	TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
137	TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
138	}
139
140	bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
141
142	// Merge another DelayInfo into this one, to represent the union of the
143	// worst-case delays of each type.
144	void merge(const DelayInfo &RHS) {
145	VALUCycles = std::max(a: VALUCycles, b: RHS.VALUCycles);
146	VALUNum = std::min(a: VALUNum, b: RHS.VALUNum);
147	TRANSCycles = std::max(a: TRANSCycles, b: RHS.TRANSCycles);
148	TRANSNum = std::min(a: TRANSNum, b: RHS.TRANSNum);
149	TRANSNumVALU = std::min(a: TRANSNumVALU, b: RHS.TRANSNumVALU);
150	SALUCycles = std::max(a: SALUCycles, b: RHS.SALUCycles);
151	}
152
153	// Update this DelayInfo after issuing an instruction. IsVALU should be 1
154	// when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
155	// a TRANS, else 0. Cycles is the number of cycles it takes to issue the
156	// instruction. Return true if there is no longer any useful delay info.
157	bool advance(DelayType Type, unsigned Cycles) {
158	bool Erase = true;
159
160	VALUNum += (Type == VALU);
161	if (VALUNum >= VALU_MAX \|\| VALUCycles <= Cycles) {
162	// Forget about the VALU instruction. It was too far back or has
163	// definitely completed by now.
164	VALUNum = VALU_MAX;
165	VALUCycles = `0`;
166	} else {
167	VALUCycles -= Cycles;
168	Erase = false;
169	}
170
171	TRANSNum += (Type == TRANS);
172	TRANSNumVALU += (Type == VALU);
173	if (TRANSNum >= TRANS_MAX \|\| TRANSCycles <= Cycles) {
174	// Forget about any TRANS instruction. It was too far back or has
175	// definitely completed by now.
176	TRANSNum = TRANS_MAX;
177	TRANSNumVALU = VALU_MAX;
178	TRANSCycles = `0`;
179	} else {
180	TRANSCycles -= Cycles;
181	Erase = false;
182	}
183
184	if (SALUCycles <= Cycles) {
185	// Forget about any SALU instruction. It has definitely completed by
186	// now.
187	SALUCycles = `0`;
188	} else {
189	SALUCycles -= Cycles;
190	Erase = false;
191	}
192
193	return Erase;
194	}
195
196	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
197	void dump() const {
198	if (VALUCycles)
199	dbgs() << " VALUCycles=" << (int)VALUCycles;
200	if (VALUNum < VALU_MAX)
201	dbgs() << " VALUNum=" << (int)VALUNum;
202	if (TRANSCycles)
203	dbgs() << " TRANSCycles=" << (int)TRANSCycles;
204	if (TRANSNum < TRANS_MAX)
205	dbgs() << " TRANSNum=" << (int)TRANSNum;
206	if (TRANSNumVALU < VALU_MAX)
207	dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
208	if (SALUCycles)
209	dbgs() << " SALUCycles=" << (int)SALUCycles;
210	}
211	#endif
212	};
213
214	// A map from regunits to the delay info for that regunit.
215	struct DelayState : DenseMap<unsigned, DelayInfo> {
216	// Merge another DelayState into this one by merging the delay info for each
217	// regunit.
218	void merge(const DelayState &RHS) {
219	for (const auto &KV : RHS) {
220	iterator It;
221	bool Inserted;
222	std::tie(args&: It, args&: Inserted) = insert(KV);
223	if (!Inserted)
224	It ->second.merge(RHS: KV.second);
225	}
226	}
227
228	// Advance the delay info for each regunit, erasing any that are no longer
229	// useful.
230	void advance(DelayType Type, unsigned Cycles) {
231	iterator Next;
232	for (auto I = begin(), E = end(); I != E; I = Next) {
233	Next = std::next(x: I);
234	if (I ->second.advance(Type, Cycles))
235	erase(I);
236	}
237	}
238
239	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
240	void dump(const TargetRegisterInfo TRI) const* {
241	if (empty()) {
242	dbgs() << " empty\n";
243	return;
244	}
245
246	// Dump DelayInfo for each RegUnit in numerical order.
247	SmallVector<const_iterator, `8`> Order;
248	Order.reserve(N: size());
249	for (const_iterator I = begin(), E = end(); I != E; ++I)
250	Order.push_back(Elt: I);
251	llvm::sort(C&: Order, Comp: [](const const_iterator &A, const const_iterator &B) {
252	return A ->first < B ->first;
253	});
254	for (const_iterator I : Order) {
255	dbgs() << " " << printRegUnit(Unit: I ->first, TRI);
256	I ->second.dump();
257	dbgs() << "\n";
258	}
259	}
260	#endif
261	};
262
263	// The saved delay state at the end of each basic block.
264	DenseMap<MachineBasicBlock *, DelayState> BlockState;
265
266	// Emit an s_delay_alu instruction if necessary before MI.
267	MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
268	MachineInstr *LastDelayAlu) {
269	unsigned Imm = `0`;
270
271	// Wait for a TRANS instruction.
272	if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
273	Imm \|= `4` + Delay.TRANSNum;
274
275	// Wait for a VALU instruction (if it's more recent than any TRANS
276	// instruction that we're also waiting for).
277	if (Delay.VALUNum < DelayInfo::VALU_MAX &&
278	Delay.VALUNum <= Delay.TRANSNumVALU) {
279	if (Imm & `0xf`)
280	Imm \|= Delay.VALUNum << `7`;
281	else
282	Imm \|= Delay.VALUNum;
283	}
284
285	// Wait for an SALU instruction.
286	if (Delay.SALUCycles) {
287	assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
288	if (Imm & `0x780`) {
289	// We have already encoded a VALU and a TRANS delay. There's no room in
290	// the encoding for an SALU delay as well, so just drop it.
291	} else if (Imm & `0xf`) {
292	Imm \|= (Delay.SALUCycles + `8`) << `7`;
293	} else {
294	Imm \|= Delay.SALUCycles + `8`;
295	}
296	}
297
298	// Don't emit the s_delay_alu instruction if there's nothing to wait for.
299	if (!Imm)
300	return LastDelayAlu;
301
302	// If we only need to wait for one instruction, try encoding it in the last
303	// s_delay_alu that we emitted.
304	if (!(Imm & `0x780`) && LastDelayAlu) {
305	unsigned Skip = `0`;
306	for (auto I = MachineBasicBlock::instr_iterator (LastDelayAlu),
307	E = MachineBasicBlock::instr_iterator (MI);
308	++I != E;) {
309	if (!I ->isBundle() && !I ->isMetaInstruction())
310	++Skip;
311	}
312	if (Skip < `6`) {
313	MachineOperand &Op = LastDelayAlu->getOperand(i: `0`);
314	unsigned LastImm = Op.getImm();
315	assert((LastImm & ~`0xf`) == `0` &&
316	"Remembered an s_delay_alu with no room for another delay!");
317	LastImm \|= Imm << `7` \| Skip << `4`;
318	Op.setImm(LastImm);
319	return nullptr;
320	}
321	}
322
323	auto &MBB = *MI.getParent();
324	MachineInstr *DelayAlu =
325	BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
326	// Remember the s_delay_alu for next time if there is still room in it to
327	// encode another delay.
328	return (Imm & `0x780`) ? nullptr : DelayAlu;
329	}
330
331	bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
332	DelayState State;
333	for (auto *Pred : MBB.predecessors())
334	State.merge(RHS: BlockState [Pred]);
335
336	LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
337	<< "\n";
338	State.dump(TRI););
339
340	bool Changed = false;
341	MachineInstr LastDelayAlu = nullptr*;
342
343	// Iterate over the contents of bundles, but don't emit any instructions
344	// inside a bundle.
345	for (auto &MI : MBB.instrs()) {
346	if (MI.isBundle() \|\| MI.isMetaInstruction())
347	continue;
348
349	// Ignore some more instructions that do not generate any code.
350	switch (MI.getOpcode()) {
351	case AMDGPU::SI_RETURN_TO_EPILOG:
352	continue;
353	}
354
355	DelayType Type = getDelayType(TSFlags: MI.getDesc().TSFlags);
356
357	if (instructionWaitsForVALU(MI)) {
358	// Forget about all outstanding VALU delays.
359	// TODO: This is overkill since it also forgets about SALU delays.
360	State = DelayState ();
361	} else if (Type != OTHER) {
362	DelayInfo Delay;
363	// TODO: Scan implicit uses too?
364	for (const auto &Op : MI.explicit_uses()) {
365	if (Op.isReg()) {
366	// One of the operands of the writelane is also the output operand.
367	// This creates the insertion of redundant delays. Hence, we have to
368	// ignore this operand.
369	if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
370	continue;
371	for (MCRegUnit Unit : TRI->regunits(Reg: Op.getReg())) {
372	auto It = State.find(Val: Unit);
373	if (It != State.end()) {
374	Delay.merge(RHS: It ->second);
375	State.erase(Val: Unit);
376	}
377	}
378	}
379	}
380	if (Emit && !MI.isBundledWithPred()) {
381	// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
382	// just ignore them?
383	LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
384	}
385	}
386
387	if (Type != OTHER) {
388	// TODO: Scan implicit defs too?
389	for (const auto &Op : MI.defs()) {
390	unsigned Latency = SchedModel.computeOperandLatency(
391	DefMI: &MI, DefOperIdx: Op.getOperandNo(), UseMI: nullptr, UseOperIdx: `0`);
392	for (MCRegUnit Unit : TRI->regunits(Reg: Op.getReg()))
393	State [Unit] = DelayInfo (Type, Latency);
394	}
395	}
396
397	// Advance by the number of cycles it takes to issue this instruction.
398	// TODO: Use a more advanced model that accounts for instructions that
399	// take multiple cycles to issue on a particular pipeline.
400	unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
401	// TODO: In wave64 mode, double the number of cycles for VALU and VMEM
402	// instructions on the assumption that they will usually have to be issued
403	// twice?
404	State.advance(Type, Cycles);
405
406	LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
407	}
408
409	if (Emit) {
410	assert(State == BlockState[&MBB] &&
411	"Basic block state should not have changed on final pass!");
412	} else if (State != BlockState [&MBB]) {
413	BlockState [&MBB] = std::move(State);
414	Changed = true;
415	}
416	return Changed;
417	}
418
419	bool runOnMachineFunction(MachineFunction &MF) override {
420	if (skipFunction(F: MF.getFunction()))
421	return false;
422
423	LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
424	<< "\n");
425
426	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
427	if (!ST.hasDelayAlu())
428	return false;
429
430	SII = ST.getInstrInfo();
431	TRI = ST.getRegisterInfo();
432
433	SchedModel.init(&ST);
434
435	// Calculate the delay state for each basic block, iterating until we reach
436	// a fixed point.
437	SetVector<MachineBasicBlock *> WorkList;
438	for (auto &MBB : reverse(C&: MF))
439	WorkList.insert(X: &MBB);
440	while (!WorkList.empty()) {
441	auto &MBB = *WorkList.pop_back_val();
442	bool Changed = runOnMachineBasicBlock(MBB, Emit: false);
443	if (Changed)
444	WorkList.insert(Start: MBB.succ_begin(), End: MBB.succ_end());
445	}
446
447	LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
448
449	// Make one last pass over all basic blocks to emit s_delay_alu
450	// instructions.
451	bool Changed = false;
452	for (auto &MBB : MF)
453	Changed \|= runOnMachineBasicBlock(MBB, Emit: true);
454	return Changed;
455	}
456	};
457
458	} // namespace
459
460	char AMDGPUInsertDelayAlu::ID = `0`;
461
462	char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
463
464	INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
465	false, false)
466

source code of llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp