1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
15#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16#include "SIMachineFunctionInfo.h"
17#include "llvm/CodeGen/MachineFunction.h"
18#include "llvm/CodeGen/ScheduleDAG.h"
19#include "llvm/TargetParser/TargetParser.h"
20
21using namespace llvm;
22
23namespace {
24
25struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
26 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
27
28 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
29 if (Arg.getAsInteger(Radix: 0, Result&: Value))
30 return O.error(Message: "'" + Arg + "' value invalid for uint argument!");
31
32 if (Value > 100)
33 return O.error(Message: "'" + Arg + "' value must be in the range [0, 100]!");
34
35 return false;
36 }
37};
38
39} // end anonymous namespace
40
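// The value is a percentage in [0, 100]; e.g. passing
// -amdgpu-mfma-padding-ratio=100 requests that the full latency between
// neighboring MFMAs be filled with s_nops.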
41static cl::opt<unsigned, false, MFMAPaddingRatioParser>
42 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(Val: 0), cl::Hidden,
43 cl::desc("Fill a percentage of the latency between "
44 "neighboring MFMA with s_nops."));
45
46//===----------------------------------------------------------------------===//
47// Hazard Recognizer Implementation
48//===----------------------------------------------------------------------===//
49
50static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
51 const GCNSubtarget &ST);
52
53GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
54 IsHazardRecognizerMode(false),
55 CurrCycleInstr(nullptr),
56 MF(MF),
57 ST(MF.getSubtarget<GCNSubtarget>()),
58 TII(*ST.getInstrInfo()),
59 TRI(TII.getRegisterInfo()),
60 ClauseUses(TRI.getNumRegUnits()),
61 ClauseDefs(TRI.getNumRegUnits()) {
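  // Functions that use AGPRs may contain MFMA-related hazards, which can
  // require far more wait states than other hazards, so give them a deeper
  // lookahead window.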
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
63 TSchedModel.init(&ST);
64 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
65}
66
67void GCNHazardRecognizer::Reset() {
68 EmittedInstrs.clear();
69}
70
71void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
72 EmitInstruction(MI: SU->getInstr());
73}
74
75void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
76 CurrCycleInstr = MI;
77}
78
79static bool isDivFMas(unsigned Opcode) {
80 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
81}
82
83static bool isSGetReg(unsigned Opcode) {
84 return Opcode == AMDGPU::S_GETREG_B32;
85}
86
87static bool isSSetReg(unsigned Opcode) {
88 switch (Opcode) {
89 case AMDGPU::S_SETREG_B32:
90 case AMDGPU::S_SETREG_B32_mode:
91 case AMDGPU::S_SETREG_IMM32_B32:
92 case AMDGPU::S_SETREG_IMM32_B32_mode:
93 return true;
94 }
95 return false;
96}
97
98static bool isRWLane(unsigned Opcode) {
99 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
100}
101
102static bool isRFE(unsigned Opcode) {
103 return Opcode == AMDGPU::S_RFE_B64;
104}
105
106static bool isSMovRel(unsigned Opcode) {
107 switch (Opcode) {
108 case AMDGPU::S_MOVRELS_B32:
109 case AMDGPU::S_MOVRELS_B64:
110 case AMDGPU::S_MOVRELD_B32:
111 case AMDGPU::S_MOVRELD_B64:
112 return true;
113 default:
114 return false;
115 }
116}
117
118static bool isDGEMM(unsigned Opcode) {
119 return AMDGPU::getMAIIsDGEMM(Opc: Opcode);
120}
121
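// Returns true if \p MI is an MAI instruction classified as XDL, i.e. one
// that is neither a DGEMM nor a plain accvgpr read/write.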
122static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
123 unsigned Opcode = MI.getOpcode();
124
125 if (!SIInstrInfo::isMAI(MI) ||
126 isDGEMM(Opcode) ||
127 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
128 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
129 return false;
130
131 if (!ST.hasGFX940Insts())
132 return true;
133
134 return AMDGPU::getMAIIsGFX940XDL(Opc: Opcode);
135}
136
137static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
138 const MachineInstr &MI) {
139 if (TII.isAlwaysGDS(Opcode: MI.getOpcode()))
140 return true;
141
142 switch (MI.getOpcode()) {
143 case AMDGPU::S_SENDMSG:
144 case AMDGPU::S_SENDMSGHALT:
145 case AMDGPU::S_TTRACEDATA:
146 return true;
147 // These DS opcodes don't support GDS.
148 case AMDGPU::DS_NOP:
149 case AMDGPU::DS_PERMUTE_B32:
150 case AMDGPU::DS_BPERMUTE_B32:
151 return false;
152 default:
153 if (TII.isDS(Opcode: MI.getOpcode())) {
154 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
155 AMDGPU::OpName::gds);
156 if (MI.getOperand(i: GDS).getImm())
157 return true;
158 }
159 return false;
160 }
161}
162
163static bool isPermlane(const MachineInstr &MI) {
164 unsigned Opcode = MI.getOpcode();
165 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE64_B32 ||
167 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
170}
171
172static bool isLdsDma(const MachineInstr &MI) {
173 return SIInstrInfo::isVALU(MI) &&
174 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
175}
176
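// Decode the hardware register id from the simm16 operand of an
// s_getreg/s_setreg style instruction.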
177static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
178 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
179 AMDGPU::OpName::simm16);
180 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
181}
182
183ScheduleHazardRecognizer::HazardType
184GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
185 MachineInstr *MI = SU->getInstr();
  // When we are not in "HazardRecognizerMode" we are being run from the
  // scheduler: report stalls for hazards but do not request noop insertion.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
189
190 if (MI->isBundle())
191 return NoHazard;
192
193 if (SIInstrInfo::isSMRD(MI: *MI) && checkSMRDHazards(SMRD: MI) > 0)
194 return HazardType;
195
196 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
197 return HazardType;
198
199 if (checkFPAtomicToDenormModeHazard(MI) > 0)
200 return HazardType;
201
202 if (ST.hasNoDataDepHazard())
203 return NoHazard;
204
205 // FIXME: Should flat be considered vmem?
206 if ((SIInstrInfo::isVMEM(MI: *MI) ||
207 SIInstrInfo::isFLAT(MI: *MI))
208 && checkVMEMHazards(VMEM: MI) > 0)
209 return HazardType;
210
211 if (SIInstrInfo::isVALU(MI: *MI) && checkVALUHazards(VALU: MI) > 0)
212 return HazardType;
213
214 if (SIInstrInfo::isDPP(MI: *MI) && checkDPPHazards(DPP: MI) > 0)
215 return HazardType;
216
217 if (isDivFMas(Opcode: MI->getOpcode()) && checkDivFMasHazards(DivFMas: MI) > 0)
218 return HazardType;
219
220 if (isRWLane(Opcode: MI->getOpcode()) && checkRWLaneHazards(RWLane: MI) > 0)
221 return HazardType;
222
223 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
224 SIInstrInfo::isFLAT(MI: *MI) || SIInstrInfo::isDS(MI: *MI) ||
225 SIInstrInfo::isEXP(MI: *MI)) && checkMAIVALUHazards(MI) > 0)
226 return HazardType;
227
228 if (isSGetReg(Opcode: MI->getOpcode()) && checkGetRegHazards(GetRegInstr: MI) > 0)
229 return HazardType;
230
231 if (isSSetReg(Opcode: MI->getOpcode()) && checkSetRegHazards(SetRegInstr: MI) > 0)
232 return HazardType;
233
234 if (isRFE(Opcode: MI->getOpcode()) && checkRFEHazards(RFE: MI) > 0)
235 return HazardType;
236
237 if (((ST.hasReadM0MovRelInterpHazard() &&
238 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
239 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
240 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
241 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
242 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
243 (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
245 checkReadM0Hazards(SMovRel: MI) > 0)
246 return HazardType;
247
248 if (SIInstrInfo::isMAI(MI: *MI) && checkMAIHazards(MI) > 0)
249 return HazardType;
250
251 if ((SIInstrInfo::isVMEM(MI: *MI) ||
252 SIInstrInfo::isFLAT(MI: *MI) ||
253 SIInstrInfo::isDS(MI: *MI)) && checkMAILdStHazards(MI) > 0)
254 return HazardType;
255
256 if (MI->isInlineAsm() && checkInlineAsmHazards(IA: MI) > 0)
257 return HazardType;
258
259 return NoHazard;
260}
261
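// Emit s_nops covering \p Quantity wait states before \p MI. Each s_nop N
// covers N + 1 wait states, so e.g. Quantity == 10 becomes "s_nop 7"
// followed by "s_nop 1".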
262static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
263 unsigned Quantity) {
264 while (Quantity > 0) {
265 unsigned Arg = std::min(a: Quantity, b: 8u);
266 Quantity -= Arg;
267 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
268 .addImm(Arg - 1);
269 }
270}
271
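// Returns the MFMA pipeline occupancy of \p MI as given by the scheduling
// model (the ReleaseAtCycle of its first write resource).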
272unsigned
273GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
274 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(MI: &MI);
275 assert(TSchedModel.getWriteProcResBegin(SC) !=
276 TSchedModel.getWriteProcResEnd(SC));
277 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
278}
279
280void GCNHazardRecognizer::processBundle() {
281 MachineBasicBlock::instr_iterator MI = std::next(x: CurrCycleInstr->getIterator());
282 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
283 // Check bundled MachineInstr's for hazards.
284 for (; MI != E && MI->isInsideBundle(); ++MI) {
285 CurrCycleInstr = &*MI;
286 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
287
288 if (IsHazardRecognizerMode) {
289 fixHazards(MI: CurrCycleInstr);
290
291 insertNoopsInBundle(MI: CurrCycleInstr, TII, Quantity: WaitStates);
292 }
293
294 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
295 // include the bundled MI directly after, only add a maximum of
296 // (MaxLookAhead - 1) noops to EmittedInstrs.
297 for (unsigned i = 0, e = std::min(a: WaitStates, b: MaxLookAhead - 1); i < e; ++i)
298 EmittedInstrs.push_front(x: nullptr);
299
300 EmittedInstrs.push_front(x: CurrCycleInstr);
301 EmittedInstrs.resize(new_size: MaxLookAhead);
302 }
303 CurrCycleInstr = nullptr;
304}
305
306void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
307 assert(IsHazardRecognizerMode);
308
309 unsigned NumPreNoops = PreEmitNoops(MI);
310 EmitNoops(Quantity: NumPreNoops);
311 if (MI->isInsideBundle())
312 insertNoopsInBundle(MI, TII, Quantity: NumPreNoops);
313 else
314 TII.insertNoops(MBB&: *MI->getParent(), MI: MachineBasicBlock::iterator(MI),
315 Quantity: NumPreNoops);
316 EmitInstruction(MI);
317 AdvanceCycle();
318}
319
320unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
321 IsHazardRecognizerMode = true;
322 CurrCycleInstr = MI;
323 unsigned W = PreEmitNoopsCommon(MI);
324 fixHazards(MI);
325 CurrCycleInstr = nullptr;
326 return W;
327}
328
329unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
330 if (MI->isBundle())
331 return 0;
332
333 int WaitStates = 0;
334
335 if (SIInstrInfo::isSMRD(MI: *MI))
336 return std::max(a: WaitStates, b: checkSMRDHazards(SMRD: MI));
337
338 if (ST.hasNSAtoVMEMBug())
339 WaitStates = std::max(a: WaitStates, b: checkNSAtoVMEMHazard(MI));
340
341 WaitStates = std::max(a: WaitStates, b: checkFPAtomicToDenormModeHazard(MI));
342
343 if (ST.hasNoDataDepHazard())
344 return WaitStates;
345
346 if (SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isFLAT(MI: *MI))
347 WaitStates = std::max(a: WaitStates, b: checkVMEMHazards(VMEM: MI));
348
349 if (SIInstrInfo::isVALU(MI: *MI))
350 WaitStates = std::max(a: WaitStates, b: checkVALUHazards(VALU: MI));
351
352 if (SIInstrInfo::isDPP(MI: *MI))
353 WaitStates = std::max(a: WaitStates, b: checkDPPHazards(DPP: MI));
354
355 if (isDivFMas(Opcode: MI->getOpcode()))
356 WaitStates = std::max(a: WaitStates, b: checkDivFMasHazards(DivFMas: MI));
357
358 if (isRWLane(Opcode: MI->getOpcode()))
359 WaitStates = std::max(a: WaitStates, b: checkRWLaneHazards(RWLane: MI));
360
361 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
362 SIInstrInfo::isFLAT(MI: *MI) || SIInstrInfo::isDS(MI: *MI) ||
363 SIInstrInfo::isEXP(MI: *MI)) && checkMAIVALUHazards(MI) > 0)
364 WaitStates = std::max(a: WaitStates, b: checkMAIVALUHazards(MI));
365
366 if (MI->isInlineAsm())
367 return std::max(a: WaitStates, b: checkInlineAsmHazards(IA: MI));
368
369 if (isSGetReg(Opcode: MI->getOpcode()))
370 return std::max(a: WaitStates, b: checkGetRegHazards(GetRegInstr: MI));
371
372 if (isSSetReg(Opcode: MI->getOpcode()))
373 return std::max(a: WaitStates, b: checkSetRegHazards(SetRegInstr: MI));
374
375 if (isRFE(Opcode: MI->getOpcode()))
376 return std::max(a: WaitStates, b: checkRFEHazards(RFE: MI));
377
378 if ((ST.hasReadM0MovRelInterpHazard() &&
379 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
380 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
381 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
382 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
383 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
384 (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
386 return std::max(a: WaitStates, b: checkReadM0Hazards(SMovRel: MI));
387
388 if (SIInstrInfo::isMAI(MI: *MI))
389 return std::max(a: WaitStates, b: checkMAIHazards(MI));
390
391 if (SIInstrInfo::isVMEM(MI: *MI) ||
392 SIInstrInfo::isFLAT(MI: *MI) ||
393 SIInstrInfo::isDS(MI: *MI))
394 return std::max(a: WaitStates, b: checkMAILdStHazards(MI));
395
396 return WaitStates;
397}
398
399void GCNHazardRecognizer::EmitNoop() {
400 EmittedInstrs.push_front(x: nullptr);
401}
402
403void GCNHazardRecognizer::AdvanceCycle() {
404 // When the scheduler detects a stall, it will call AdvanceCycle() without
405 // emitting any instructions.
406 if (!CurrCycleInstr) {
407 EmittedInstrs.push_front(x: nullptr);
408 return;
409 }
410
411 if (CurrCycleInstr->isBundle()) {
412 processBundle();
413 return;
414 }
415
416 unsigned NumWaitStates = TII.getNumWaitStates(MI: *CurrCycleInstr);
417 if (!NumWaitStates) {
418 CurrCycleInstr = nullptr;
419 return;
420 }
421
422 // Keep track of emitted instructions
423 EmittedInstrs.push_front(x: CurrCycleInstr);
424
425 // Add a nullptr for each additional wait state after the first. Make sure
426 // not to add more than getMaxLookAhead() items to the list, since we
427 // truncate the list to that size right after this loop.
428 for (unsigned i = 1, e = std::min(a: NumWaitStates, b: getMaxLookAhead());
429 i < e; ++i) {
430 EmittedInstrs.push_front(x: nullptr);
431 }
432
  // getMaxLookAhead() is the largest number of wait states we will ever need
434 // to insert, so there is no point in keeping track of more than that many
435 // wait states.
436 EmittedInstrs.resize(new_size: getMaxLookAhead());
437
438 CurrCycleInstr = nullptr;
439}
440
441void GCNHazardRecognizer::RecedeCycle() {
442 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
443}
444
445//===----------------------------------------------------------------------===//
446// Helper Functions
447//===----------------------------------------------------------------------===//
448
449typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
450
451typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
452typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
453
454// Search for a hazard in a block and its predecessors.
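// IsHazard classifies each visited instruction as a hazard, as expiring the
// search, or as uninteresting; UpdateState is applied to every non-bundle,
// non-meta, non-inline-asm instruction that is walked over.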
455template <typename StateT>
456static bool
457hasHazard(StateT State,
458 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
459 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
460 const MachineBasicBlock *MBB,
461 MachineBasicBlock::const_reverse_instr_iterator I,
462 DenseSet<const MachineBasicBlock *> &Visited) {
463 for (auto E = MBB->instr_rend(); I != E; ++I) {
464 // No need to look at parent BUNDLE instructions.
465 if (I->isBundle())
466 continue;
467
468 switch (IsHazard(State, *I)) {
469 case HazardFound:
470 return true;
471 case HazardExpired:
472 return false;
473 default:
474 // Continue search
475 break;
476 }
477
478 if (I->isInlineAsm() || I->isMetaInstruction())
479 continue;
480
481 UpdateState(State, *I);
482 }
483
484 for (MachineBasicBlock *Pred : MBB->predecessors()) {
485 if (!Visited.insert(V: Pred).second)
486 continue;
487
488 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
489 Visited))
490 return true;
491 }
492
493 return false;
494}
495
// Returns the minimum number of wait states since \p I, walking all
// predecessors. Scans only until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
499static int getWaitStatesSince(
500 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
501 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
502 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
503 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
504 for (auto E = MBB->instr_rend(); I != E; ++I) {
505 // Don't add WaitStates for parent BUNDLE instructions.
506 if (I->isBundle())
507 continue;
508
509 if (IsHazard(*I))
510 return WaitStates;
511
512 if (I->isInlineAsm())
513 continue;
514
515 WaitStates += GetNumWaitStates(*I);
516
517 if (IsExpired(*I, WaitStates))
518 return std::numeric_limits<int>::max();
519 }
520
521 int MinWaitStates = std::numeric_limits<int>::max();
522 for (MachineBasicBlock *Pred : MBB->predecessors()) {
523 if (!Visited.insert(V: Pred).second)
524 continue;
525
526 int W = getWaitStatesSince(IsHazard, MBB: Pred, I: Pred->instr_rbegin(), WaitStates,
527 IsExpired, Visited, GetNumWaitStates);
528
529 MinWaitStates = std::min(a: MinWaitStates, b: W);
530 }
531
532 return MinWaitStates;
533}
534
535static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
536 const MachineInstr *MI, IsExpiredFn IsExpired) {
537 DenseSet<const MachineBasicBlock *> Visited;
538 return getWaitStatesSince(IsHazard, MBB: MI->getParent(),
539 I: std::next(x: MI->getReverseIterator()),
540 WaitStates: 0, IsExpired, Visited);
541}
542
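// In hazard recognizer mode walk the MIR backwards from CurrCycleInstr;
// otherwise consult the EmittedInstrs history maintained by AdvanceCycle().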
543int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
544 if (IsHazardRecognizerMode) {
545 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
546 return WaitStates >= Limit;
547 };
548 return ::getWaitStatesSince(IsHazard, MI: CurrCycleInstr, IsExpired: IsExpiredFn);
549 }
550
551 int WaitStates = 0;
552 for (MachineInstr *MI : EmittedInstrs) {
553 if (MI) {
554 if (IsHazard(*MI))
555 return WaitStates;
556
557 if (MI->isInlineAsm())
558 continue;
559 }
560 ++WaitStates;
561
562 if (WaitStates >= Limit)
563 break;
564 }
565 return std::numeric_limits<int>::max();
566}
567
568int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
569 IsHazardFn IsHazardDef,
570 int Limit) {
571 const SIRegisterInfo *TRI = ST.getRegisterInfo();
572
573 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
574 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
575 };
576
577 return getWaitStatesSince(IsHazardFn, Limit);
578}
579
580int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
581 int Limit) {
582 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
583 return isSSetReg(Opcode: MI.getOpcode()) && IsHazard(MI);
584 };
585
586 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
587}
588
589//===----------------------------------------------------------------------===//
590// No-op Hazard Detection
591//===----------------------------------------------------------------------===//
592
593static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
594 MCRegister Reg) {
595 for (MCRegUnit Unit : TRI.regunits(Reg))
596 BV.set(Unit);
597}
598
599static void addRegsToSet(const SIRegisterInfo &TRI,
600 iterator_range<MachineInstr::const_mop_iterator> Ops,
601 BitVector &DefSet, BitVector &UseSet) {
602 for (const MachineOperand &Op : Ops) {
603 if (Op.isReg())
604 addRegUnits(TRI, BV&: Op.isDef() ? DefSet : UseSet, Reg: Op.getReg().asMCReg());
605 }
606}
607
608void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
609 addRegsToSet(TRI, Ops: MI.operands(), DefSet&: ClauseDefs, UseSet&: ClauseUses);
610}
611
612static bool breaksSMEMSoftClause(MachineInstr *MI) {
613 return !SIInstrInfo::isSMRD(MI: *MI);
614}
615
616static bool breaksVMEMSoftClause(MachineInstr *MI) {
617 return !SIInstrInfo::isVMEM(MI: *MI) && !SIInstrInfo::isFLAT(MI: *MI);
618}
619
620int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if XNACK is
  // enabled.
623 if (!ST.isXNACKEnabled())
624 return 0;
625
626 bool IsSMRD = TII.isSMRD(MI: *MEM);
627
628 resetClause();
629
630 // A soft-clause is any group of consecutive SMEM instructions. The
631 // instructions in this group may return out of order and/or may be
632 // replayed (i.e. the same instruction issued more than once).
633 //
634 // In order to handle these situations correctly we need to make sure that
635 // when a clause has more than one instruction, no instruction in the clause
636 // writes to a register that is read by another instruction in the clause
637 // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.
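  //
  // For example, if one load in a clause writes s[0:1] and a later load in
  // the same clause uses s[0:1] as part of its address, a replay of the first
  // load could clobber the address of the second, so the clause must be
  // broken (illustrative example).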
639
640 for (MachineInstr *MI : EmittedInstrs) {
641 // When we hit a non-SMEM instruction then we have passed the start of the
642 // clause and we can stop.
643 if (!MI)
644 break;
645
646 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
647 break;
648
649 addClauseInst(MI: *MI);
650 }
651
652 if (ClauseDefs.none())
653 return 0;
654
655 // We need to make sure not to put loads and stores in the same clause if they
656 // use the same address. For now, just start a new clause whenever we see a
657 // store.
658 if (MEM->mayStore())
659 return 1;
660
661 addClauseInst(MI: *MEM);
662
663 // If the set of defs and uses intersect then we cannot add this instruction
664 // to the clause, so we have a hazard.
665 return ClauseDefs.anyCommon(RHS: ClauseUses) ? 1 : 0;
666}
667
668int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
669 int WaitStatesNeeded = 0;
670
671 WaitStatesNeeded = checkSoftClauseHazards(MEM: SMRD);
672
673 // This SMRD hazard only affects SI.
674 if (!ST.hasSMRDReadVALUDefHazard())
675 return WaitStatesNeeded;
676
677 // A read of an SGPR by SMRD instruction requires 4 wait states when the
678 // SGPR was written by a VALU instruction.
679 int SmrdSgprWaitStates = 4;
680 auto IsHazardDefFn = [this](const MachineInstr &MI) {
681 return TII.isVALU(MI);
682 };
683 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
684 return TII.isSALU(MI);
685 };
686
687 bool IsBufferSMRD = TII.isBufferSMRD(MI: *SMRD);
688
689 for (const MachineOperand &Use : SMRD->uses()) {
690 if (!Use.isReg())
691 continue;
692 int WaitStatesNeededForUse =
693 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
694 Limit: SmrdSgprWaitStates);
695 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
696
    // This fixes what appears to be undocumented hardware behavior in SI where
    // an s_mov writing a descriptor and an s_buffer_load_dword reading that
    // descriptor need some number of nops in between. We don't know how many
    // are required, so use 4. This probably went unnoticed because it only
    // happens when a 64-bit pointer is expanded into a full descriptor and
    // s_buffer_load_dword is used instead of s_load_dword, which was likely
    // never encountered in closed-source code.
704 if (IsBufferSMRD) {
705 int WaitStatesNeededForUse =
706 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(),
707 IsHazardDef: IsBufferHazardDefFn,
708 Limit: SmrdSgprWaitStates);
709 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
710 }
711 }
712
713 return WaitStatesNeeded;
714}
715
716int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
717 if (!ST.hasVMEMReadSGPRVALUDefHazard())
718 return 0;
719
720 int WaitStatesNeeded = checkSoftClauseHazards(MEM: VMEM);
721
722 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
723 // SGPR was written by a VALU Instruction.
724 const int VmemSgprWaitStates = 5;
725 auto IsHazardDefFn = [this](const MachineInstr &MI) {
726 return TII.isVALU(MI);
727 };
728 for (const MachineOperand &Use : VMEM->uses()) {
729 if (!Use.isReg() || TRI.isVectorRegister(MRI: MF.getRegInfo(), Reg: Use.getReg()))
730 continue;
731
732 int WaitStatesNeededForUse =
733 VmemSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
734 Limit: VmemSgprWaitStates);
735 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
736 }
737 return WaitStatesNeeded;
738}
739
740int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
741 const SIRegisterInfo *TRI = ST.getRegisterInfo();
742 const SIInstrInfo *TII = ST.getInstrInfo();
743
744 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
745 int DppVgprWaitStates = 2;
746 int DppExecWaitStates = 5;
747 int WaitStatesNeeded = 0;
748 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
749 return TII->isVALU(MI);
750 };
751
752 for (const MachineOperand &Use : DPP->uses()) {
753 if (!Use.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
754 continue;
755 int WaitStatesNeededForUse =
756 DppVgprWaitStates - getWaitStatesSinceDef(
757 Reg: Use.getReg(),
758 IsHazardDef: [](const MachineInstr &) { return true; },
759 Limit: DppVgprWaitStates);
760 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
761 }
762
763 WaitStatesNeeded = std::max(
764 WaitStatesNeeded,
765 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
766 DppExecWaitStates));
767
768 return WaitStatesNeeded;
769}
770
771int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
772 const SIInstrInfo *TII = ST.getInstrInfo();
773
774 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
775 // instruction.
776 const int DivFMasWaitStates = 4;
777 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
778 return TII->isVALU(MI);
779 };
780 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
781 DivFMasWaitStates);
782
783 return DivFMasWaitStates - WaitStatesNeeded;
784}
785
786int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
787 const SIInstrInfo *TII = ST.getInstrInfo();
788 unsigned GetRegHWReg = getHWReg(TII, RegInstr: *GetRegInstr);
789
790 const int GetRegWaitStates = 2;
791 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
792 return GetRegHWReg == getHWReg(TII, RegInstr: MI);
793 };
794 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: GetRegWaitStates);
795
796 return GetRegWaitStates - WaitStatesNeeded;
797}
798
799int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
800 const SIInstrInfo *TII = ST.getInstrInfo();
801 unsigned HWReg = getHWReg(TII, RegInstr: *SetRegInstr);
802
803 const int SetRegWaitStates = ST.getSetRegWaitStates();
804 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
805 return HWReg == getHWReg(TII, RegInstr: MI);
806 };
807 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: SetRegWaitStates);
808 return SetRegWaitStates - WaitStatesNeeded;
809}
810
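// If \p MI is a store whose store-data operand is subject to the VALU write
// hazard, returns the index of that operand; otherwise returns -1.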
811int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
812 if (!MI.mayStore())
813 return -1;
814
815 const SIInstrInfo *TII = ST.getInstrInfo();
816 unsigned Opcode = MI.getOpcode();
817 const MCInstrDesc &Desc = MI.getDesc();
818
819 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
820 int VDataRCID = -1;
821 if (VDataIdx != -1)
822 VDataRCID = Desc.operands()[VDataIdx].RegClass;
823
824 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
825 // There is no hazard if the instruction does not use vector regs
826 // (like wbinvl1)
827 if (VDataIdx == -1)
828 return -1;
829 // For MUBUF/MTBUF instructions this hazard only exists if the
830 // instruction is not using a register in the soffset field.
831 const MachineOperand *SOffset =
832 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
833 // If we have no soffset operand, then assume this field has been
834 // hardcoded to zero.
835 if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64 &&
836 (!SOffset || !SOffset->isReg()))
837 return VDataIdx;
838 }
839
840 // MIMG instructions create a hazard if they don't use a 256-bit T# and
841 // the store size is greater than 8 bytes and they have more than two bits
842 // of their dmask set.
843 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
844 if (TII->isMIMG(MI)) {
845 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
846 assert(SRsrcIdx != -1 &&
847 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
848 (void)SRsrcIdx;
849 }
850
851 if (TII->isFLAT(MI)) {
852 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
853 if (AMDGPU::getRegBitWidth(RCID: Desc.operands()[DataIdx].RegClass) > 64)
854 return DataIdx;
855 }
856
857 return -1;
858}
859
860int
861GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
862 const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
865 const SIRegisterInfo *TRI = ST.getRegisterInfo();
866
867 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
868 int WaitStatesNeeded = 0;
869
870 if (!TRI->isVectorRegister(MRI, Reg: Def.getReg()))
871 return WaitStatesNeeded;
872 Register Reg = Def.getReg();
873 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
874 int DataIdx = createsVALUHazard(MI);
875 return DataIdx >= 0 &&
876 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
877 };
878 int WaitStatesNeededForDef =
879 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
880 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
881
882 return WaitStatesNeeded;
883}
884
885int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
886 int WaitStatesNeeded = 0;
887
888 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(MI: *VALU)) {
889 const int TransDefWaitstates = 1;
890
891 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
892 if (!SIInstrInfo::isTRANS(MI))
893 return false;
894 const SIRegisterInfo *TRI = ST.getRegisterInfo();
895 const SIInstrInfo *TII = ST.getInstrInfo();
896 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
897
898 for (const MachineOperand &Use : VALU->explicit_uses()) {
899 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
900 return true;
901 }
902
903 return false;
904 };
905
906 int WaitStatesNeededForDef =
907 TransDefWaitstates -
908 getWaitStatesSince(IsHazard: IsTransDefFn, Limit: TransDefWaitstates);
909 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
910 }
911
912 if (ST.hasDstSelForwardingHazard()) {
913 const int Shift16DefWaitstates = 1;
914
915 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
916 if (!SIInstrInfo::isVALU(MI))
917 return false;
918 const SIInstrInfo *TII = ST.getInstrInfo();
919 if (SIInstrInfo::isSDWA(MI)) {
920 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
921 if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
922 return false;
923 } else {
924 if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
925 !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
926 ->getImm() &
927 SISrcMods::DST_OP_SEL))
928 return false;
929 }
930 const SIRegisterInfo *TRI = ST.getRegisterInfo();
931 if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
932 Register Def = Dst->getReg();
933
934 for (const MachineOperand &Use : VALU->explicit_uses()) {
935 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
936 return true;
937 }
938 }
939
940 return false;
941 };
942
943 int WaitStatesNeededForDef =
944 Shift16DefWaitstates -
945 getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
946 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
947 }
948
949 if (ST.hasVDecCoExecHazard()) {
950 const int VALUWriteSGPRVALUReadWaitstates = 2;
951 const int VALUWriteEXECRWLane = 4;
952 const int VALUWriteVGPRReadlaneRead = 1;
953
954 const SIRegisterInfo *TRI = ST.getRegisterInfo();
955 const MachineRegisterInfo &MRI = MF.getRegInfo();
956 Register UseReg;
957 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
958 if (!SIInstrInfo::isVALU(MI))
959 return false;
960 return MI.modifiesRegister(UseReg, TRI);
961 };
962
963 for (const MachineOperand &Use : VALU->explicit_uses()) {
964 if (!Use.isReg())
965 continue;
966
967 UseReg = Use.getReg();
968 if (TRI->isSGPRReg(MRI, Reg: UseReg)) {
969 int WaitStatesNeededForDef =
970 VALUWriteSGPRVALUReadWaitstates -
971 getWaitStatesSince(IsVALUDefSGPRFn,
972 VALUWriteSGPRVALUReadWaitstates);
973 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
974 }
975 }
976
977 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
978 UseReg = AMDGPU::VCC;
979 int WaitStatesNeededForDef =
980 VALUWriteSGPRVALUReadWaitstates -
981 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
982 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
983 }
984
985 switch (VALU->getOpcode()) {
986 case AMDGPU::V_READLANE_B32:
987 case AMDGPU::V_READFIRSTLANE_B32: {
988 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
989 UseReg = Src->getReg();
990 int WaitStatesNeededForDef =
991 VALUWriteVGPRReadlaneRead -
992 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
993 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
994 }
995 [[fallthrough]];
996 case AMDGPU::V_WRITELANE_B32: {
997 UseReg = AMDGPU::EXEC;
998 int WaitStatesNeededForDef =
999 VALUWriteEXECRWLane -
1000 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1001 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1002 break;
1003 }
1004 default:
1005 break;
1006 }
1007 }
1008
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
1011 if (!ST.has12DWordStoreHazard())
1012 return WaitStatesNeeded;
1013
1014 const MachineRegisterInfo &MRI = MF.getRegInfo();
1015
1016 for (const MachineOperand &Def : VALU->defs()) {
1017 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def, MRI));
1018 }
1019
1020 return WaitStatesNeeded;
1021}
1022
1023int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1024 // This checks for hazards associated with inline asm statements.
1025 // Since inline asms can contain just about anything, we use this
1026 // to call/leverage other check*Hazard routines. Note that
1027 // this function doesn't attempt to address all possible inline asm
1028 // hazards (good luck), but is a collection of what has been
1029 // problematic thus far.
1030
1031 // see checkVALUHazards()
1032 if (!ST.has12DWordStoreHazard())
1033 return 0;
1034
1035 const MachineRegisterInfo &MRI = MF.getRegInfo();
1036 int WaitStatesNeeded = 0;
1037
1038 for (const MachineOperand &Op :
1039 llvm::drop_begin(RangeOrContainer: IA->operands(), N: InlineAsm::MIOp_FirstOperand)) {
1040 if (Op.isReg() && Op.isDef()) {
1041 WaitStatesNeeded =
1042 std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def: Op, MRI));
1043 }
1044 }
1045
1046 return WaitStatesNeeded;
1047}
1048
1049int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1050 const SIInstrInfo *TII = ST.getInstrInfo();
1051 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1052 const MachineRegisterInfo &MRI = MF.getRegInfo();
1053
1054 const MachineOperand *LaneSelectOp =
1055 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1056
1057 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, Reg: LaneSelectOp->getReg()))
1058 return 0;
1059
1060 Register LaneSelectReg = LaneSelectOp->getReg();
1061 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1062
1063 const int RWLaneWaitStates = 4;
1064 int WaitStatesSince = getWaitStatesSinceDef(Reg: LaneSelectReg, IsHazardDef: IsHazardFn,
1065 Limit: RWLaneWaitStates);
1066 return RWLaneWaitStates - WaitStatesSince;
1067}
1068
1069int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1070 if (!ST.hasRFEHazards())
1071 return 0;
1072
1073 const SIInstrInfo *TII = ST.getInstrInfo();
1074
1075 const int RFEWaitStates = 1;
1076
1077 auto IsHazardFn = [TII](const MachineInstr &MI) {
1078 return getHWReg(TII, RegInstr: MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1079 };
1080 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: RFEWaitStates);
1081 return RFEWaitStates - WaitStatesNeeded;
1082}
1083
1084int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1085 const SIInstrInfo *TII = ST.getInstrInfo();
1086 const int ReadM0WaitStates = 1;
1087 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1088 return ReadM0WaitStates -
1089 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1090}
1091
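// Run the fixups that repair hazards by inserting or rewriting instructions
// (s_waitcnt_depctr, v_nop, wait fields, ...) rather than by plain s_nop
// padding. Only used in hazard recognizer mode.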
1092void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1093 fixVMEMtoScalarWriteHazards(MI);
1094 fixVcmpxPermlaneHazards(MI);
1095 fixSMEMtoVectorWriteHazards(MI);
1096 fixVcmpxExecWARHazard(MI);
1097 fixLdsBranchVmemWARHazard(MI);
1098 if (ST.hasLdsDirect()) {
1099 fixLdsDirectVALUHazard(MI);
1100 fixLdsDirectVMEMHazard(MI);
1101 }
1102 fixVALUPartialForwardingHazard(MI);
1103 fixVALUTransUseHazard(MI);
1104 fixWMMAHazards(MI);
1105 fixShift64HighRegBug(MI);
1106 fixVALUMaskWriteHazard(MI);
1107}
1108
1109bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1110 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(MI: *MI))
1111 return false;
1112
1113 const SIInstrInfo *TII = ST.getInstrInfo();
1114 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1115 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1116 return (TII->isVOPC(MI) ||
1117 ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1118 MI.modifiesRegister(AMDGPU::EXEC, TRI);
1119 };
1120
1121 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1122 unsigned Opc = MI.getOpcode();
1123 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1124 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1125 };
1126
1127 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1128 std::numeric_limits<int>::max())
1129 return false;
1130
1131 // V_NOP will be discarded by SQ.
1132 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1133 // which is always a VGPR and available.
1134 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1135 Register Reg = Src0->getReg();
1136 bool IsUndef = Src0->isUndef();
1137 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1138 TII->get(AMDGPU::V_MOV_B32_e32))
1139 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1140 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1141
1142 return true;
1143}
1144
1145bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1146 if (!ST.hasVMEMtoScalarWriteHazard())
1147 return false;
1148 assert(!ST.hasExtendedWaitCounts());
1149
1150 if (!SIInstrInfo::isSALU(MI: *MI) && !SIInstrInfo::isSMRD(MI: *MI))
1151 return false;
1152
1153 if (MI->getNumDefs() == 0)
1154 return false;
1155
1156 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1157
1158 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1159 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I) &&
1160 !SIInstrInfo::isFLAT(MI: I))
1161 return false;
1162
1163 for (const MachineOperand &Def : MI->defs()) {
1164 const MachineOperand *Op =
1165 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1166 if (!Op)
1167 continue;
1168 return true;
1169 }
1170 return false;
1171 };
1172
1173 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1174 return SIInstrInfo::isVALU(MI) ||
1175 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1176 !MI.getOperand(0).getImm()) ||
1177 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1178 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1179 };
1180
1181 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1182 std::numeric_limits<int>::max())
1183 return false;
1184
1185 const SIInstrInfo *TII = ST.getInstrInfo();
1186 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1187 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1188 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1189 return true;
1190}
1191
1192bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1193 if (!ST.hasSMEMtoVectorWriteHazard())
1194 return false;
1195 assert(!ST.hasExtendedWaitCounts());
1196
1197 if (!SIInstrInfo::isVALU(MI: *MI))
1198 return false;
1199
1200 unsigned SDSTName;
1201 switch (MI->getOpcode()) {
1202 case AMDGPU::V_READLANE_B32:
1203 case AMDGPU::V_READFIRSTLANE_B32:
1204 SDSTName = AMDGPU::OpName::vdst;
1205 break;
1206 default:
1207 SDSTName = AMDGPU::OpName::sdst;
1208 break;
1209 }
1210
1211 const SIInstrInfo *TII = ST.getInstrInfo();
1212 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1213 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
1214 const MachineOperand *SDST = TII->getNamedOperand(MI&: *MI, OperandName: SDSTName);
1215 if (!SDST) {
1216 for (const auto &MO : MI->implicit_operands()) {
1217 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(MO.getReg()))) {
1218 SDST = &MO;
1219 break;
1220 }
1221 }
1222 }
1223
1224 if (!SDST)
1225 return false;
1226
1227 const Register SDSTReg = SDST->getReg();
1228 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1229 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1230 };
1231
1232 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1233 if (TII->isSALU(MI)) {
1234 switch (MI.getOpcode()) {
1235 case AMDGPU::S_SETVSKIP:
1236 case AMDGPU::S_VERSION:
1237 case AMDGPU::S_WAITCNT_VSCNT:
1238 case AMDGPU::S_WAITCNT_VMCNT:
1239 case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
1241 return false;
1242 case AMDGPU::S_WAITCNT_LGKMCNT:
1243 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1244 return (MI.getOperand(1).getImm() == 0) &&
1245 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1246 case AMDGPU::S_WAITCNT: {
1247 const int64_t Imm = MI.getOperand(i: 0).getImm();
1248 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(Version: IV, Encoded: Imm);
1249 // DsCnt corresponds to LGKMCnt here.
1250 return (Decoded.DsCnt == 0);
1251 }
1252 default:
1253 // SOPP instructions cannot mitigate the hazard.
1254 if (TII->isSOPP(MI))
1255 return false;
1256 // At this point the SALU can be assumed to mitigate the hazard
1257 // because either:
1258 // (a) it is independent of the at risk SMEM (breaking chain),
1259 // or
1260 // (b) it is dependent on the SMEM, in which case an appropriate
1261 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1262 // SMEM instruction.
1263 return true;
1264 }
1265 }
1266 return false;
1267 };
1268
1269 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1270 std::numeric_limits<int>::max())
1271 return false;
1272
1273 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1274 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1275 .addImm(0);
1276 return true;
1277}
1278
1279bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1280 if (!ST.hasVcmpxExecWARHazard())
1281 return false;
1282 assert(!ST.hasExtendedWaitCounts());
1283
1284 if (!SIInstrInfo::isVALU(MI: *MI))
1285 return false;
1286
1287 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1288 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1289 return false;
1290
1291 auto IsHazardFn = [TRI](const MachineInstr &I) {
1292 if (SIInstrInfo::isVALU(MI: I))
1293 return false;
1294 return I.readsRegister(AMDGPU::EXEC, TRI);
1295 };
1296
1297 const SIInstrInfo *TII = ST.getInstrInfo();
1298 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1299 if (SIInstrInfo::isVALU(MI)) {
1300 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1301 return true;
1302 for (auto MO : MI.implicit_operands())
1303 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(MO.getReg())))
1304 return true;
1305 }
1306 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1307 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1308 return true;
1309 return false;
1310 };
1311
1312 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1313 std::numeric_limits<int>::max())
1314 return false;
1315
1316 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1317 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1318 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1319 return true;
1320}
1321
1322static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1323 const GCNSubtarget &ST) {
1324 if (!ST.hasLdsBranchVmemWARHazard())
1325 return false;
1326
1327 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1328 // instructions need to appear in the same function.
1329 bool HasLds = false;
1330 bool HasVmem = false;
1331 for (auto &MBB : MF) {
1332 for (auto &MI : MBB) {
1333 HasLds |= SIInstrInfo::isDS(MI);
1334 HasVmem |=
1335 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1336 if (HasLds && HasVmem)
1337 return true;
1338 }
1339 }
1340 return false;
1341}
1342
1343static bool isStoreCountWaitZero(const MachineInstr &I) {
1344 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1345 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1346 !I.getOperand(1).getImm();
1347}
1348
1349bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1350 if (!RunLdsBranchVmemWARHazardFixup)
1351 return false;
1352
1353 assert(ST.hasLdsBranchVmemWARHazard());
1354 assert(!ST.hasExtendedWaitCounts());
1355
1356 auto IsHazardInst = [](const MachineInstr &MI) {
1357 if (SIInstrInfo::isDS(MI))
1358 return 1;
1359 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1360 return 2;
1361 return 0;
1362 };
1363
1364 auto InstType = IsHazardInst(*MI);
1365 if (!InstType)
1366 return false;
1367
1368 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1369 return IsHazardInst(I) || isStoreCountWaitZero(I);
1370 };
1371
1372 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1373 if (!I.isBranch())
1374 return false;
1375
1376 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1377 auto InstType2 = IsHazardInst(I);
1378 return InstType2 && InstType != InstType2;
1379 };
1380
1381 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1382 auto InstType2 = IsHazardInst(I);
1383 if (InstType == InstType2)
1384 return true;
1385
1386 return isStoreCountWaitZero(I);
1387 };
1388
1389 return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) !=
1390 std::numeric_limits<int>::max();
1391 };
1392
1393 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1394 std::numeric_limits<int>::max())
1395 return false;
1396
1397 const SIInstrInfo *TII = ST.getInstrInfo();
1398 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1399 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1400 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1401 .addImm(0);
1402
1403 return true;
1404}
1405
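// Determine how many in-flight VALU results an LDSDIR instruction must wait
// for (WAR/WAW on its destination VGPR) and encode that in its waitvdst
// field.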
1406bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1407 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1408 return false;
1409
1410 const int NoHazardWaitStates = 15;
1411 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1412 const Register VDSTReg = VDST->getReg();
1413
1414 bool VisitedTrans = false;
1415 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1416 if (!SIInstrInfo::isVALU(MI: I))
1417 return false;
1418 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(MI: I);
1419 // Cover both WAR and WAW
1420 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1421 };
1422 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1423 if (WaitStates >= NoHazardWaitStates)
1424 return true;
1425 // Instructions which cause va_vdst==0 expire hazard
1426 return SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isFLAT(MI: I) ||
1427 SIInstrInfo::isDS(MI: I) || SIInstrInfo::isEXP(MI: I);
1428 };
1429 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1430 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1431 };
1432
1433 DenseSet<const MachineBasicBlock *> Visited;
1434 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1435 std::next(x: MI->getReverseIterator()), 0,
1436 IsExpiredFn, Visited, GetWaitStatesFn);
1437
1438 // Transcendentals can execute in parallel to other VALUs.
1439 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1440 if (VisitedTrans)
1441 Count = 0;
1442
1443 MachineOperand *WaitVdstOp =
1444 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1445 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1446
1447 return true;
1448}
1449
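// An LDSDIR instruction must not issue while a VMEM/FLAT/DS access to its
// destination VGPR is still outstanding; either set its waitvsrc field (when
// supported) or insert an s_waitcnt_depctr vm_vsrc(0) before it.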
1450bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1451 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1452 return false;
1453
1454 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1455 const Register VDSTReg = VDST->getReg();
1456
1457 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1458 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isFLAT(MI: I) &&
1459 !SIInstrInfo::isDS(MI: I))
1460 return false;
1461 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1462 };
1463 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1464 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1465 // according to the type of VMEM instruction.
1466 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1467 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1468 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1469 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1470 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1471 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1472 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1473 };
1474
1475 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1476 std::numeric_limits<int>::max())
1477 return false;
1478
1479 if (LdsdirCanWait) {
1480 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1481 } else {
1482 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1483 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1484 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1485 }
1486
1487 return true;
1488}
1489
1490bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1491 if (!ST.hasVALUPartialForwardingHazard())
1492 return false;
1493 assert(!ST.hasExtendedWaitCounts());
1494
1495 if (!ST.isWave64() || !SIInstrInfo::isVALU(MI: *MI))
1496 return false;
1497
1498 SmallSetVector<Register, 4> SrcVGPRs;
1499
1500 for (const MachineOperand &Use : MI->explicit_uses()) {
1501 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1502 SrcVGPRs.insert(X: Use.getReg());
1503 }
1504
1505 // Only applies with >= 2 unique VGPR sources
1506 if (SrcVGPRs.size() <= 1)
1507 return false;
1508
1509 // Look for the following pattern:
1510 // Va <- VALU [PreExecPos]
1511 // intv1
1512 // Exec <- SALU [ExecPos]
1513 // intv2
1514 // Vb <- VALU [PostExecPos]
1515 // intv3
1516 // MI Va, Vb (WaitState = 0)
1517 //
1518 // Where:
1519 // intv1 + intv2 <= 2 VALUs
1520 // intv3 <= 4 VALUs
1521 //
1522 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1523
1524 const int Intv1plus2MaxVALUs = 2;
1525 const int Intv3MaxVALUs = 4;
1526 const int IntvMaxVALUs = 6;
1527 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1528
1529 struct StateType {
1530 SmallDenseMap<Register, int, 4> DefPos;
1531 int ExecPos = std::numeric_limits<int>::max();
1532 int VALUs = 0;
1533 };
1534
1535 StateType State;
1536
  // This lambda performs both the hazard detection and the expiry testing.
1538 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1539 // Too many VALU states have passed
1540 if (State.VALUs > NoHazardVALUWaitStates)
1541 return HazardExpired;
1542
1543 // Instructions which cause va_vdst==0 expire hazard
1544 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1545 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1546 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1547 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1548 return HazardExpired;
1549
    // Track register writes.
1551 bool Changed = false;
1552 if (SIInstrInfo::isVALU(MI: I)) {
1553 for (Register Src : SrcVGPRs) {
1554 if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Src, &TRI)) {
1555 State.DefPos[Src] = State.VALUs;
1556 Changed = true;
1557 }
1558 }
1559 } else if (SIInstrInfo::isSALU(MI: I)) {
1560 if (State.ExecPos == std::numeric_limits<int>::max()) {
1561 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1562 State.ExecPos = State.VALUs;
1563 Changed = true;
1564 }
1565 }
1566 }
1567
1568 // Early expiration: too many VALUs in intv3
1569 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1570 return HazardExpired;
1571
1572 // Only evaluate state if something changed
1573 if (!Changed)
1574 return NoHazardFound;
1575
1576 // Determine positions of VALUs pre/post exec change
1577 if (State.ExecPos == std::numeric_limits<int>::max())
1578 return NoHazardFound;
1579
1580 int PreExecPos = std::numeric_limits<int>::max();
1581 int PostExecPos = std::numeric_limits<int>::max();
1582
1583 for (auto Entry : State.DefPos) {
1584 int DefVALUs = Entry.second;
1585 if (DefVALUs != std::numeric_limits<int>::max()) {
1586 if (DefVALUs >= State.ExecPos)
1587 PreExecPos = std::min(a: PreExecPos, b: DefVALUs);
1588 else
1589 PostExecPos = std::min(a: PostExecPos, b: DefVALUs);
1590 }
1591 }
1592
    // Need a VALU def after the exec change.
1594 if (PostExecPos == std::numeric_limits<int>::max())
1595 return NoHazardFound;
1596
1597 // Too many VALUs in intv3?
1598 int Intv3VALUs = PostExecPos;
1599 if (Intv3VALUs > Intv3MaxVALUs)
1600 return HazardExpired;
1601
1602 // Too many VALUs in intv2?
1603 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1604 if (Intv2VALUs > Intv1plus2MaxVALUs)
1605 return HazardExpired;
1606
    // Need a VALU def before the exec change.
1608 if (PreExecPos == std::numeric_limits<int>::max())
1609 return NoHazardFound;
1610
1611 // Too many VALUs in intv1?
1612 int Intv1VALUs = PreExecPos - State.ExecPos;
1613 if (Intv1VALUs > Intv1plus2MaxVALUs)
1614 return HazardExpired;
1615
1616 // Too many VALUs in intv1 + intv2
1617 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1618 return HazardExpired;
1619
1620 return HazardFound;
1621 };
1622 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1623 if (SIInstrInfo::isVALU(MI))
1624 State.VALUs += 1;
1625 };
1626
1627 DenseSet<const MachineBasicBlock *> Visited;
1628 if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(),
1629 I: std::next(x: MI->getReverseIterator()), Visited))
1630 return false;
1631
1632 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1633 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1634 .addImm(0x0fff);
1635
1636 return true;
1637}
1638
1639bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1640 if (!ST.hasVALUTransUseHazard())
1641 return false;
1642 assert(!ST.hasExtendedWaitCounts());
1643
1644 if (!SIInstrInfo::isVALU(MI: *MI))
1645 return false;
1646
1647 SmallSet<Register, 4> SrcVGPRs;
1648
1649 for (const MachineOperand &Use : MI->explicit_uses()) {
1650 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1651 SrcVGPRs.insert(V: Use.getReg());
1652 }
1653
1654 // Look for the following pattern:
1655 // Va <- TRANS VALU
1656 // intv
1657 // MI Va (WaitState = 0)
1658 //
1659 // Where:
1660 // intv <= 5 VALUs / 1 TRANS
1661 //
1662 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1663
1664 const int IntvMaxVALUs = 5;
1665 const int IntvMaxTRANS = 1;
1666
1667 struct StateType {
1668 int VALUs = 0;
1669 int TRANS = 0;
1670 };
1671
1672 StateType State;
1673
  // This lambda performs both the hazard detection and the expiry testing.
1675 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1676 // Too many VALU states have passed
1677 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1678 return HazardExpired;
1679
1680 // Instructions which cause va_vdst==0 expire the hazard
1681 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1682 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1683 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1684 I.getOperand(0).getImm() == 0x0fff))
1685 return HazardExpired;
1686
1687 // Check whether a TRANS instruction writes one of MI's source VGPRs.
1688 if (SIInstrInfo::isTRANS(MI: I)) {
1689 for (Register Src : SrcVGPRs) {
1690 if (I.modifiesRegister(Src, &TRI)) {
1691 return HazardFound;
1692 }
1693 }
1694 }
1695
1696 return NoHazardFound;
1697 };
1698 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1699 if (SIInstrInfo::isVALU(MI))
1700 State.VALUs += 1;
1701 if (SIInstrInfo::isTRANS(MI))
1702 State.TRANS += 1;
1703 };
1704
1705 DenseSet<const MachineBasicBlock *> Visited;
1706 if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(),
1707 I: std::next(x: MI->getReverseIterator()), Visited))
1708 return false;
1709
1710 // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1711 // hazard is avoided.
1712 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1713 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1714 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1715
1716 return true;
1717}
1718
1719bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1720 if (!SIInstrInfo::isWMMA(MI: *MI) && !SIInstrInfo::isSWMMAC(MI: *MI))
1721 return false;
1722
1723 const SIInstrInfo *TII = ST.getInstrInfo();
1724 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1725
1726 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1727 if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I))
1728 return false;
1729
1730 // Check if Src0 (matrix A) or Src1 (matrix B) of the current WMMA instruction
1731 // overlaps with the dest (matrix D) of the previous WMMA.
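// For example (illustrative):
//   v_wmma_* v[0:7], ...                ; previous WMMA writes D = v[0:7]
//   v_wmma_* v[16:23], v[0:7], v[8:15]  ; current WMMA reads A from v[0:7] -> hazard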
1732 const Register CurSrc0Reg =
1733 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1734 const Register CurSrc1Reg =
1735 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1736
1737 const Register PrevDstReg =
1738 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1739
1740 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1741 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1742 return true;
1743 }
1744
1745 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1746 // but Index can't overlap with PrevDstReg.
1747 if (AMDGPU::isGFX12Plus(ST)) {
1748 if (SIInstrInfo::isSWMMAC(MI: *MI)) {
1749 const Register CurIndex =
1750 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1751 if (TRI->regsOverlap(PrevDstReg, CurIndex))
1752 return true;
1753 }
1754 return false;
1755 }
1756
1757 return false;
1758 };
1759
1760 auto IsExpiredFn = [](const MachineInstr &I, int) {
1761 return SIInstrInfo::isVALU(MI: I);
1762 };
1763
1764 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1765 std::numeric_limits<int>::max())
1766 return false;
1767
1768 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1769
1770 return true;
1771}
1772
1773bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1774 if (!ST.hasShift64HighRegBug())
1775 return false;
1776 assert(!ST.hasExtendedWaitCounts());
1777
1778 switch (MI->getOpcode()) {
1779 default:
1780 return false;
1781 case AMDGPU::V_LSHLREV_B64_e64:
1782 case AMDGPU::V_LSHRREV_B64_e64:
1783 case AMDGPU::V_ASHRREV_I64_e64:
1784 break;
1785 }
1786
1787 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1788 if (!Amt->isReg())
1789 return false;
1790
1791 Register AmtReg = Amt->getReg();
1792 const MachineRegisterInfo &MRI = MF.getRegInfo();
1793 // Check if this is the last VGPR in the allocation block.
1794 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1795 return false;
1796
1797 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1798 return false;
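// In other words, the fixup below only applies when the shift amount lives in
// v7, v15, ..., i.e. the last VGPR of an 8-register allocation granule, and
// the following VGPR (if any) is not allocated.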
1799
1800 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1801 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1802 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1803 bool Overlapped = OverlappedSrc || OverlappedDst;
1804
1805 assert(!OverlappedDst || !OverlappedSrc ||
1806 Src1->getReg() == MI->getOperand(0).getReg());
1807 assert(ST.needsAlignedVGPRs());
1808 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1809
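// Strategy (sketch): pick a VGPR (or an aligned VGPR pair when the amount
// overlaps src1/dst) that MI does not touch, V_SWAP the shift amount into it,
// rewrite MI to use the new register, and V_SWAP back after MI.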
1810 Register NewReg;
1811 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1812 : AMDGPU::VGPR_32RegClass) {
1813 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1814 NewReg = Reg;
1815 break;
1816 }
1817 }
1818
1819 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1820 : NewReg;
1821 Register NewAmtLo;
1822
1823 if (Overlapped)
1824 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1825
1826 DebugLoc DL = MI->getDebugLoc();
1827 MachineBasicBlock *MBB = MI->getParent();
1828 // Insert a full wait count because the found register might be pending a wait.
1829 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1830 .addImm(0);
1831
1832 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1833 if (Overlapped)
1834 runOnInstruction(
1835 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1836 .addDef(AmtReg - 1)
1837 .addReg(AmtReg - 1, RegState::Undef)
1838 .addReg(NewAmtLo, RegState::Undef));
1839 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1840 .addDef(AmtReg)
1841 .addReg(AmtReg, RegState::Undef)
1842 .addReg(NewAmt, RegState::Undef));
1843
1844 // Instructions emitted after the current instruction will be picked up
1845 // naturally by the parent loop of the hazard recognizer.
1846 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1847 AmtReg)
1848 .addDef(NewAmt)
1849 .addReg(NewAmt)
1850 .addReg(AmtReg);
1851 if (Overlapped)
1852 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1853 AmtReg - 1)
1854 .addDef(NewAmtLo)
1855 .addReg(NewAmtLo)
1856 .addReg(AmtReg - 1);
1857
1858 // Re-running the hazard recognizer on the modified instruction is not necessary:
1859 // the inserted V_SWAP_B32 instructions have already both read and written the
1860 // new registers, so hazards related to these registers have already been handled.
1861 Amt->setReg(NewAmt);
1862 Amt->setIsKill(false);
1863 // We do not update liveness, so the verifier may see the register as undef.
1864 Amt->setIsUndef();
1865 if (OverlappedDst)
1866 MI->getOperand(i: 0).setReg(NewReg);
1867 if (OverlappedSrc) {
1868 Src1->setReg(NewReg);
1869 Src1->setIsKill(false);
1870 Src1->setIsUndef();
1871 }
1872
1873 return true;
1874}
1875
1876int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1877 int NSAtoVMEMWaitStates = 1;
1878
1879 if (!ST.hasNSAtoVMEMBug())
1880 return 0;
1881
1882 if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isMTBUF(MI: *MI))
1883 return 0;
1884
1885 const SIInstrInfo *TII = ST.getInstrInfo();
1886 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1887 if (!Offset || (Offset->getImm() & 6) == 0)
1888 return 0;
1889
1890 auto IsHazardFn = [TII](const MachineInstr &I) {
1891 if (!SIInstrInfo::isMIMG(MI: I))
1892 return false;
1893 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode());
1894 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1895 TII->getInstSizeInBytes(I) >= 16;
1896 };
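// i.e. the hazard is an NSA-encoded MIMG instruction (>= 16 bytes, so more
// than one NSA dword) immediately preceding this buffer access; a single wait
// state is sufficient.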
1897
1898 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: 1);
1899}
1900
1901int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1902 int FPAtomicToDenormModeWaitStates = 3;
1903
1904 if (!ST.hasFPAtomicToDenormModeHazard())
1905 return 0;
1906 assert(!ST.hasExtendedWaitCounts());
1907
1908 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1909 return 0;
1910
1911 auto IsHazardFn = [](const MachineInstr &I) {
1912 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isFLAT(MI: I))
1913 return false;
1914 return SIInstrInfo::isFPAtomic(MI: I);
1915 };
1916
1917 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1918 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1919 return true;
1920
1921 switch (MI.getOpcode()) {
1922 case AMDGPU::S_WAITCNT:
1923 case AMDGPU::S_WAITCNT_VSCNT:
1924 case AMDGPU::S_WAITCNT_VMCNT:
1925 case AMDGPU::S_WAITCNT_EXPCNT:
1926 case AMDGPU::S_WAITCNT_LGKMCNT:
1927 case AMDGPU::S_WAIT_IDLE:
1928 return true;
1929 default:
1930 break;
1931 }
1932
1933 return false;
1934 };
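// Summary: an FP atomic (VMEM or FLAT) followed within 3 wait states by
// s_denorm_mode is hazardous; any intervening VALU or one of the wait-count
// instructions listed above resolves it.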
1935
1936 return FPAtomicToDenormModeWaitStates -
1937 ::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn);
1938}
1939
1940int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1941 assert(SIInstrInfo::isMAI(*MI));
1942
1943 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1944}
1945
1946int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1947 // Early exit if no padding is requested.
1948 if (MFMAPaddingRatio == 0)
1949 return 0;
1950
1951 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1952 if (!SIInstrInfo::isMFMA(MI: *MI) || MFI->getOccupancy() < 2)
1953 return 0;
1954
1955 int NeighborMFMALatency = 0;
1956 auto IsNeighboringMFMA = [&NeighborMFMALatency,
1957 this](const MachineInstr &MI) {
1958 if (!SIInstrInfo::isMFMA(MI))
1959 return false;
1960
1961 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1962 return true;
1963 };
1964
1965 const int MaxMFMAPipelineWaitStates = 16;
1966 int WaitStatesSinceNeighborMFMA =
1967 getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates);
1968
1969 int NeighborMFMAPaddingNeeded =
1970 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1971 WaitStatesSinceNeighborMFMA;
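// For example, with amdgpu-mfma-padding-ratio=50 and a neighboring MFMA of
// latency 16, the target padding is 16 * 50 / 100 = 8 wait states; if 3 wait
// states have already passed since that MFMA, 5 more are requested.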
1972
1973 return std::max(a: 0, b: NeighborMFMAPaddingNeeded);
1974}
1975
1976int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1977 int WaitStatesNeeded = 0;
1978 unsigned Opc = MI->getOpcode();
1979
1980 auto IsVALUFn = [](const MachineInstr &MI) {
1981 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
1982 };
1983
1984 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1985 const int LegacyVALUWritesVGPRWaitStates = 2;
1986 const int VALUWritesExecWaitStates = 4;
1987 const int MaxWaitStates = 4;
1988
1989 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1990 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1991 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
1992
1993 if (WaitStatesNeeded < MaxWaitStates) {
1994 for (const MachineOperand &Use : MI->explicit_uses()) {
1995 const int MaxWaitStates = 2;
1996
1997 if (!Use.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1998 continue;
1999
2000 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2001 getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
2002 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2003
2004 if (WaitStatesNeeded == MaxWaitStates)
2005 break;
2006 }
2007 }
2008 }
2009
2010 for (const MachineOperand &Op : MI->explicit_operands()) {
2011 if (!Op.isReg() || !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2012 continue;
2013
2014 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2015 continue;
2016
2017 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2018 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2019 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2020 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2021 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2022 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2023 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2024 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2025 const int MaxWaitStates = 18;
2026 Register Reg = Op.getReg();
2027 unsigned HazardDefLatency = 0;
2028
2029 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2030 this](const MachineInstr &MI) {
2031 if (!SIInstrInfo::isMFMA(MI))
2032 return false;
2033 Register DstReg = MI.getOperand(i: 0).getReg();
2034 if (DstReg == Reg)
2035 return false;
2036 HazardDefLatency =
2037 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2038 return TRI.regsOverlap(DstReg, Reg);
2039 };
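// A HazardDefLatency of 2, 8 or 16 corresponds to a 4x4, 16x16 or 32x32 MFMA
// respectively, matching the cases in the switches below.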
2040
2041 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2042 MaxWaitStates);
2043 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2044 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2045 int OpNo = Op.getOperandNo();
2046 if (OpNo == SrcCIdx) {
2047 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2048 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2049 switch (HazardDefLatency) {
2050 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2051 break;
2052 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2053 break;
2054 case 16: [[fallthrough]];
2055 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2056 break;
2057 }
2058 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2059 switch (HazardDefLatency) {
2060 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2061 break;
2062 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2063 break;
2064 case 16: [[fallthrough]];
2065 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2066 break;
2067 }
2068 }
2069
2070 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2071 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2072
2073 if (WaitStatesNeeded == MaxWaitStates)
2074 return WaitStatesNeeded; // Early exit.
2075
2076 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2077 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2078 return false;
2079 Register DstReg = MI.getOperand(i: 0).getReg();
2080 return TRI.regsOverlap(Reg, DstReg);
2081 };
2082
2083 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2084 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2085 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2086 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2087 if (OpNo == SrcCIdx)
2088 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2089 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2090 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2091
2092 WaitStatesNeededForUse = NeedWaitStates -
2093 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2094 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2095
2096 if (WaitStatesNeeded == MaxWaitStates)
2097 return WaitStatesNeeded; // Early exit.
2098 }
2099
2100 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2101 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2102 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2103 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2104 const int MaxWaitStates = 13;
2105 Register DstReg = MI->getOperand(i: 0).getReg();
2106 unsigned HazardDefLatency = 0;
2107
2108 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2109 this](const MachineInstr &MI) {
2110 if (!SIInstrInfo::isMFMA(MI))
2111 return false;
2112 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2113 HazardDefLatency =
2114 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2115 return TRI.regsOverlap(Reg, DstReg);
2116 };
2117
2118 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2119 int NeedWaitStates;
2120 switch (HazardDefLatency) {
2121 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2122 break;
2123 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2124 break;
2125 case 16: [[fallthrough]];
2126 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2127 break;
2128 }
2129
2130 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2131 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2132 }
2133
2134 // Pad neighboring MFMA with noops for better inter-wave performance.
2135 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2136
2137 return WaitStatesNeeded;
2138}
2139
2140static int
2141GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2142 // 2 pass -> 3
2143 // 4 pass -> 5
2144 // 8 pass -> 9
2145 // 16 pass -> 17
2146 return NumPasses + 1;
2147}
2148
2149static int
2150GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2151 // 2 pass -> 2
2152 // 4 pass -> 4
2153 // 8 pass -> 8
2154 // 16 pass -> 16
2155 return NumPasses;
2156}
2157
2158static int
2159GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2160 // 2 pass -> 4
2161 // 4 pass -> 6
2162 // 8 pass -> 10
2163 // 16 pass -> 18
2164 return NumPasses + 2;
2165}
2166
2167static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2168 // 2 pass -> 5
2169 // 4 pass -> 7
2170 // 8 pass -> 11
2171 // 16 pass -> 19
2172 return NumPasses + 3;
2173}
2174
2175int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2176 int WaitStatesNeeded = 0;
2177 unsigned Opc = MI->getOpcode();
2178
2179 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2180 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2181 };
2182
2183 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2184 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2185 !SIInstrInfo::isDOT(MI);
2186 };
2187
2188 if (!SIInstrInfo::isMFMA(MI: *MI))
2189 return WaitStatesNeeded;
2190
2191 const int VALUWritesExecWaitStates = 4;
2192 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2193 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2194 VALUWritesExecWaitStates);
2195 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2196
2197 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2198
2199 // Loop for both the DGEMM and S/HGEMM second-instruction cases.
2200 for (const MachineOperand &Use : MI->explicit_uses()) {
2201 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2202 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2203 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2204 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2205 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2206 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2207 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2208 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2209 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2210 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2211 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2212 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2213 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2214 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2215 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2216 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2217 const int MaxWaitStates = 19;
2218
2219 if (!Use.isReg())
2220 continue;
2221 Register Reg = Use.getReg();
2222 bool FullReg;
2223 const MachineInstr *MI1;
2224
2225 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2226 this](const MachineInstr &MI) {
2227 if (!SIInstrInfo::isMFMA(MI))
2228 return false;
2229 Register DstReg = MI.getOperand(i: 0).getReg();
2230 FullReg = (DstReg == Reg);
2231 MI1 = &MI;
2232 return TRI.regsOverlap(DstReg, Reg);
2233 };
2234
2235 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2236 getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates);
2237 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2238
2239 int NumWaitStates =
2240 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2241 if (NumWaitStates == std::numeric_limits<int>::max())
2242 continue;
2243
2244 int OpNo = Use.getOperandNo();
2245 unsigned Opc1 = MI1->getOpcode();
2246 int NeedWaitStates = 0;
2247 if (OpNo == SrcCIdx) {
2248 if (!isDGEMM(Opcode: Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opcode: Opc1))) {
2249 NeedWaitStates = 0;
2250 } else if (FullReg) {
2251 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2252 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2253 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2254 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2255 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2256 else if (ST.hasGFX940Insts() &&
2257 TSchedModel.computeInstrLatency(MI: MI1) == 2)
2258 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2259 } else {
2260 switch (Opc1) {
2261 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2262 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2263 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2264 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2265 if (!isXDL(ST, MI: *MI))
2266 NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2267 break;
2268 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2269 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2270 if (!isXDL(ST, MI: *MI))
2271 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2272 break;
2273 default:
2274 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2275 if (ST.hasGFX940Insts()) {
2276 if (isXDL(ST, MI: *MI) && !isXDL(ST, MI: *MI1))
2277 break;
2278
2279 NeedWaitStates =
2280 isXDL(ST, MI: *MI1)
2281 ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2282 NumPasses)
2283 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2284 NumPasses);
2285 break;
2286 }
2287
2288 switch (NumPasses) {
2289 case 2:
2290 NeedWaitStates =
2291 isDGEMM(Opcode: Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2292 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2293 break;
2294 case 8:
2295 NeedWaitStates =
2296 isDGEMM(Opcode: Opc)
2297 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2298 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2299 break;
2300 case 16:
2301 NeedWaitStates =
2302 isDGEMM(Opcode: Opc)
2303 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2304 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2305 break;
2306 default:
2307 llvm_unreachable("unexpected number of passes");
2308 }
2309 }
2310 }
2311 } else {
2312 switch (Opc1) {
2313 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2314 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2315 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2316 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2317 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2318 break;
2319 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2320 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2321 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2322 break;
2323 default:
2324 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2325
2326 if (ST.hasGFX940Insts()) {
2327 NeedWaitStates =
2328 isXDL(ST, MI: *MI1)
2329 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2330 NumPasses)
2331 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2332 NumPasses);
2333 break;
2334 }
2335
2336 switch (NumPasses) {
2337 case 2:
2338 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2339 break;
2340 case 4:
2341 llvm_unreachable("unexpected number of passes for mfma");
2342 case 8:
2343 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2344 break;
2345 case 16: [[fallthrough]];
2346 default:
2347 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2348 }
2349 }
2350 }
2351 if (WaitStatesNeeded >= NeedWaitStates)
2352 continue;
2353
2354 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2355 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2356
2357 if (WaitStatesNeeded == MaxWaitStates)
2358 break;
2359 }
2360
2361 // Pad neighboring MFMA with noops for better inter-wave performance.
2362 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2363
2364 return WaitStatesNeeded;
2365}
2366
2367int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2368 // On gfx90a+ the relevant hazards are checked in checkMAIVALUHazards().
2369 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2370 return 0;
2371
2372 int WaitStatesNeeded = 0;
2373
2374 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2375 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2376 };
2377
2378 for (const MachineOperand &Op : MI->explicit_uses()) {
2379 if (!Op.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2380 continue;
2381
2382 Register Reg = Op.getReg();
2383
2384 const int AccVgprReadLdStWaitStates = 2;
2385 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2386 const int MaxWaitStates = 2;
2387
2388 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2389 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2390 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2391
2392 if (WaitStatesNeeded == MaxWaitStates)
2393 return WaitStatesNeeded; // Early exit.
2394
2395 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2396 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2397 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2398 return false;
2399 auto IsVALUFn = [](const MachineInstr &MI) {
2400 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2401 };
2402 return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: 2 /*MaxWaitStates*/) <
2403 std::numeric_limits<int>::max();
2404 };
2405
2406 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2407 getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates);
2408 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2409 }
2410
2411 return WaitStatesNeeded;
2412}
2413
2414static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2415 // 2 pass -> 4
2416 // 4 pass -> 6
2417 // 8 pass -> 10
2418 // 16 pass -> 18
2419 return NumPasses + 2;
2420}
2421
2422static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2423 // 2 pass -> 5
2424 // 4 pass -> 7
2425 // 8 pass -> 11
2426 // 16 pass -> 19
2427 return NumPasses + 3;
2428}
2429
2430static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2431 // 2 pass -> 5
2432 // 4 pass -> 7
2433 // 8 pass -> 11
2434 // 16 pass -> 19
2435 return NumPasses + 3;
2436}
2437
2438static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2439 // 2 pass -> 4
2440 // 4 pass -> 6
2441 // 8 pass -> 10
2442 // 16 pass -> 18
2443 return NumPasses + 2;
2444}
2445
2446int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2447 if (!ST.hasGFX90AInsts())
2448 return 0;
2449
2450 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2451 return isDGEMM(Opcode: MI.getOpcode());
2452 };
2453
2454 // This is checked in checkMAIHazards90A()
2455 if (SIInstrInfo::isMFMA(MI: *MI))
2456 return 0;
2457
2458 const MachineRegisterInfo &MRI = MF.getRegInfo();
2459
2460 int WaitStatesNeeded = 0;
2461
2462 bool IsMem = SIInstrInfo::isVMEM(MI: *MI) ||
2463 SIInstrInfo::isFLAT(MI: *MI) ||
2464 SIInstrInfo::isDS(MI: *MI);
2465 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(MI: *MI);
2466 bool IsVALU = SIInstrInfo::isVALU(MI: *MI);
2467
2468 const MachineInstr *MFMA = nullptr;
2469 unsigned Reg;
2470 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2471 if (!SIInstrInfo::isMFMA(MI) ||
2472 !TRI.regsOverlap(MI.getOperand(i: 0).getReg(), Reg))
2473 return false;
2474 MFMA = &MI;
2475 return true;
2476 };
2477
2478 const MachineInstr *DOT = nullptr;
2479 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2480 if (!SIInstrInfo::isDOT(MI) ||
2481 !TRI.regsOverlap(MI.getOperand(i: 0).getReg(), Reg))
2482 return false;
2483 DOT = &MI;
2484 return true;
2485 };
2486
2487 bool DGEMMAfterVALUWrite = false;
2488 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2489 // Found DGEMM on reverse traversal to def.
2490 if (isDGEMM(Opcode: MI.getOpcode()))
2491 DGEMMAfterVALUWrite = true;
2492
2493 // Only a hazard if the register is defined by a VALU and a DGEMM is found
2494 // after the def.
2495 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2496 return false;
2497
2498 return true;
2499 };
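// i.e. the hazardous program-order sequence is: a VALU writes the register,
// then a DGEMM issues, then this VMEM/FLAT/DS instruction reads the register.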
2500
2501 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2502 AMDGPU::OpName::src2);
2503
2504 if (IsMemOrExport || IsVALU) {
2505 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2506 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2507 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2508 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2509 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2510 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2511 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2512 const int DotWriteSameDotReadSrcAB = 3;
2513 const int DotWriteDifferentVALURead = 3;
2514 const int DMFMABetweenVALUWriteVMEMRead = 2;
2515 const int MaxWaitStates = 19;
2516
2517 for (const MachineOperand &Use : MI->explicit_uses()) {
2518 if (!Use.isReg())
2519 continue;
2520 Reg = Use.getReg();
2521
2522 DOT = nullptr;
2523 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
2524 Limit: MaxWaitStates);
2525 if (DOT) {
2526 int NeedWaitStates = 0;
2527 if (DOT->getOpcode() == MI->getOpcode()) {
2528 if (&Use - &MI->getOperand(i: 0) != SrcCIdx)
2529 NeedWaitStates = DotWriteSameDotReadSrcAB;
2530 } else {
2531 NeedWaitStates = DotWriteDifferentVALURead;
2532 }
2533
2534 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2535 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2536 }
2537
2538 // Workaround for a HW data hazard bug observed only on GFX90A. When there
2539 // is a DGEMM instruction in-between a VALU and a VMEM instruction, the SQ
2540 // incorrectly fails to insert the two wait states between the two
2541 // instructions that are needed to avoid the data hazard.
2542 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2543 DGEMMAfterVALUWrite = false;
2544 if (TRI.isVectorRegister(MRI, Reg)) {
2545 int WaitStatesNeededForUse =
2546 DMFMABetweenVALUWriteVMEMRead -
2547 getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard,
2548 Limit: DMFMABetweenVALUWriteVMEMRead);
2549
2550 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2551 }
2552 }
2553
2554 MFMA = nullptr;
2555 WaitStatesSinceDef =
2556 getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
2557 if (!MFMA)
2558 continue;
2559
2560 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
2561 int NumPasses = HazardDefLatency;
2562 int NeedWaitStates = MaxWaitStates;
2563
2564 if (isDGEMM(Opcode: MFMA->getOpcode())) {
2565 switch (HazardDefLatency) {
2566 case 4:
2567 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2568 : DMFMA4x4WriteVgprVALUReadWaitStates;
2569 break;
2570 case 8:
2571 case 16:
2572 NeedWaitStates = IsMemOrExport
2573 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2574 : DMFMA16x16WriteVgprVALUReadWaitStates;
2575 break;
2576 default:
2577 llvm_unreachable("unexpected dgemm");
2578 }
2579 } else if (ST.hasGFX940Insts()) {
2580 NeedWaitStates =
2581 isXDL(ST, MI: *MFMA)
2582 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2583 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2584 NumPasses);
2585 } else {
2586 switch (HazardDefLatency) {
2587 case 2:
2588 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2589 break;
2590 case 8:
2591 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2592 break;
2593 case 16:
2594 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2595 break;
2596 default:
2597 llvm_unreachable("unexpected number of passes for mfma");
2598 }
2599 }
2600
2601 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2602 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2603
2604 if (WaitStatesNeeded == MaxWaitStates)
2605 break;
2606 }
2607 }
2608
2609 unsigned Opc = MI->getOpcode();
2610 const int DMFMAToFMA64WaitStates = 2;
2611 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2612 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2613 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2614 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2615 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2616 getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates);
2617 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2618 }
2619
2620 if (!IsVALU && !IsMemOrExport)
2621 return WaitStatesNeeded;
2622
2623 for (const MachineOperand &Def : MI->defs()) {
2624 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2625 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2626 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2627 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2628 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2629 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2630 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2631 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2632 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2633 const int DotWriteDifferentVALUWrite = 3;
2634 const int MaxWaitStates = 19;
2635 const int MaxWarWaitStates = 15;
2636
2637 Reg = Def.getReg();
2638
2639 DOT = nullptr;
2640 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
2641 Limit: MaxWaitStates);
2642 if (DOT && DOT->getOpcode() != MI->getOpcode())
2643 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: DotWriteDifferentVALUWrite -
2644 WaitStatesSinceDef);
2645
2646 MFMA = nullptr;
2647 WaitStatesSinceDef =
2648 getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
2649 if (MFMA) {
2650 int NeedWaitStates = MaxWaitStates;
2651 int NumPasses = TSchedModel.computeInstrLatency(MI: MFMA);
2652
2653 if (isDGEMM(Opcode: MFMA->getOpcode())) {
2654 switch (NumPasses) {
2655 case 4:
2656 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2657 break;
2658 case 8:
2659 case 16:
2660 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2661 break;
2662 default:
2663 llvm_unreachable("unexpected number of cycles for dgemm");
2664 }
2665 } else if (ST.hasGFX940Insts()) {
2666 NeedWaitStates =
2667 isXDL(ST, MI: *MFMA)
2668 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2669 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2670 } else {
2671 switch (NumPasses) {
2672 case 2:
2673 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2674 break;
2675 case 8:
2676 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2677 break;
2678 case 16:
2679 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2680 break;
2681 default:
2682 llvm_unreachable("Unexpected number of passes for mfma");
2683 }
2684 }
2685
2686 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2687 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2688
2689 if (WaitStatesNeeded == MaxWaitStates)
2690 break;
2691 }
2692
2693 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2694 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(Opcode: MI.getOpcode()) ||
2695 !MI.readsRegister(Reg, &TRI))
2696 return false;
2697
2698 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2699 return false;
2700
2701 const MachineOperand *SrcC =
2702 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2703 assert(SrcC);
2704 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2705 return false;
2706
2707 MFMA = &MI;
2708 return true;
2709 };
2710
2711 MFMA = nullptr;
2712 int WaitStatesSinceUse = getWaitStatesSince(IsHazard: IsSMFMAReadAsCFn,
2713 Limit: MaxWarWaitStates);
2714 if (!MFMA)
2715 continue;
2716
2717 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
2718 int NeedWaitStates = MaxWaitStates;
2719 switch (HazardDefLatency) {
2720 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2721 break;
2722 case 4: assert(ST.hasGFX940Insts());
2723 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2724 break;
2725 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2726 break;
2727 case 16: [[fallthrough]];
2728 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2729 break;
2730 }
2731
2732 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2733 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2734 }
2735
2736 return WaitStatesNeeded;
2737}
2738
2739bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2740 if (!SU->isInstr())
2741 return false;
2742
2743 const MachineInstr *MAI = nullptr;
2744
2745 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2746 MAI = nullptr;
2747 if (SIInstrInfo::isMFMA(MI))
2748 MAI = &MI;
2749 return MAI != nullptr;
2750 };
2751
2752 MachineInstr *MI = SU->getInstr();
2753 if (IsMFMAFn(*MI)) {
2754 int W = getWaitStatesSince(IsHazard: IsMFMAFn, Limit: 16);
2755 if (MAI)
2756 return W < (int)TSchedModel.computeInstrLatency(MI: MAI);
2757 }
2758
2759 return false;
2760}
2761
2762bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2763 if (!ST.hasVALUMaskWriteHazard())
2764 return false;
2765 assert(!ST.hasExtendedWaitCounts());
2766
2767 if (!ST.isWave64() || !SIInstrInfo::isSALU(MI: *MI))
2768 return false;
2769
2770 // The hazard sequence is three instructions:
2771 // 1. VALU reads SGPR as mask
2772 // 2. SALU writes SGPR
2773 // 3. SALU reads SGPR
2774 // The hazard can expire if the distance between 2 and 3 is sufficient.
2775 // In practice the hazard expires in fewer than 10% of cases, so we always
2776 // assume it exists whenever 1 and 2 are present rather than searching for 3.
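// For example (illustrative):
//   v_cndmask_b32_e64 v0, v1, v2, s[0:1]   ; 1. VALU reads s[0:1] as mask
//   s_mov_b64 s[0:1], exec                 ; 2. SALU writes s[0:1] (this MI)
//   s_and_b64 s[2:3], s[0:1], s[4:5]       ; 3. SALU reads s[0:1]
// The s_waitcnt_depctr sa_sdst(0) inserted after 2 breaks the sequence.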
2777
2778 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2779 if (!SDSTOp || !SDSTOp->isReg())
2780 return false;
2781
2782 const Register HazardReg = SDSTOp->getReg();
2783 if (HazardReg == AMDGPU::EXEC ||
2784 HazardReg == AMDGPU::EXEC_LO ||
2785 HazardReg == AMDGPU::EXEC_HI ||
2786 HazardReg == AMDGPU::M0)
2787 return false;
2788
2789 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2790 switch (I.getOpcode()) {
2791 case AMDGPU::V_ADDC_U32_e32:
2792 case AMDGPU::V_ADDC_U32_dpp:
2793 case AMDGPU::V_CNDMASK_B16_e32:
2794 case AMDGPU::V_CNDMASK_B16_dpp:
2795 case AMDGPU::V_CNDMASK_B32_e32:
2796 case AMDGPU::V_CNDMASK_B32_dpp:
2797 case AMDGPU::V_DIV_FMAS_F32_e64:
2798 case AMDGPU::V_DIV_FMAS_F64_e64:
2799 case AMDGPU::V_SUBB_U32_e32:
2800 case AMDGPU::V_SUBB_U32_dpp:
2801 case AMDGPU::V_SUBBREV_U32_e32:
2802 case AMDGPU::V_SUBBREV_U32_dpp:
2803 // These implicitly read VCC as mask source.
2804 return HazardReg == AMDGPU::VCC ||
2805 HazardReg == AMDGPU::VCC_LO ||
2806 HazardReg == AMDGPU::VCC_HI;
2807 case AMDGPU::V_ADDC_U32_e64:
2808 case AMDGPU::V_ADDC_U32_e64_dpp:
2809 case AMDGPU::V_CNDMASK_B16_e64:
2810 case AMDGPU::V_CNDMASK_B16_e64_dpp:
2811 case AMDGPU::V_CNDMASK_B32_e64:
2812 case AMDGPU::V_CNDMASK_B32_e64_dpp:
2813 case AMDGPU::V_SUBB_U32_e64:
2814 case AMDGPU::V_SUBB_U32_e64_dpp:
2815 case AMDGPU::V_SUBBREV_U32_e64:
2816 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2817 // Only check mask register overlaps.
2818 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2819 assert(SSRCOp);
2820 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2821 }
2822 default:
2823 return false;
2824 }
2825 };
2826
2827 const MachineRegisterInfo &MRI = MF.getRegInfo();
2828 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2829 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2830 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2831 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2832 return true;
2833
2834 // A VALU access to any SGPR or literal constant other than HazardReg
2835 // mitigates the hazard. There is no need to check HazardReg here as this
2836 // is only called when IsHazardFn has not already matched.
2837 if (!SIInstrInfo::isVALU(MI: I))
2838 return false;
2839 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2840 const MachineOperand &Op = I.getOperand(i: OpNo);
2841 if (Op.isReg()) {
2842 Register OpReg = Op.getReg();
2843 // Only consider uses
2844 if (!Op.isUse())
2845 continue;
2846 // Ignore EXEC
2847 if (OpReg == AMDGPU::EXEC ||
2848 OpReg == AMDGPU::EXEC_LO ||
2849 OpReg == AMDGPU::EXEC_HI)
2850 continue;
2851 // Ignore all implicit uses except VCC
2852 if (Op.isImplicit()) {
2853 if (OpReg == AMDGPU::VCC ||
2854 OpReg == AMDGPU::VCC_LO ||
2855 OpReg == AMDGPU::VCC_HI)
2856 return true;
2857 continue;
2858 }
2859 if (TRI.isSGPRReg(MRI, Reg: OpReg))
2860 return true;
2861 } else {
2862 const MCInstrDesc &InstDesc = I.getDesc();
2863 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2864 if (!TII.isInlineConstant(MO: Op, OpInfo))
2865 return true;
2866 }
2867 }
2868 return false;
2869 };
2870
2871 // Check for hazard
2872 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2873 std::numeric_limits<int>::max())
2874 return false;
2875
2876 auto NextMI = std::next(x: MI->getIterator());
2877
2878 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2879 BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2880 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2881 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2882
2883 // SALU write may be s_getpc in a bundle.
2884 if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2885 // Update offsets of any references in the bundle.
2886 while (NextMI != MI->getParent()->end() &&
2887 NextMI->isBundledWithPred()) {
2888 for (auto &Operand : NextMI->operands()) {
2889 if (Operand.isGlobal())
2890 Operand.setOffset(Operand.getOffset() + 4);
2891 }
2892 NextMI++;
2893 }
2894 }
2895
2896 return true;
2897}
2898
