1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
15#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16#include "SIMachineFunctionInfo.h"
17#include "llvm/CodeGen/MachineFunction.h"
18#include "llvm/CodeGen/ScheduleDAG.h"
19#include "llvm/TargetParser/TargetParser.h"
20
21using namespace llvm;
22
23namespace {
24
25struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
26 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
27
28 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
29 if (Arg.getAsInteger(Radix: 0, Result&: Value))
30 return O.error(Message: "'" + Arg + "' value invalid for uint argument!");
31
32 if (Value > 100)
33 return O.error(Message: "'" + Arg + "' value must be in the range [0, 100]!");
34
35 return false;
36 }
37};
38
39} // end anonymous namespace
40
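// The value is a percentage in [0, 100]; e.g. passing
// -amdgpu-mfma-padding-ratio=100 requests that the full latency between
// neighboring MFMAs be filled with s_nops.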
41static cl::opt<unsigned, false, MFMAPaddingRatioParser>
42 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(Val: 0), cl::Hidden,
43 cl::desc("Fill a percentage of the latency between "
44 "neighboring MFMA with s_nops."));
45
46//===----------------------------------------------------------------------===//
47// Hazard Recognizer Implementation
48//===----------------------------------------------------------------------===//
49
50static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
51 const GCNSubtarget &ST);
52
53GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
54 IsHazardRecognizerMode(false),
55 CurrCycleInstr(nullptr),
56 MF(MF),
57 ST(MF.getSubtarget<GCNSubtarget>()),
58 TII(*ST.getInstrInfo()),
59 TRI(TII.getRegisterInfo()),
60 ClauseUses(TRI.getNumRegUnits()),
61 ClauseDefs(TRI.getNumRegUnits()) {
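  // Functions that use AGPRs may contain MFMA-related hazards, which can
  // require far more wait states than other hazards, so give them a deeper
  // lookahead window.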
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
63 TSchedModel.init(&ST);
64 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
65}
66
67void GCNHazardRecognizer::Reset() {
68 EmittedInstrs.clear();
69}
70
71void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
72 EmitInstruction(MI: SU->getInstr());
73}
74
75void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
76 CurrCycleInstr = MI;
77}
78
79static bool isDivFMas(unsigned Opcode) {
80 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
81}
82
83static bool isSGetReg(unsigned Opcode) {
84 return Opcode == AMDGPU::S_GETREG_B32;
85}
86
87static bool isSSetReg(unsigned Opcode) {
88 switch (Opcode) {
89 case AMDGPU::S_SETREG_B32:
90 case AMDGPU::S_SETREG_B32_mode:
91 case AMDGPU::S_SETREG_IMM32_B32:
92 case AMDGPU::S_SETREG_IMM32_B32_mode:
93 return true;
94 }
95 return false;
96}
97
98static bool isRWLane(unsigned Opcode) {
99 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
100}
101
102static bool isRFE(unsigned Opcode) {
103 return Opcode == AMDGPU::S_RFE_B64;
104}
105
106static bool isSMovRel(unsigned Opcode) {
107 switch (Opcode) {
108 case AMDGPU::S_MOVRELS_B32:
109 case AMDGPU::S_MOVRELS_B64:
110 case AMDGPU::S_MOVRELD_B32:
111 case AMDGPU::S_MOVRELD_B64:
112 return true;
113 default:
114 return false;
115 }
116}
117
118static bool isDGEMM(unsigned Opcode) {
119 return AMDGPU::getMAIIsDGEMM(Opc: Opcode);
120}
121
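// Returns true if \p MI is an MAI instruction classified as XDL, i.e. one
// that is neither a DGEMM nor a plain accvgpr read/write.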
122static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
123 unsigned Opcode = MI.getOpcode();
124
125 if (!SIInstrInfo::isMAI(MI) ||
126 isDGEMM(Opcode) ||
127 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
128 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
129 return false;
130
131 if (!ST.hasGFX940Insts())
132 return true;
133
134 return AMDGPU::getMAIIsGFX940XDL(Opc: Opcode);
135}
136
137static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
138 const MachineInstr &MI) {
139 if (TII.isAlwaysGDS(Opcode: MI.getOpcode()))
140 return true;
141
142 switch (MI.getOpcode()) {
143 case AMDGPU::S_SENDMSG:
144 case AMDGPU::S_SENDMSGHALT:
145 case AMDGPU::S_TTRACEDATA:
146 return true;
147 // These DS opcodes don't support GDS.
148 case AMDGPU::DS_NOP:
149 case AMDGPU::DS_PERMUTE_B32:
150 case AMDGPU::DS_BPERMUTE_B32:
151 return false;
152 default:
153 if (TII.isDS(Opcode: MI.getOpcode())) {
154 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
155 AMDGPU::OpName::gds);
156 if (MI.getOperand(i: GDS).getImm())
157 return true;
158 }
159 return false;
160 }
161}
162
163static bool isPermlane(const MachineInstr &MI) {
164 unsigned Opcode = MI.getOpcode();
165 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE64_B32 ||
167 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
170}
171
172static bool isLdsDma(const MachineInstr &MI) {
173 return SIInstrInfo::isVALU(MI) &&
174 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
175}
176
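// Decode the hardware register id from the simm16 operand of an
// s_getreg/s_setreg style instruction.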
177static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
178 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
179 AMDGPU::OpName::simm16);
180 return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
181}
182
183ScheduleHazardRecognizer::HazardType
184GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
185 MachineInstr *MI = SU->getInstr();
  // When we are not in "HazardRecognizerMode" we are being run from the
  // scheduler: report stalls for hazards but do not request noop insertion.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
189
190 if (MI->isBundle())
191 return NoHazard;
192
193 if (SIInstrInfo::isSMRD(MI: *MI) && checkSMRDHazards(SMRD: MI) > 0)
194 return HazardType;
195
196 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
197 return HazardType;
198
199 if (checkFPAtomicToDenormModeHazard(MI) > 0)
200 return HazardType;
201
202 if (ST.hasNoDataDepHazard())
203 return NoHazard;
204
205 // FIXME: Should flat be considered vmem?
206 if ((SIInstrInfo::isVMEM(MI: *MI) ||
207 SIInstrInfo::isFLAT(MI: *MI))
208 && checkVMEMHazards(VMEM: MI) > 0)
209 return HazardType;
210
211 if (SIInstrInfo::isVALU(MI: *MI) && checkVALUHazards(VALU: MI) > 0)
212 return HazardType;
213
214 if (SIInstrInfo::isDPP(MI: *MI) && checkDPPHazards(DPP: MI) > 0)
215 return HazardType;
216
217 if (isDivFMas(Opcode: MI->getOpcode()) && checkDivFMasHazards(DivFMas: MI) > 0)
218 return HazardType;
219
220 if (isRWLane(Opcode: MI->getOpcode()) && checkRWLaneHazards(RWLane: MI) > 0)
221 return HazardType;
222
223 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
224 SIInstrInfo::isFLAT(MI: *MI) || SIInstrInfo::isDS(MI: *MI) ||
225 SIInstrInfo::isEXP(MI: *MI)) && checkMAIVALUHazards(MI) > 0)
226 return HazardType;
227
228 if (isSGetReg(Opcode: MI->getOpcode()) && checkGetRegHazards(GetRegInstr: MI) > 0)
229 return HazardType;
230
231 if (isSSetReg(Opcode: MI->getOpcode()) && checkSetRegHazards(SetRegInstr: MI) > 0)
232 return HazardType;
233
234 if (isRFE(Opcode: MI->getOpcode()) && checkRFEHazards(RFE: MI) > 0)
235 return HazardType;
236
237 if (((ST.hasReadM0MovRelInterpHazard() &&
238 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
239 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
240 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
241 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
242 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
243 (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
245 checkReadM0Hazards(SMovRel: MI) > 0)
246 return HazardType;
247
248 if (SIInstrInfo::isMAI(MI: *MI) && checkMAIHazards(MI) > 0)
249 return HazardType;
250
251 if ((SIInstrInfo::isVMEM(MI: *MI) ||
252 SIInstrInfo::isFLAT(MI: *MI) ||
253 SIInstrInfo::isDS(MI: *MI)) && checkMAILdStHazards(MI) > 0)
254 return HazardType;
255
256 if (MI->isInlineAsm() && checkInlineAsmHazards(IA: MI) > 0)
257 return HazardType;
258
259 return NoHazard;
260}
261
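// Emit s_nops covering \p Quantity wait states before \p MI. Each s_nop N
// covers N + 1 wait states, so e.g. Quantity == 10 becomes "s_nop 7"
// followed by "s_nop 1".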
262static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
263 unsigned Quantity) {
264 while (Quantity > 0) {
265 unsigned Arg = std::min(a: Quantity, b: 8u);
266 Quantity -= Arg;
267 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
268 .addImm(Arg - 1);
269 }
270}
271
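// Returns the MFMA pipeline occupancy of \p MI as given by the scheduling
// model (the ReleaseAtCycle of its first write resource).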
272unsigned
273GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
274 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(MI: &MI);
275 assert(TSchedModel.getWriteProcResBegin(SC) !=
276 TSchedModel.getWriteProcResEnd(SC));
277 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
278}
279
280void GCNHazardRecognizer::processBundle() {
281 MachineBasicBlock::instr_iterator MI = std::next(x: CurrCycleInstr->getIterator());
282 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
283 // Check bundled MachineInstr's for hazards.
284 for (; MI != E && MI->isInsideBundle(); ++MI) {
285 CurrCycleInstr = &*MI;
286 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
287
288 if (IsHazardRecognizerMode) {
289 fixHazards(MI: CurrCycleInstr);
290
291 insertNoopsInBundle(MI: CurrCycleInstr, TII, Quantity: WaitStates);
292 }
293
294 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
295 // include the bundled MI directly after, only add a maximum of
296 // (MaxLookAhead - 1) noops to EmittedInstrs.
297 for (unsigned i = 0, e = std::min(a: WaitStates, b: MaxLookAhead - 1); i < e; ++i)
298 EmittedInstrs.push_front(x: nullptr);
299
300 EmittedInstrs.push_front(x: CurrCycleInstr);
301 EmittedInstrs.resize(new_size: MaxLookAhead);
302 }
303 CurrCycleInstr = nullptr;
304}
305
306void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
307 assert(IsHazardRecognizerMode);
308
309 unsigned NumPreNoops = PreEmitNoops(MI);
310 EmitNoops(Quantity: NumPreNoops);
311 if (MI->isInsideBundle())
312 insertNoopsInBundle(MI, TII, Quantity: NumPreNoops);
313 else
314 TII.insertNoops(MBB&: *MI->getParent(), MI: MachineBasicBlock::iterator(MI),
315 Quantity: NumPreNoops);
316 EmitInstruction(MI);
317 AdvanceCycle();
318}
319
320unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
321 IsHazardRecognizerMode = true;
322 CurrCycleInstr = MI;
323 unsigned W = PreEmitNoopsCommon(MI);
324 fixHazards(MI);
325 CurrCycleInstr = nullptr;
326 return W;
327}
328
329unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
330 if (MI->isBundle())
331 return 0;
332
333 int WaitStates = 0;
334
335 if (SIInstrInfo::isSMRD(MI: *MI))
336 return std::max(a: WaitStates, b: checkSMRDHazards(SMRD: MI));
337
338 if (ST.hasNSAtoVMEMBug())
339 WaitStates = std::max(a: WaitStates, b: checkNSAtoVMEMHazard(MI));
340
341 WaitStates = std::max(a: WaitStates, b: checkFPAtomicToDenormModeHazard(MI));
342
343 if (ST.hasNoDataDepHazard())
344 return WaitStates;
345
346 if (SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isFLAT(MI: *MI))
347 WaitStates = std::max(a: WaitStates, b: checkVMEMHazards(VMEM: MI));
348
349 if (SIInstrInfo::isVALU(MI: *MI))
350 WaitStates = std::max(a: WaitStates, b: checkVALUHazards(VALU: MI));
351
352 if (SIInstrInfo::isDPP(MI: *MI))
353 WaitStates = std::max(a: WaitStates, b: checkDPPHazards(DPP: MI));
354
355 if (isDivFMas(Opcode: MI->getOpcode()))
356 WaitStates = std::max(a: WaitStates, b: checkDivFMasHazards(DivFMas: MI));
357
358 if (isRWLane(Opcode: MI->getOpcode()))
359 WaitStates = std::max(a: WaitStates, b: checkRWLaneHazards(RWLane: MI));
360
361 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
362 SIInstrInfo::isFLAT(MI: *MI) || SIInstrInfo::isDS(MI: *MI) ||
363 SIInstrInfo::isEXP(MI: *MI)) && checkMAIVALUHazards(MI) > 0)
364 WaitStates = std::max(a: WaitStates, b: checkMAIVALUHazards(MI));
365
366 if (MI->isInlineAsm())
367 return std::max(a: WaitStates, b: checkInlineAsmHazards(IA: MI));
368
369 if (isSGetReg(Opcode: MI->getOpcode()))
370 return std::max(a: WaitStates, b: checkGetRegHazards(GetRegInstr: MI));
371
372 if (isSSetReg(Opcode: MI->getOpcode()))
373 return std::max(a: WaitStates, b: checkSetRegHazards(SetRegInstr: MI));
374
375 if (isRFE(Opcode: MI->getOpcode()))
376 return std::max(a: WaitStates, b: checkRFEHazards(RFE: MI));
377
378 if ((ST.hasReadM0MovRelInterpHazard() &&
379 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
380 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
381 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
382 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
383 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
384 (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
386 return std::max(a: WaitStates, b: checkReadM0Hazards(SMovRel: MI));
387
388 if (SIInstrInfo::isMAI(MI: *MI))
389 return std::max(a: WaitStates, b: checkMAIHazards(MI));
390
391 if (SIInstrInfo::isVMEM(MI: *MI) ||
392 SIInstrInfo::isFLAT(MI: *MI) ||
393 SIInstrInfo::isDS(MI: *MI))
394 return std::max(a: WaitStates, b: checkMAILdStHazards(MI));
395
396 return WaitStates;
397}
398
399void GCNHazardRecognizer::EmitNoop() {
400 EmittedInstrs.push_front(x: nullptr);
401}
402
403void GCNHazardRecognizer::AdvanceCycle() {
404 // When the scheduler detects a stall, it will call AdvanceCycle() without
405 // emitting any instructions.
406 if (!CurrCycleInstr) {
407 EmittedInstrs.push_front(x: nullptr);
408 return;
409 }
410
411 if (CurrCycleInstr->isBundle()) {
412 processBundle();
413 return;
414 }
415
416 unsigned NumWaitStates = TII.getNumWaitStates(MI: *CurrCycleInstr);
417 if (!NumWaitStates) {
418 CurrCycleInstr = nullptr;
419 return;
420 }
421
422 // Keep track of emitted instructions
423 EmittedInstrs.push_front(x: CurrCycleInstr);
424
425 // Add a nullptr for each additional wait state after the first. Make sure
426 // not to add more than getMaxLookAhead() items to the list, since we
427 // truncate the list to that size right after this loop.
428 for (unsigned i = 1, e = std::min(a: NumWaitStates, b: getMaxLookAhead());
429 i < e; ++i) {
430 EmittedInstrs.push_front(x: nullptr);
431 }
432
  // getMaxLookAhead() is the largest number of wait states we will ever need
434 // to insert, so there is no point in keeping track of more than that many
435 // wait states.
436 EmittedInstrs.resize(new_size: getMaxLookAhead());
437
438 CurrCycleInstr = nullptr;
439}
440
441void GCNHazardRecognizer::RecedeCycle() {
442 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
443}
444
445//===----------------------------------------------------------------------===//
446// Helper Functions
447//===----------------------------------------------------------------------===//
448
449typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
450
451typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
452typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
453
454// Search for a hazard in a block and its predecessors.
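// IsHazard classifies each visited instruction as a hazard, as expiring the
// search, or as uninteresting; UpdateState is applied to every non-bundle,
// non-meta, non-inline-asm instruction that is walked over.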
455template <typename StateT>
456static bool
457hasHazard(StateT State,
458 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
459 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
460 const MachineBasicBlock *MBB,
461 MachineBasicBlock::const_reverse_instr_iterator I,
462 DenseSet<const MachineBasicBlock *> &Visited) {
463 for (auto E = MBB->instr_rend(); I != E; ++I) {
464 // No need to look at parent BUNDLE instructions.
465 if (I->isBundle())
466 continue;
467
468 switch (IsHazard(State, *I)) {
469 case HazardFound:
470 return true;
471 case HazardExpired:
472 return false;
473 default:
474 // Continue search
475 break;
476 }
477
478 if (I->isInlineAsm() || I->isMetaInstruction())
479 continue;
480
481 UpdateState(State, *I);
482 }
483
484 for (MachineBasicBlock *Pred : MBB->predecessors()) {
485 if (!Visited.insert(V: Pred).second)
486 continue;
487
488 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
489 Visited))
490 return true;
491 }
492
493 return false;
494}
495
// Returns the minimum number of wait states since \p I, walking all
// predecessors. Scans only until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
499static int getWaitStatesSince(
500 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
501 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
502 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
503 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
504 for (auto E = MBB->instr_rend(); I != E; ++I) {
505 // Don't add WaitStates for parent BUNDLE instructions.
506 if (I->isBundle())
507 continue;
508
509 if (IsHazard(*I))
510 return WaitStates;
511
512 if (I->isInlineAsm())
513 continue;
514
515 WaitStates += GetNumWaitStates(*I);
516
517 if (IsExpired(*I, WaitStates))
518 return std::numeric_limits<int>::max();
519 }
520
521 int MinWaitStates = std::numeric_limits<int>::max();
522 for (MachineBasicBlock *Pred : MBB->predecessors()) {
523 if (!Visited.insert(V: Pred).second)
524 continue;
525
526 int W = getWaitStatesSince(IsHazard, MBB: Pred, I: Pred->instr_rbegin(), WaitStates,
527 IsExpired, Visited, GetNumWaitStates);
528
529 MinWaitStates = std::min(a: MinWaitStates, b: W);
530 }
531
532 return MinWaitStates;
533}
534
535static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
536 const MachineInstr *MI, IsExpiredFn IsExpired) {
537 DenseSet<const MachineBasicBlock *> Visited;
538 return getWaitStatesSince(IsHazard, MBB: MI->getParent(),
539 I: std::next(x: MI->getReverseIterator()),
540 WaitStates: 0, IsExpired, Visited);
541}
542
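// In hazard recognizer mode walk the MIR backwards from CurrCycleInstr;
// otherwise consult the EmittedInstrs history maintained by AdvanceCycle().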
543int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
544 if (IsHazardRecognizerMode) {
545 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
546 return WaitStates >= Limit;
547 };
548 return ::getWaitStatesSince(IsHazard, MI: CurrCycleInstr, IsExpired: IsExpiredFn);
549 }
550
551 int WaitStates = 0;
552 for (MachineInstr *MI : EmittedInstrs) {
553 if (MI) {
554 if (IsHazard(*MI))
555 return WaitStates;
556
557 if (MI->isInlineAsm())
558 continue;
559 }
560 ++WaitStates;
561
562 if (WaitStates >= Limit)
563 break;
564 }
565 return std::numeric_limits<int>::max();
566}
567
568int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
569 IsHazardFn IsHazardDef,
570 int Limit) {
571 const SIRegisterInfo *TRI = ST.getRegisterInfo();
572
573 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
574 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
575 };
576
577 return getWaitStatesSince(IsHazardFn, Limit);
578}
579
580int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
581 int Limit) {
582 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
583 return isSSetReg(Opcode: MI.getOpcode()) && IsHazard(MI);
584 };
585
586 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
587}
588
589//===----------------------------------------------------------------------===//
590// No-op Hazard Detection
591//===----------------------------------------------------------------------===//
592
593static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
594 MCRegister Reg) {
595 for (MCRegUnit Unit : TRI.regunits(Reg))
596 BV.set(Unit);
597}
598
599static void addRegsToSet(const SIRegisterInfo &TRI,
600 iterator_range<MachineInstr::const_mop_iterator> Ops,
601 BitVector &DefSet, BitVector &UseSet) {
602 for (const MachineOperand &Op : Ops) {
603 if (Op.isReg())
604 addRegUnits(TRI, BV&: Op.isDef() ? DefSet : UseSet, Reg: Op.getReg().asMCReg());
605 }
606}
607
608void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
609 addRegsToSet(TRI, Ops: MI.operands(), DefSet&: ClauseDefs, UseSet&: ClauseUses);
610}
611
612static bool breaksSMEMSoftClause(MachineInstr *MI) {
613 return !SIInstrInfo::isSMRD(MI: *MI);
614}
615
616static bool breaksVMEMSoftClause(MachineInstr *MI) {
617 return !SIInstrInfo::isVMEM(MI: *MI) && !SIInstrInfo::isFLAT(MI: *MI);
618}
619
620int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if XNACK is
  // enabled.
623 if (!ST.isXNACKEnabled())
624 return 0;
625
626 bool IsSMRD = TII.isSMRD(MI: *MEM);
627
628 resetClause();
629
630 // A soft-clause is any group of consecutive SMEM instructions. The
631 // instructions in this group may return out of order and/or may be
632 // replayed (i.e. the same instruction issued more than once).
633 //
634 // In order to handle these situations correctly we need to make sure that
635 // when a clause has more than one instruction, no instruction in the clause
636 // writes to a register that is read by another instruction in the clause
637 // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.
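  //
  // For example, if one load in a clause writes s[0:1] and a later load in
  // the same clause uses s[0:1] as part of its address, a replay of the first
  // load could clobber the address of the second, so the clause must be
  // broken (illustrative example).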
639
640 for (MachineInstr *MI : EmittedInstrs) {
641 // When we hit a non-SMEM instruction then we have passed the start of the
642 // clause and we can stop.
643 if (!MI)
644 break;
645
646 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
647 break;
648
649 addClauseInst(MI: *MI);
650 }
651
652 if (ClauseDefs.none())
653 return 0;
654
655 // We need to make sure not to put loads and stores in the same clause if they
656 // use the same address. For now, just start a new clause whenever we see a
657 // store.
658 if (MEM->mayStore())
659 return 1;
660
661 addClauseInst(MI: *MEM);
662
663 // If the set of defs and uses intersect then we cannot add this instruction
664 // to the clause, so we have a hazard.
665 return ClauseDefs.anyCommon(RHS: ClauseUses) ? 1 : 0;
666}
667
668int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
669 int WaitStatesNeeded = 0;
670
671 WaitStatesNeeded = checkSoftClauseHazards(MEM: SMRD);
672
673 // This SMRD hazard only affects SI.
674 if (!ST.hasSMRDReadVALUDefHazard())
675 return WaitStatesNeeded;
676
677 // A read of an SGPR by SMRD instruction requires 4 wait states when the
678 // SGPR was written by a VALU instruction.
679 int SmrdSgprWaitStates = 4;
680 auto IsHazardDefFn = [this](const MachineInstr &MI) {
681 return TII.isVALU(MI);
682 };
683 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
684 return TII.isSALU(MI);
685 };
686
687 bool IsBufferSMRD = TII.isBufferSMRD(MI: *SMRD);
688
689 for (const MachineOperand &Use : SMRD->uses()) {
690 if (!Use.isReg())
691 continue;
692 int WaitStatesNeededForUse =
693 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
694 Limit: SmrdSgprWaitStates);
695 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
696
    // This fixes what appears to be undocumented hardware behavior in SI where
    // an s_mov writing a descriptor and an s_buffer_load_dword reading that
    // descriptor need some number of nops in between. We don't know how many
    // are required, so use 4. This probably went unnoticed because it only
    // happens when a 64-bit pointer is expanded into a full descriptor and
    // s_buffer_load_dword is used instead of s_load_dword, which was likely
    // never encountered in closed-source code.
704 if (IsBufferSMRD) {
705 int WaitStatesNeededForUse =
706 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(),
707 IsHazardDef: IsBufferHazardDefFn,
708 Limit: SmrdSgprWaitStates);
709 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
710 }
711 }
712
713 return WaitStatesNeeded;
714}
715
716int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
717 if (!ST.hasVMEMReadSGPRVALUDefHazard())
718 return 0;
719
720 int WaitStatesNeeded = checkSoftClauseHazards(MEM: VMEM);
721
722 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
723 // SGPR was written by a VALU Instruction.
724 const int VmemSgprWaitStates = 5;
725 auto IsHazardDefFn = [this](const MachineInstr &MI) {
726 return TII.isVALU(MI);
727 };
728 for (const MachineOperand &Use : VMEM->uses()) {
729 if (!Use.isReg() || TRI.isVectorRegister(MRI: MF.getRegInfo(), Reg: Use.getReg()))
730 continue;
731
732 int WaitStatesNeededForUse =
733 VmemSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
734 Limit: VmemSgprWaitStates);
735 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
736 }
737 return WaitStatesNeeded;
738}
739
740int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
741 const SIRegisterInfo *TRI = ST.getRegisterInfo();
742 const SIInstrInfo *TII = ST.getInstrInfo();
743
744 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
745 int DppVgprWaitStates = 2;
746 int DppExecWaitStates = 5;
747 int WaitStatesNeeded = 0;
748 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
749 return TII->isVALU(MI);
750 };
751
752 for (const MachineOperand &Use : DPP->uses()) {
753 if (!Use.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
754 continue;
755 int WaitStatesNeededForUse =
756 DppVgprWaitStates - getWaitStatesSinceDef(
757 Reg: Use.getReg(),
758 IsHazardDef: [](const MachineInstr &) { return true; },
759 Limit: DppVgprWaitStates);
760 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
761 }
762
763 WaitStatesNeeded = std::max(
764 WaitStatesNeeded,
765 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
766 DppExecWaitStates));
767
768 return WaitStatesNeeded;
769}
770
771int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
772 const SIInstrInfo *TII = ST.getInstrInfo();
773
774 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
775 // instruction.
776 const int DivFMasWaitStates = 4;
777 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
778 return TII->isVALU(MI);
779 };
780 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
781 DivFMasWaitStates);
782
783 return DivFMasWaitStates - WaitStatesNeeded;
784}
785
786int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
787 const SIInstrInfo *TII = ST.getInstrInfo();
788 unsigned GetRegHWReg = getHWReg(TII, RegInstr: *GetRegInstr);
789
790 const int GetRegWaitStates = 2;
791 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
792 return GetRegHWReg == getHWReg(TII, RegInstr: MI);
793 };
794 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: GetRegWaitStates);
795
796 return GetRegWaitStates - WaitStatesNeeded;
797}
798
799int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
800 const SIInstrInfo *TII = ST.getInstrInfo();
801 unsigned HWReg = getHWReg(TII, RegInstr: *SetRegInstr);
802
803 const int SetRegWaitStates = ST.getSetRegWaitStates();
804 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
805 return HWReg == getHWReg(TII, RegInstr: MI);
806 };
807 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: SetRegWaitStates);
808 return SetRegWaitStates - WaitStatesNeeded;
809}
810
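// If \p MI is a store whose store-data operand is subject to the VALU write
// hazard, returns the index of that operand; otherwise returns -1.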
811int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
812 if (!MI.mayStore())
813 return -1;
814
815 const SIInstrInfo *TII = ST.getInstrInfo();
816 unsigned Opcode = MI.getOpcode();
817 const MCInstrDesc &Desc = MI.getDesc();
818
819 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
820 int VDataRCID = -1;
821 if (VDataIdx != -1)
822 VDataRCID = Desc.operands()[VDataIdx].RegClass;
823
824 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
825 // There is no hazard if the instruction does not use vector regs
826 // (like wbinvl1)
827 if (VDataIdx == -1)
828 return -1;
829 // For MUBUF/MTBUF instructions this hazard only exists if the
830 // instruction is not using a register in the soffset field.
831 const MachineOperand *SOffset =
832 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
833 // If we have no soffset operand, then assume this field has been
834 // hardcoded to zero.
835 if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64 &&
836 (!SOffset || !SOffset->isReg()))
837 return VDataIdx;
838 }
839
840 // MIMG instructions create a hazard if they don't use a 256-bit T# and
841 // the store size is greater than 8 bytes and they have more than two bits
842 // of their dmask set.
843 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
844 if (TII->isMIMG(MI)) {
845 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
846 assert(SRsrcIdx != -1 &&
847 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
848 (void)SRsrcIdx;
849 }
850
851 if (TII->isFLAT(MI)) {
852 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
853 if (AMDGPU::getRegBitWidth(RCID: Desc.operands()[DataIdx].RegClass) > 64)
854 return DataIdx;
855 }
856
857 return -1;
858}
859
860int
861GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
862 const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
865 const SIRegisterInfo *TRI = ST.getRegisterInfo();
866
867 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
868 int WaitStatesNeeded = 0;
869
870 if (!TRI->isVectorRegister(MRI, Reg: Def.getReg()))
871 return WaitStatesNeeded;
872 Register Reg = Def.getReg();
873 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
874 int DataIdx = createsVALUHazard(MI);
875 return DataIdx >= 0 &&
876 TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
877 };
878 int WaitStatesNeededForDef =
879 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
880 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
881
882 return WaitStatesNeeded;
883}
884
885int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
886 int WaitStatesNeeded = 0;
887
888 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(MI: *VALU)) {
889 const int TransDefWaitstates = 1;
890
891 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
892 if (!SIInstrInfo::isTRANS(MI))
893 return false;
894 const SIRegisterInfo *TRI = ST.getRegisterInfo();
895 const SIInstrInfo *TII = ST.getInstrInfo();
896 Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
897
898 for (const MachineOperand &Use : VALU->explicit_uses()) {
899 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
900 return true;
901 }
902
903 return false;
904 };
905
906 int WaitStatesNeededForDef =
907 TransDefWaitstates -
908 getWaitStatesSince(IsHazard: IsTransDefFn, Limit: TransDefWaitstates);
909 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
910 }
911
912 if (ST.hasDstSelForwardingHazard()) {
913 const int Shift16DefWaitstates = 1;
914
915 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
916 if (!SIInstrInfo::isVALU(MI))
917 return false;
918 const SIInstrInfo *TII = ST.getInstrInfo();
919 if (SIInstrInfo::isSDWA(MI)) {
920 if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
921 if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
922 return false;
923 } else {
924 if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
925 !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
926 ->getImm() &
927 SISrcMods::DST_OP_SEL))
928 return false;
929 }
930 const SIRegisterInfo *TRI = ST.getRegisterInfo();
931 if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
932 Register Def = Dst->getReg();
933
934 for (const MachineOperand &Use : VALU->explicit_uses()) {
935 if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
936 return true;
937 }
938 }
939
940 return false;
941 };
942
943 int WaitStatesNeededForDef =
944 Shift16DefWaitstates -
945 getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
946 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
947 }
948
949 if (ST.hasVDecCoExecHazard()) {
950 const int VALUWriteSGPRVALUReadWaitstates = 2;
951 const int VALUWriteEXECRWLane = 4;
952 const int VALUWriteVGPRReadlaneRead = 1;
953
954 const SIRegisterInfo *TRI = ST.getRegisterInfo();
955 const MachineRegisterInfo &MRI = MF.getRegInfo();
956 Register UseReg;
957 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
958 if (!SIInstrInfo::isVALU(MI))
959 return false;
960 return MI.modifiesRegister(UseReg, TRI);
961 };
962
963 for (const MachineOperand &Use : VALU->explicit_uses()) {
964 if (!Use.isReg())
965 continue;
966
967 UseReg = Use.getReg();
968 if (TRI->isSGPRReg(MRI, Reg: UseReg)) {
969 int WaitStatesNeededForDef =
970 VALUWriteSGPRVALUReadWaitstates -
971 getWaitStatesSince(IsVALUDefSGPRFn,
972 VALUWriteSGPRVALUReadWaitstates);
973 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
974 }
975 }
976
977 if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
978 UseReg = AMDGPU::VCC;
979 int WaitStatesNeededForDef =
980 VALUWriteSGPRVALUReadWaitstates -
981 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
982 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
983 }
984
985 switch (VALU->getOpcode()) {
986 case AMDGPU::V_READLANE_B32:
987 case AMDGPU::V_READFIRSTLANE_B32: {
988 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
989 UseReg = Src->getReg();
990 int WaitStatesNeededForDef =
991 VALUWriteVGPRReadlaneRead -
992 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
993 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
994 }
995 [[fallthrough]];
996 case AMDGPU::V_WRITELANE_B32: {
997 UseReg = AMDGPU::EXEC;
998 int WaitStatesNeededForDef =
999 VALUWriteEXECRWLane -
1000 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1001 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1002 break;
1003 }
1004 default:
1005 break;
1006 }
1007 }
1008
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
1011 if (!ST.has12DWordStoreHazard())
1012 return WaitStatesNeeded;
1013
1014 const MachineRegisterInfo &MRI = MF.getRegInfo();
1015
1016 for (const MachineOperand &Def : VALU->defs()) {
1017 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def, MRI));
1018 }
1019
1020 return WaitStatesNeeded;
1021}
1022
1023int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1024 // This checks for hazards associated with inline asm statements.
1025 // Since inline asms can contain just about anything, we use this
1026 // to call/leverage other check*Hazard routines. Note that
1027 // this function doesn't attempt to address all possible inline asm
1028 // hazards (good luck), but is a collection of what has been
1029 // problematic thus far.
1030
1031 // see checkVALUHazards()
1032 if (!ST.has12DWordStoreHazard())
1033 return 0;
1034
1035 const MachineRegisterInfo &MRI = MF.getRegInfo();
1036 int WaitStatesNeeded = 0;
1037
1038 for (const MachineOperand &Op :
1039 llvm::drop_begin(RangeOrContainer: IA->operands(), N: InlineAsm::MIOp_FirstOperand)) {
1040 if (Op.isReg() && Op.isDef()) {
1041 WaitStatesNeeded =
1042 std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def: Op, MRI));
1043 }
1044 }
1045
1046 return WaitStatesNeeded;
1047}
1048
1049int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1050 const SIInstrInfo *TII = ST.getInstrInfo();
1051 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1052 const MachineRegisterInfo &MRI = MF.getRegInfo();
1053
1054 const MachineOperand *LaneSelectOp =
1055 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1056
1057 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, Reg: LaneSelectOp->getReg()))
1058 return 0;
1059
1060 Register LaneSelectReg = LaneSelectOp->getReg();
1061 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1062
1063 const int RWLaneWaitStates = 4;
1064 int WaitStatesSince = getWaitStatesSinceDef(Reg: LaneSelectReg, IsHazardDef: IsHazardFn,
1065 Limit: RWLaneWaitStates);
1066 return RWLaneWaitStates - WaitStatesSince;
1067}
1068
1069int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1070 if (!ST.hasRFEHazards())
1071 return 0;
1072
1073 const SIInstrInfo *TII = ST.getInstrInfo();
1074
1075 const int RFEWaitStates = 1;
1076
1077 auto IsHazardFn = [TII](const MachineInstr &MI) {
1078 return getHWReg(TII, RegInstr: MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1079 };
1080 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: RFEWaitStates);
1081 return RFEWaitStates - WaitStatesNeeded;
1082}
1083
1084int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1085 const SIInstrInfo *TII = ST.getInstrInfo();
1086 const int ReadM0WaitStates = 1;
1087 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1088 return ReadM0WaitStates -
1089 getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1090}
1091
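// Run the fixups that repair hazards by inserting or rewriting instructions
// (s_waitcnt_depctr, v_nop, wait fields, ...) rather than by plain s_nop
// padding. Only used in hazard recognizer mode.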
1092void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1093 fixVMEMtoScalarWriteHazards(MI);
1094 fixVcmpxPermlaneHazards(MI);
1095 fixSMEMtoVectorWriteHazards(MI);
1096 fixVcmpxExecWARHazard(MI);
1097 fixLdsBranchVmemWARHazard(MI);
1098 if (ST.hasLdsDirect()) {
1099 fixLdsDirectVALUHazard(MI);
1100 fixLdsDirectVMEMHazard(MI);
1101 }
1102 fixVALUPartialForwardingHazard(MI);
1103 fixVALUTransUseHazard(MI);
1104 fixWMMAHazards(MI);
1105 fixShift64HighRegBug(MI);
1106 fixVALUMaskWriteHazard(MI);
1107}
1108
1109bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1110 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(MI: *MI))
1111 return false;
1112
1113 const SIInstrInfo *TII = ST.getInstrInfo();
1114 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1115 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1116 return (TII->isVOPC(MI) ||
1117 ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1118 MI.modifiesRegister(AMDGPU::EXEC, TRI);
1119 };
1120
1121 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1122 unsigned Opc = MI.getOpcode();
1123 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1124 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1125 };
1126
1127 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1128 std::numeric_limits<int>::max())
1129 return false;
1130
1131 // V_NOP will be discarded by SQ.
1132 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1133 // which is always a VGPR and available.
1134 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1135 Register Reg = Src0->getReg();
1136 bool IsUndef = Src0->isUndef();
1137 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1138 TII->get(AMDGPU::V_MOV_B32_e32))
1139 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1140 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1141
1142 return true;
1143}
1144
1145bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1146 if (!ST.hasVMEMtoScalarWriteHazard())
1147 return false;
1148 assert(!ST.hasExtendedWaitCounts());
1149
1150 if (!SIInstrInfo::isSALU(MI: *MI) && !SIInstrInfo::isSMRD(MI: *MI))
1151 return false;
1152
1153 if (MI->getNumDefs() == 0)
1154 return false;
1155
1156 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1157
1158 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1159 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I) &&
1160 !SIInstrInfo::isFLAT(MI: I))
1161 return false;
1162
1163 for (const MachineOperand &Def : MI->defs()) {
1164 const MachineOperand *Op =
1165 I.findRegisterUseOperand(Def.getReg(), TRI, false);
1166 if (!Op)
1167 continue;
1168 return true;
1169 }
1170 return false;
1171 };
1172
1173 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1174 return SIInstrInfo::isVALU(MI) ||
1175 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1176 !MI.getOperand(0).getImm()) ||
1177 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1178 AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1179 };
1180
1181 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1182 std::numeric_limits<int>::max())
1183 return false;
1184
1185 const SIInstrInfo *TII = ST.getInstrInfo();
1186 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1187 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1188 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1189 return true;
1190}
1191
1192bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1193 if (!ST.hasSMEMtoVectorWriteHazard())
1194 return false;
1195 assert(!ST.hasExtendedWaitCounts());
1196
1197 if (!SIInstrInfo::isVALU(MI: *MI))
1198 return false;
1199
1200 unsigned SDSTName;
1201 switch (MI->getOpcode()) {
1202 case AMDGPU::V_READLANE_B32:
1203 case AMDGPU::V_READFIRSTLANE_B32:
1204 SDSTName = AMDGPU::OpName::vdst;
1205 break;
1206 default:
1207 SDSTName = AMDGPU::OpName::sdst;
1208 break;
1209 }
1210
1211 const SIInstrInfo *TII = ST.getInstrInfo();
1212 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1213 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
1214 const MachineOperand *SDST = TII->getNamedOperand(MI&: *MI, OperandName: SDSTName);
1215 if (!SDST) {
1216 for (const auto &MO : MI->implicit_operands()) {
1217 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(MO.getReg()))) {
1218 SDST = &MO;
1219 break;
1220 }
1221 }
1222 }
1223
1224 if (!SDST)
1225 return false;
1226
1227 const Register SDSTReg = SDST->getReg();
1228 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1229 return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1230 };
1231
1232 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1233 if (TII->isSALU(MI)) {
1234 switch (MI.getOpcode()) {
1235 case AMDGPU::S_SETVSKIP:
1236 case AMDGPU::S_VERSION:
1237 case AMDGPU::S_WAITCNT_VSCNT:
1238 case AMDGPU::S_WAITCNT_VMCNT:
1239 case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
1241 return false;
1242 case AMDGPU::S_WAITCNT_LGKMCNT:
1243 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1244 return (MI.getOperand(1).getImm() == 0) &&
1245 (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1246 case AMDGPU::S_WAITCNT: {
1247 const int64_t Imm = MI.getOperand(i: 0).getImm();
1248 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(Version: IV, Encoded: Imm);
1249 // DsCnt corresponds to LGKMCnt here.
1250 return (Decoded.DsCnt == 0);
1251 }
1252 default:
1253 // SOPP instructions cannot mitigate the hazard.
1254 if (TII->isSOPP(MI))
1255 return false;
1256 // At this point the SALU can be assumed to mitigate the hazard
1257 // because either:
1258 // (a) it is independent of the at risk SMEM (breaking chain),
1259 // or
1260 // (b) it is dependent on the SMEM, in which case an appropriate
1261 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1262 // SMEM instruction.
1263 return true;
1264 }
1265 }
1266 return false;
1267 };
1268
1269 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1270 std::numeric_limits<int>::max())
1271 return false;
1272
1273 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1274 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1275 .addImm(0);
1276 return true;
1277}
1278
1279bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1280 if (!ST.hasVcmpxExecWARHazard())
1281 return false;
1282 assert(!ST.hasExtendedWaitCounts());
1283
1284 if (!SIInstrInfo::isVALU(MI: *MI))
1285 return false;
1286
1287 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1288 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1289 return false;
1290
1291 auto IsHazardFn = [TRI](const MachineInstr &I) {
1292 if (SIInstrInfo::isVALU(MI: I))
1293 return false;
1294 return I.readsRegister(AMDGPU::EXEC, TRI);
1295 };
1296
1297 const SIInstrInfo *TII = ST.getInstrInfo();
1298 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1299 if (SIInstrInfo::isVALU(MI)) {
1300 if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1301 return true;
1302 for (auto MO : MI.implicit_operands())
1303 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(MO.getReg())))
1304 return true;
1305 }
1306 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1307 AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1308 return true;
1309 return false;
1310 };
1311
1312 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1313 std::numeric_limits<int>::max())
1314 return false;
1315
1316 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1317 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1318 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1319 return true;
1320}
1321
1322static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1323 const GCNSubtarget &ST) {
1324 if (!ST.hasLdsBranchVmemWARHazard())
1325 return false;
1326
1327 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1328 // instructions need to appear in the same function.
1329 bool HasLds = false;
1330 bool HasVmem = false;
1331 for (auto &MBB : MF) {
1332 for (auto &MI : MBB) {
1333 HasLds |= SIInstrInfo::isDS(MI);
1334 HasVmem |=
1335 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1336 if (HasLds && HasVmem)
1337 return true;
1338 }
1339 }
1340 return false;
1341}
1342
1343static bool isStoreCountWaitZero(const MachineInstr &I) {
1344 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1345 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1346 !I.getOperand(1).getImm();
1347}
1348
1349bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1350 if (!RunLdsBranchVmemWARHazardFixup)
1351 return false;
1352
1353 assert(ST.hasLdsBranchVmemWARHazard());
1354 assert(!ST.hasExtendedWaitCounts());
1355
1356 auto IsHazardInst = [](const MachineInstr &MI) {
1357 if (SIInstrInfo::isDS(MI))
1358 return 1;
1359 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1360 return 2;
1361 return 0;
1362 };
1363
1364 auto InstType = IsHazardInst(*MI);
1365 if (!InstType)
1366 return false;
1367
1368 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1369 return IsHazardInst(I) || isStoreCountWaitZero(I);
1370 };
1371
1372 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1373 if (!I.isBranch())
1374 return false;
1375
1376 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1377 auto InstType2 = IsHazardInst(I);
1378 return InstType2 && InstType != InstType2;
1379 };
1380
1381 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1382 auto InstType2 = IsHazardInst(I);
1383 if (InstType == InstType2)
1384 return true;
1385
1386 return isStoreCountWaitZero(I);
1387 };
1388
1389 return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) !=
1390 std::numeric_limits<int>::max();
1391 };
1392
1393 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1394 std::numeric_limits<int>::max())
1395 return false;
1396
1397 const SIInstrInfo *TII = ST.getInstrInfo();
1398 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1399 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1400 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1401 .addImm(0);
1402
1403 return true;
1404}
1405
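// Determine how many in-flight VALU results an LDSDIR instruction must wait
// for (WAR/WAW on its destination VGPR) and encode that in its waitvdst
// field.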
1406bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1407 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1408 return false;
1409
1410 const int NoHazardWaitStates = 15;
1411 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1412 const Register VDSTReg = VDST->getReg();
1413
1414 bool VisitedTrans = false;
1415 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1416 if (!SIInstrInfo::isVALU(MI: I))
1417 return false;
1418 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(MI: I);
1419 // Cover both WAR and WAW
1420 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1421 };
1422 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1423 if (WaitStates >= NoHazardWaitStates)
1424 return true;
1425 // Instructions which cause va_vdst==0 expire hazard
1426 return SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isFLAT(MI: I) ||
1427 SIInstrInfo::isDS(MI: I) || SIInstrInfo::isEXP(MI: I);
1428 };
1429 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1430 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1431 };
1432
1433 DenseSet<const MachineBasicBlock *> Visited;
1434 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1435 std::next(x: MI->getReverseIterator()), 0,
1436 IsExpiredFn, Visited, GetWaitStatesFn);
1437
1438 // Transcendentals can execute in parallel to other VALUs.
1439 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1440 if (VisitedTrans)
1441 Count = 0;
1442
1443 MachineOperand *WaitVdstOp =
1444 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1445 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1446
1447 return true;
1448}
1449
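// An LDSDIR instruction must not issue while a VMEM/FLAT/DS access to its
// destination VGPR is still outstanding; either set its waitvsrc field (when
// supported) or insert an s_waitcnt_depctr vm_vsrc(0) before it.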
1450bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1451 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1452 return false;
1453
1454 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1455 const Register VDSTReg = VDST->getReg();
1456
1457 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1458 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isFLAT(MI: I) &&
1459 !SIInstrInfo::isDS(MI: I))
1460 return false;
1461 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1462 };
1463 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1464 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1465 // according to the type of VMEM instruction.
1466 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1467 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1468 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1469 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1470 AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
1471 (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
1472 !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
1473 };
1474
1475 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1476 std::numeric_limits<int>::max())
1477 return false;
1478
1479 if (LdsdirCanWait) {
1480 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1481 } else {
1482 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1483 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1484 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1485 }
1486
1487 return true;
1488}
1489
1490bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1491 if (!ST.hasVALUPartialForwardingHazard())
1492 return false;
1493 assert(!ST.hasExtendedWaitCounts());
1494
1495 if (!ST.isWave64() || !SIInstrInfo::isVALU(MI: *MI))
1496 return false;
1497
1498 SmallSetVector<Register, 4> SrcVGPRs;
1499
1500 for (const MachineOperand &Use : MI->explicit_uses()) {
1501 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1502 SrcVGPRs.insert(X: Use.getReg());
1503 }
1504
1505 // Only applies with >= 2 unique VGPR sources
1506 if (SrcVGPRs.size() <= 1)
1507 return false;
1508
1509 // Look for the following pattern:
1510 // Va <- VALU [PreExecPos]
1511 // intv1
1512 // Exec <- SALU [ExecPos]
1513 // intv2
1514 // Vb <- VALU [PostExecPos]
1515 // intv3
1516 // MI Va, Vb (WaitState = 0)
1517 //
1518 // Where:
1519 // intv1 + intv2 <= 2 VALUs
1520 // intv3 <= 4 VALUs
1521 //
1522 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1523
1524 const int Intv1plus2MaxVALUs = 2;
1525 const int Intv3MaxVALUs = 4;
1526 const int IntvMaxVALUs = 6;
1527 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1528
1529 struct StateType {
1530 SmallDenseMap<Register, int, 4> DefPos;
1531 int ExecPos = std::numeric_limits<int>::max();
1532 int VALUs = 0;
1533 };
1534
1535 StateType State;
1536
  // This lambda performs both the hazard detection and the expiry testing.
1538 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1539 // Too many VALU states have passed
1540 if (State.VALUs > NoHazardVALUWaitStates)
1541 return HazardExpired;
1542
1543 // Instructions which cause va_vdst==0 expire hazard
1544 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1545 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1546 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1547 AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1548 return HazardExpired;
1549
    // Track register writes.
1551 bool Changed = false;
1552 if (SIInstrInfo::isVALU(MI: I)) {
1553 for (Register Src : SrcVGPRs) {
1554 if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Src, &TRI)) {
1555 State.DefPos[Src] = State.VALUs;
1556 Changed = true;
1557 }
1558 }
1559 } else if (SIInstrInfo::isSALU(MI: I)) {
1560 if (State.ExecPos == std::numeric_limits<int>::max()) {
1561 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1562 State.ExecPos = State.VALUs;
1563 Changed = true;
1564 }
1565 }
1566 }
1567
1568 // Early expiration: too many VALUs in intv3
1569 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1570 return HazardExpired;
1571
1572 // Only evaluate state if something changed
1573 if (!Changed)
1574 return NoHazardFound;
1575
1576 // Determine positions of VALUs pre/post exec change
1577 if (State.ExecPos == std::numeric_limits<int>::max())
1578 return NoHazardFound;
1579
1580 int PreExecPos = std::numeric_limits<int>::max();
1581 int PostExecPos = std::numeric_limits<int>::max();
1582
1583 for (auto Entry : State.DefPos) {
1584 int DefVALUs = Entry.second;
1585 if (DefVALUs != std::numeric_limits<int>::max()) {
1586 if (DefVALUs >= State.ExecPos)
1587 PreExecPos = std::min(a: PreExecPos, b: DefVALUs);
1588 else
1589 PostExecPos = std::min(a: PostExecPos, b: DefVALUs);
1590 }
1591 }
1592
    // Need a VALU def after the exec change.
1594 if (PostExecPos == std::numeric_limits<int>::max())
1595 return NoHazardFound;
1596
1597 // Too many VALUs in intv3?
1598 int Intv3VALUs = PostExecPos;
1599 if (Intv3VALUs > Intv3MaxVALUs)
1600 return HazardExpired;
1601
1602 // Too many VALUs in intv2?
1603 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1604 if (Intv2VALUs > Intv1plus2MaxVALUs)
1605 return HazardExpired;
1606
    // Need a VALU def before the exec change.
1608 if (PreExecPos == std::numeric_limits<int>::max())
1609 return NoHazardFound;
1610
1611 // Too many VALUs in intv1?
1612 int Intv1VALUs = PreExecPos - State.ExecPos;
1613 if (Intv1VALUs > Intv1plus2MaxVALUs)
1614 return HazardExpired;
1615
1616 // Too many VALUs in intv1 + intv2
1617 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1618 return HazardExpired;
1619
1620 return HazardFound;
1621 };
1622 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1623 if (SIInstrInfo::isVALU(MI))
1624 State.VALUs += 1;
1625 };
1626
1627 DenseSet<const MachineBasicBlock *> Visited;
1628 if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(),
1629 I: std::next(x: MI->getReverseIterator()), Visited))
1630 return false;
1631
1632 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1633 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1634 .addImm(0x0fff);
1635
1636 return true;
1637}
1638
1639bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1640 if (!ST.hasVALUTransUseHazard())
1641 return false;
1642 assert(!ST.hasExtendedWaitCounts());
1643
1644 if (!SIInstrInfo::isVALU(MI: *MI))
1645 return false;
1646
1647 SmallSet<Register, 4> SrcVGPRs;
1648
1649 for (const MachineOperand &Use : MI->explicit_uses()) {
1650 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1651 SrcVGPRs.insert(V: Use.getReg());
1652 }
1653
1654 // Look for the following pattern:
1655 // Va <- TRANS VALU
1656 // intv
1657 // MI Va (WaitState = 0)
1658 //
1659 // Where:
1660 // intv <= 5 VALUs / 1 TRANS
1661 //
1662 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1663
1664 const int IntvMaxVALUs = 5;
1665 const int IntvMaxTRANS = 1;
1666
1667 struct StateType {
1668 int VALUs = 0;
1669 int TRANS = 0;
1670 };
1671
1672 StateType State;
1673
  // This lambda performs both the hazard detection and the expiry testing.
1675 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1676 // Too many VALU states have passed
1677 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1678 return HazardExpired;
1679
1680 // Instructions which cause va_vdst==0 expire the hazard
1681 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1682 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1683 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1684 I.getOperand(0).getImm() == 0x0fff))
1685 return HazardExpired;
1686
1687 // Check whether a TRANS instruction writes one of MI's source VGPRs.
1688 if (SIInstrInfo::isTRANS(MI: I)) {
1689 for (Register Src : SrcVGPRs) {
1690 if (I.modifiesRegister(Src, &TRI)) {
1691 return HazardFound;
1692 }
1693 }
1694 }
1695
1696 return NoHazardFound;
1697 };
1698 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1699 if (SIInstrInfo::isVALU(MI))
1700 State.VALUs += 1;
1701 if (SIInstrInfo::isTRANS(MI))
1702 State.TRANS += 1;
1703 };
1704
1705 DenseSet<const MachineBasicBlock *> Visited;
1706 if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(),
1707 I: std::next(x: MI->getReverseIterator()), Visited))
1708 return false;
1709
1710 // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1711 // hazard is avoided.
1712 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1713 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1714 .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1715
1716 return true;
1717}
1718
1719bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1720 if (!SIInstrInfo::isWMMA(MI: *MI) && !SIInstrInfo::isSWMMAC(MI: *MI))
1721 return false;
1722
1723 const SIInstrInfo *TII = ST.getInstrInfo();
1724 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1725
1726 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1727 if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I))
1728 return false;
1729
1730 // Check if Src0 (matrix A) or Src1 (matrix B) of the current WMMA instruction
1731 // overlaps with the dest (matrix D) of the previous WMMA.
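// For example (illustrative):
//   v_wmma_* v[0:7], ...                ; previous WMMA writes D = v[0:7]
//   v_wmma_* v[16:23], v[0:7], v[8:15]  ; current WMMA reads A from v[0:7] -> hazard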
1732 const Register CurSrc0Reg =
1733 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1734 const Register CurSrc1Reg =
1735 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1736
1737 const Register PrevDstReg =
1738 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1739
1740 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1741 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1742 return true;
1743 }
1744
1745 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1746 // but Index can't overlap with PrevDstReg.
1747 if (AMDGPU::isGFX12Plus(ST)) {
1748 if (SIInstrInfo::isSWMMAC(MI: *MI)) {
1749 const Register CurIndex =
1750 TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1751 if (TRI->regsOverlap(PrevDstReg, CurIndex))
1752 return true;
1753 }
1754 return false;
1755 }
1756
1757 return false;
1758 };
1759
1760 auto IsExpiredFn = [](const MachineInstr &I, int) {
1761 return SIInstrInfo::isVALU(MI: I);
1762 };
1763
1764 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1765 std::numeric_limits<int>::max())
1766 return false;
1767
1768 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1769
1770 return true;
1771}
1772
1773bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1774 if (!ST.hasShift64HighRegBug())
1775 return false;
1776 assert(!ST.hasExtendedWaitCounts());
1777
1778 switch (MI->getOpcode()) {
1779 default:
1780 return false;
1781 case AMDGPU::V_LSHLREV_B64_e64:
1782 case AMDGPU::V_LSHRREV_B64_e64:
1783 case AMDGPU::V_ASHRREV_I64_e64:
1784 break;
1785 }
1786
1787 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1788 if (!Amt->isReg())
1789 return false;
1790
1791 Register AmtReg = Amt->getReg();
1792 const MachineRegisterInfo &MRI = MF.getRegInfo();
1793 // Check if this is the last VGPR in the allocation block.
1794 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1795 return false;
1796
1797 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1798 return false;
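// In other words, the fixup below only applies when the shift amount lives in
// v7, v15, ..., i.e. the last VGPR of an 8-register allocation granule, and
// the following VGPR (if any) is not allocated.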
1799
1800 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1801 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1802 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1803 bool Overlapped = OverlappedSrc || OverlappedDst;
1804
1805 assert(!OverlappedDst || !OverlappedSrc ||
1806 Src1->getReg() == MI->getOperand(0).getReg());
1807 assert(ST.needsAlignedVGPRs());
1808 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1809
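// Strategy (sketch): pick a VGPR (or an aligned VGPR pair when the amount
// overlaps src1/dst) that MI does not touch, V_SWAP the shift amount into it,
// rewrite MI to use the new register, and V_SWAP back after MI.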
1810 Register NewReg;
1811 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1812 : AMDGPU::VGPR_32RegClass) {
1813 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1814 NewReg = Reg;
1815 break;
1816 }
1817 }
1818
1819 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1820 : NewReg;
1821 Register NewAmtLo;
1822
1823 if (Overlapped)
1824 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1825
1826 DebugLoc DL = MI->getDebugLoc();
1827 MachineBasicBlock *MBB = MI->getParent();
1828 // Insert a full wait count because the found register might be pending a wait.
1829 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1830 .addImm(0);
1831
1832 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1833 if (Overlapped)
1834 runOnInstruction(
1835 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1836 .addDef(AmtReg - 1)
1837 .addReg(AmtReg - 1, RegState::Undef)
1838 .addReg(NewAmtLo, RegState::Undef));
1839 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1840 .addDef(AmtReg)
1841 .addReg(AmtReg, RegState::Undef)
1842 .addReg(NewAmt, RegState::Undef));
1843
1844 // Instructions emitted after the current instruction will be picked up
1845 // naturally by the parent loop of the hazard recognizer.
1846 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1847 AmtReg)
1848 .addDef(NewAmt)
1849 .addReg(NewAmt)
1850 .addReg(AmtReg);
1851 if (Overlapped)
1852 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1853 AmtReg - 1)
1854 .addDef(NewAmtLo)
1855 .addReg(NewAmtLo)
1856 .addReg(AmtReg - 1);
1857
1858 // Re-running the hazard recognizer on the modified instruction is not necessary:
1859 // the inserted V_SWAP_B32 instructions have already both read and written the
1860 // new registers, so hazards related to these registers have already been handled.
1861 Amt->setReg(NewAmt);
1862 Amt->setIsKill(false);
1863 // We do not update liveness, so the verifier may see the register as undef.
1864 Amt->setIsUndef();
1865 if (OverlappedDst)
1866 MI->getOperand(i: 0).setReg(NewReg);
1867 if (OverlappedSrc) {
1868 Src1->setReg(NewReg);
1869 Src1->setIsKill(false);
1870 Src1->setIsUndef();
1871 }
1872
1873 return true;
1874}
1875
1876int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1877 int NSAtoVMEMWaitStates = 1;
1878
1879 if (!ST.hasNSAtoVMEMBug())
1880 return 0;
1881
1882 if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isMTBUF(MI: *MI))
1883 return 0;
1884
1885 const SIInstrInfo *TII = ST.getInstrInfo();
1886 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1887 if (!Offset || (Offset->getImm() & 6) == 0)
1888 return 0;
1889
1890 auto IsHazardFn = [TII](const MachineInstr &I) {
1891 if (!SIInstrInfo::isMIMG(MI: I))
1892 return false;
1893 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode());
1894 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1895 TII->getInstSizeInBytes(I) >= 16;
1896 };
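// i.e. the hazard is an NSA-encoded MIMG instruction (>= 16 bytes, so more
// than one NSA dword) immediately preceding this buffer access; a single wait
// state is sufficient.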
1897
1898 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: 1);
1899}
1900
1901int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1902 int FPAtomicToDenormModeWaitStates = 3;
1903
1904 if (!ST.hasFPAtomicToDenormModeHazard())
1905 return 0;
1906 assert(!ST.hasExtendedWaitCounts());
1907
1908 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1909 return 0;
1910
1911 auto IsHazardFn = [](const MachineInstr &I) {
1912 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isFLAT(MI: I))
1913 return false;
1914 return SIInstrInfo::isFPAtomic(MI: I);
1915 };
1916
1917 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1918 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1919 return true;
1920
1921 switch (MI.getOpcode()) {
1922 case AMDGPU::S_WAITCNT:
1923 case AMDGPU::S_WAITCNT_VSCNT:
1924 case AMDGPU::S_WAITCNT_VMCNT:
1925 case AMDGPU::S_WAITCNT_EXPCNT:
1926 case AMDGPU::S_WAITCNT_LGKMCNT:
1927 case AMDGPU::S_WAIT_IDLE:
1928 return true;
1929 default:
1930 break;
1931 }
1932
1933 return false;
1934 };
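// Summary: an FP atomic (VMEM or FLAT) followed within 3 wait states by
// s_denorm_mode is hazardous; any intervening VALU or one of the wait-count
// instructions listed above resolves it.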
1935
1936 return FPAtomicToDenormModeWaitStates -
1937 ::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn);
1938}
1939
1940int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1941 assert(SIInstrInfo::isMAI(*MI));
1942
1943 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1944}
1945
1946int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1947 // Early exit if no padding is requested.
1948 if (MFMAPaddingRatio == 0)
1949 return 0;
1950
1951 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1952 if (!SIInstrInfo::isMFMA(MI: *MI) || MFI->getOccupancy() < 2)
1953 return 0;
1954
1955 int NeighborMFMALatency = 0;
1956 auto IsNeighboringMFMA = [&NeighborMFMALatency,
1957 this](const MachineInstr &MI) {
1958 if (!SIInstrInfo::isMFMA(MI))
1959 return false;
1960
1961 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1962 return true;
1963 };
1964
1965 const int MaxMFMAPipelineWaitStates = 16;
1966 int WaitStatesSinceNeighborMFMA =
1967 getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates);
1968
1969 int NeighborMFMAPaddingNeeded =
1970 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1971 WaitStatesSinceNeighborMFMA;
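// For example, with amdgpu-mfma-padding-ratio=50 and a neighboring MFMA of
// latency 16, the target padding is 16 * 50 / 100 = 8 wait states; if 3 wait
// states have already passed since that MFMA, 5 more are requested.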
1972
1973 return std::max(a: 0, b: NeighborMFMAPaddingNeeded);
1974}
1975
1976int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1977 int WaitStatesNeeded = 0;
1978 unsigned Opc = MI->getOpcode();
1979
1980 auto IsVALUFn = [](const MachineInstr &MI) {
1981 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
1982 };
1983
1984 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1985 const int LegacyVALUWritesVGPRWaitStates = 2;
1986 const int VALUWritesExecWaitStates = 4;
1987 const int MaxWaitStates = 4;
1988
1989 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1990 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1991 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
1992
1993 if (WaitStatesNeeded < MaxWaitStates) {
1994 for (const MachineOperand &Use : MI->explicit_uses()) {
1995 const int MaxWaitStates = 2;
1996
1997 if (!Use.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1998 continue;
1999
2000 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2001 getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
2002 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2003
2004 if (WaitStatesNeeded == MaxWaitStates)
2005 break;
2006 }
2007 }
2008 }
2009
2010 for (const MachineOperand &Op : MI->explicit_operands()) {
2011 if (!Op.isReg() || !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2012 continue;
2013
2014 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2015 continue;
2016
2017 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2018 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2019 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2020 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2021 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2022 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2023 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2024 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2025 const int MaxWaitStates = 18;
2026 Register Reg = Op.getReg();
2027 unsigned HazardDefLatency = 0;
2028
2029 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2030 this](const MachineInstr &MI) {
2031 if (!SIInstrInfo::isMFMA(MI))
2032 return false;
2033 Register DstReg = MI.getOperand(i: 0).getReg();
2034 if (DstReg == Reg)
2035 return false;
2036 HazardDefLatency =
2037 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2038 return TRI.regsOverlap(DstReg, Reg);
2039 };
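// A HazardDefLatency of 2, 8 or 16 corresponds to a 4x4, 16x16 or 32x32 MFMA
// respectively, matching the cases in the switches below.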
2040
2041 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2042 MaxWaitStates);
2043 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2044 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2045 int OpNo = Op.getOperandNo();
2046 if (OpNo == SrcCIdx) {
2047 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2048 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2049 switch (HazardDefLatency) {
2050 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2051 break;
2052 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2053 break;
2054 case 16: [[fallthrough]];
2055 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2056 break;
2057 }
2058 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2059 switch (HazardDefLatency) {
2060 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2061 break;
2062 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2063 break;
2064 case 16: [[fallthrough]];
2065 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2066 break;
2067 }
2068 }
2069
2070 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2071 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2072
2073 if (WaitStatesNeeded == MaxWaitStates)
2074 return WaitStatesNeeded; // Early exit.
2075
2076 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2077 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2078 return false;
2079 Register DstReg = MI.getOperand(i: 0).getReg();
2080 return TRI.regsOverlap(Reg, DstReg);
2081 };
2082
2083 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2084 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2085 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2086 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2087 if (OpNo == SrcCIdx)
2088 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2089 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2090 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2091
2092 WaitStatesNeededForUse = NeedWaitStates -
2093 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2094 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2095
2096 if (WaitStatesNeeded == MaxWaitStates)
2097 return WaitStatesNeeded; // Early exit.
2098 }
2099
2100 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2101 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2102 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2103 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2104 const int MaxWaitStates = 13;
2105 Register DstReg = MI->getOperand(i: 0).getReg();
2106 unsigned HazardDefLatency = 0;
2107
2108 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2109 this](const MachineInstr &MI) {
2110 if (!SIInstrInfo::isMFMA(MI))
2111 return false;
2112 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2113 HazardDefLatency =
2114 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2115 return TRI.regsOverlap(Reg, DstReg);
2116 };
2117
2118 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2119 int NeedWaitStates;
2120 switch (HazardDefLatency) {
2121 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2122 break;
2123 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2124 break;
2125 case 16: [[fallthrough]];
2126 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2127 break;
2128 }
2129
2130 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2131 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2132 }
2133
2134 // Pad neighboring MFMA with noops for better inter-wave performance.
2135 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2136
2137 return WaitStatesNeeded;
2138}
2139
2140static int
2141GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2142 // 2 pass -> 3
2143 // 4 pass -> 5
2144 // 8 pass -> 9
2145 // 16 pass -> 17
2146 return NumPasses + 1;
2147}
2148
2149static int
2150GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2151 // 2 pass -> 2
2152 // 4 pass -> 4
2153 // 8 pass -> 8
2154 // 16 pass -> 16
2155 return NumPasses;
2156}
2157
2158static int
2159GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2160 // 2 pass -> 4
2161 // 4 pass -> 6
2162 // 8 pass -> 10
2163 // 16 pass -> 18
2164 return NumPasses + 2;
2165}
2166
2167static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2168 // 2 pass -> 5
2169 // 4 pass -> 7
2170 // 8 pass -> 11
2171 // 16 pass -> 19
2172 return NumPasses + 3;
2173}
2174
2175int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2176 int WaitStatesNeeded = 0;
2177 unsigned Opc = MI->getOpcode();
2178
2179 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2180 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2181 };
2182
2183 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2184 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2185 !SIInstrInfo::isDOT(MI);
2186 };
2187
2188 if (!SIInstrInfo::isMFMA(MI: *MI))
2189 return WaitStatesNeeded;
2190
2191 const int VALUWritesExecWaitStates = 4;
2192 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2193 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2194 VALUWritesExecWaitStates);
2195 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2196
2197 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2198
2199 // Loop for both the DGEMM and S/HGEMM second-instruction cases.
2200 for (const MachineOperand &Use : MI->explicit_uses()) {
2201 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2202 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2203 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2204 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2205 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2206 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2207 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2208 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2209 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2210 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2211 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2212 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2213 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2214 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2215 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2216 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2217 const int MaxWaitStates = 19;
2218
2219 if (!Use.isReg())
2220 continue;
2221 Register Reg = Use.getReg();
2222 bool FullReg;
2223 const MachineInstr *MI1;
2224
2225 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2226 this](const MachineInstr &MI) {
2227 if (!SIInstrInfo::isMFMA(MI))
2228 return false;
2229 Register DstReg = MI.getOperand(i: 0).getReg();
2230 FullReg = (DstReg == Reg);
2231 MI1 = &MI;
2232 return TRI.regsOverlap(DstReg, Reg);
2233 };
2234
2235 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2236 getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates);
2237 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2238
2239 int NumWaitStates =
2240 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2241 if (NumWaitStates == std::numeric_limits<int>::max())
2242 continue;
2243
2244 int OpNo = Use.getOperandNo();
2245 unsigned Opc1 = MI1->getOpcode();
2246 int NeedWaitStates = 0;
2247 if (OpNo == SrcCIdx) {
2248 if (!isDGEMM(Opcode: Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opcode: Opc1))) {
2249 NeedWaitStates = 0;
2250 } else if (FullReg) {
2251 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2252 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2253 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2254 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2255 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2256 else if (ST.hasGFX940Insts() &&
2257 TSchedModel.computeInstrLatency(MI: MI1) == 2)
2258 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2259 } else {
2260 switch (Opc1) {
2261 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2262 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2263 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2264 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2265 if (!isXDL(ST, MI: *MI))
2266 NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2267 break;
2268 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2269 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2270 if (!isXDL(ST, MI: *MI))
2271 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2272 break;
2273 default:
2274 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2275 if (ST.hasGFX940Insts()) {
2276 if (isXDL(ST, MI: *MI) && !isXDL(ST, MI: *MI1))
2277 break;
2278
2279 NeedWaitStates =
2280 isXDL(ST, MI: *MI1)
2281 ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2282 NumPasses)
2283 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2284 NumPasses);
2285 break;
2286 }
2287
2288 switch (NumPasses) {
2289 case 2:
2290 NeedWaitStates =
2291 isDGEMM(Opcode: Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2292 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2293 break;
2294 case 8:
2295 NeedWaitStates =
2296 isDGEMM(Opcode: Opc)
2297 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2298 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2299 break;
2300 case 16:
2301 NeedWaitStates =
2302 isDGEMM(Opcode: Opc)
2303 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2304 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2305 break;
2306 default:
2307 llvm_unreachable("unexpected number of passes");
2308 }
2309 }
2310 }
2311 } else {
2312 switch (Opc1) {
2313 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2314 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2315 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2316 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2317 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2318 break;
2319 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2320 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2321 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2322 break;
2323 default:
2324 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2325
2326 if (ST.hasGFX940Insts()) {
2327 NeedWaitStates =
2328 isXDL(ST, MI: *MI1)
2329 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2330 NumPasses)
2331 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2332 NumPasses);
2333 break;
2334 }
2335
2336 switch (NumPasses) {
2337 case 2:
2338 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2339 break;
2340 case 4:
2341 llvm_unreachable("unexpected number of passes for mfma");
2342 case 8:
2343 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2344 break;
2345 case 16: [[fallthrough]];
2346 default:
2347 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2348 }
2349 }
2350 }
2351 if (WaitStatesNeeded >= NeedWaitStates)
2352 continue;
2353
2354 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2355 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2356
2357 if (WaitStatesNeeded == MaxWaitStates)
2358 break;
2359 }
2360
2361 // Pad neighboring MFMA with noops for better inter-wave performance.
2362 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2363
2364 return WaitStatesNeeded;
2365}
2366
2367int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2368 // On gfx90a+ the relevant hazards are checked in checkMAIVALUHazards().
2369 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2370 return 0;
2371
2372 int WaitStatesNeeded = 0;
2373
2374 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2375 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2376 };
2377
2378 for (const MachineOperand &Op : MI->explicit_uses()) {
2379 if (!Op.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2380 continue;
2381
2382 Register Reg = Op.getReg();
2383
2384 const int AccVgprReadLdStWaitStates = 2;
2385 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2386 const int MaxWaitStates = 2;
2387
2388 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2389 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2390 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2391
2392 if (WaitStatesNeeded == MaxWaitStates)
2393 return WaitStatesNeeded; // Early exit.
2394
2395 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2396 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2397 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2398 return false;
2399 auto IsVALUFn = [](const MachineInstr &MI) {
2400 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2401 };
2402 return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: 2 /*MaxWaitStates*/) <
2403 std::numeric_limits<int>::max();
2404 };
2405
2406 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2407 getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates);
2408 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2409 }
2410
2411 return WaitStatesNeeded;
2412}
2413
2414static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2415 // 2 pass -> 4
2416 // 4 pass -> 6
2417 // 8 pass -> 10
2418 // 16 pass -> 18
2419 return NumPasses + 2;
2420}
2421
2422static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2423 // 2 pass -> 5
2424 // 4 pass -> 7
2425 // 8 pass -> 11
2426 // 16 pass -> 19
2427 return NumPasses + 3;
2428}
2429
2430static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2431 // 2 pass -> 5
2432 // 4 pass -> 7
2433 // 8 pass -> 11
2434 // 16 pass -> 19
2435 return NumPasses + 3;
2436}
2437
2438static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2439 // 2 pass -> 4
2440 // 4 pass -> 6
2441 // 8 pass -> 10
2442 // 16 pass -> 18
2443 return NumPasses + 2;
2444}
2445
2446int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2447 if (!ST.hasGFX90AInsts())
2448 return 0;
2449
2450 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2451 return isDGEMM(Opcode: MI.getOpcode());
2452 };
2453
2454 // This is checked in checkMAIHazards90A()
2455 if (SIInstrInfo::isMFMA(MI: *MI))
2456 return 0;
2457
2458 const MachineRegisterInfo &MRI = MF.getRegInfo();
2459
2460 int WaitStatesNeeded = 0;
2461
2462 bool IsMem = SIInstrInfo::isVMEM(MI: *MI) ||
2463 SIInstrInfo::isFLAT(MI: *MI) ||
2464 SIInstrInfo::isDS(MI: *MI);
2465 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(MI: *MI);
2466 bool IsVALU = SIInstrInfo::isVALU(MI: *MI);
2467
2468 const MachineInstr *MFMA = nullptr;
2469 unsigned Reg;
2470 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2471 if (!SIInstrInfo::isMFMA(MI) ||
2472 !TRI.regsOverlap(MI.getOperand(i: 0).getReg(), Reg))
2473 return false;
2474 MFMA = &MI;
2475 return true;
2476 };
2477
2478 const MachineInstr *DOT = nullptr;
2479 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2480 if (!SIInstrInfo::isDOT(MI) ||
2481 !TRI.regsOverlap(MI.getOperand(i: 0).getReg(), Reg))
2482 return false;
2483 DOT = &MI;
2484 return true;
2485 };
2486
2487 bool DGEMMAfterVALUWrite = false;
2488 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2489 // Found DGEMM on reverse traversal to def.
2490 if (isDGEMM(Opcode: MI.getOpcode()))
2491 DGEMMAfterVALUWrite = true;
2492
2493 // Only a hazard if the register is defined by a VALU and a DGEMM is found
2494 // after the def.
2495 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2496 return false;
2497
2498 return true;
2499 };
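// i.e. the hazardous program-order sequence is: a VALU writes the register,
// then a DGEMM issues, then this VMEM/FLAT/DS instruction reads the register.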
2500
2501 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2502 AMDGPU::OpName::src2);
2503
2504 if (IsMemOrExport || IsVALU) {
2505 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2506 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2507 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2508 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2509 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2510 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2511 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2512 const int DotWriteSameDotReadSrcAB = 3;
2513 const int DotWriteDifferentVALURead = 3;
2514 const int DMFMABetweenVALUWriteVMEMRead = 2;
2515 const int MaxWaitStates = 19;
2516
2517 for (const MachineOperand &Use : MI->explicit_uses()) {
2518 if (!Use.isReg())
2519 continue;
2520 Reg = Use.getReg();
2521
2522 DOT = nullptr;
2523 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
2524 Limit: MaxWaitStates);
2525 if (DOT) {
2526 int NeedWaitStates = 0;
2527 if (DOT->getOpcode() == MI->getOpcode()) {
2528 if (&Use - &MI->getOperand(i: 0) != SrcCIdx)
2529 NeedWaitStates = DotWriteSameDotReadSrcAB;
2530 } else {
2531 NeedWaitStates = DotWriteDifferentVALURead;
2532 }
2533
2534 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2535 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2536 }
2537
2538 // Workaround for a HW data hazard bug observed only on GFX90A. When there
2539 // is a DGEMM instruction in-between a VALU and a VMEM instruction, the SQ
2540 // incorrectly fails to insert the two wait states between the two
2541 // instructions that are needed to avoid the data hazard.
2542 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2543 DGEMMAfterVALUWrite = false;
2544 if (TRI.isVectorRegister(MRI, Reg)) {
2545 int WaitStatesNeededForUse =
2546 DMFMABetweenVALUWriteVMEMRead -
2547 getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard,
2548 Limit: DMFMABetweenVALUWriteVMEMRead);
2549
2550 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2551 }
2552 }
2553
2554 MFMA = nullptr;
2555 WaitStatesSinceDef =
2556 getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
2557 if (!MFMA)
2558 continue;
2559
2560 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
2561 int NumPasses = HazardDefLatency;
2562 int NeedWaitStates = MaxWaitStates;
2563
2564 if (isDGEMM(Opcode: MFMA->getOpcode())) {
2565 switch (HazardDefLatency) {
2566 case 4:
2567 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2568 : DMFMA4x4WriteVgprVALUReadWaitStates;
2569 break;
2570 case 8:
2571 case 16:
2572 NeedWaitStates = IsMemOrExport
2573 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2574 : DMFMA16x16WriteVgprVALUReadWaitStates;
2575 break;
2576 default:
2577 llvm_unreachable("unexpected dgemm");
2578 }
2579 } else if (ST.hasGFX940Insts()) {
2580 NeedWaitStates =
2581 isXDL(ST, MI: *MFMA)
2582 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2583 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2584 NumPasses);
2585 } else {
2586 switch (HazardDefLatency) {
2587 case 2:
2588 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2589 break;
2590 case 8:
2591 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2592 break;
2593 case 16:
2594 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2595 break;
2596 default:
2597 llvm_unreachable("unexpected number of passes for mfma");
2598 }
2599 }
2600
2601 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2602 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2603
2604 if (WaitStatesNeeded == MaxWaitStates)
2605 break;
2606 }
2607 }
2608
2609 unsigned Opc = MI->getOpcode();
2610 const int DMFMAToFMA64WaitStates = 2;
2611 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2612 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2613 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2614 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2615 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2616 getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates);
2617 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2618 }
2619
2620 if (!IsVALU && !IsMemOrExport)
2621 return WaitStatesNeeded;
2622
2623 for (const MachineOperand &Def : MI->defs()) {
2624 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2625 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2626 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2627 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2628 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2629 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2630 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2631 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2632 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2633 const int DotWriteDifferentVALUWrite = 3;
2634 const int MaxWaitStates = 19;
2635 const int MaxWarWaitStates = 15;
2636
2637 Reg = Def.getReg();
2638
2639 DOT = nullptr;
2640 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
2641 Limit: MaxWaitStates);
2642 if (DOT && DOT->getOpcode() != MI->getOpcode())
2643 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: DotWriteDifferentVALUWrite -
2644 WaitStatesSinceDef);
2645
2646 MFMA = nullptr;
2647 WaitStatesSinceDef =
2648 getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
2649 if (MFMA) {
2650 int NeedWaitStates = MaxWaitStates;
2651 int NumPasses = TSchedModel.computeInstrLatency(MI: MFMA);
2652
2653 if (isDGEMM(Opcode: MFMA->getOpcode())) {
2654 switch (NumPasses) {
2655 case 4:
2656 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2657 break;
2658 case 8:
2659 case 16:
2660 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2661 break;
2662 default:
2663 llvm_unreachable("unexpected number of cycles for dgemm");
2664 }
2665 } else if (ST.hasGFX940Insts()) {
2666 NeedWaitStates =
2667 isXDL(ST, MI: *MFMA)
2668 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2669 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2670 } else {
2671 switch (NumPasses) {
2672 case 2:
2673 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2674 break;
2675 case 8:
2676 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2677 break;
2678 case 16:
2679 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2680 break;
2681 default:
2682 llvm_unreachable("Unexpected number of passes for mfma");
2683 }
2684 }
2685
2686 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2687 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2688
2689 if (WaitStatesNeeded == MaxWaitStates)
2690 break;
2691 }
2692
2693 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2694 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(Opcode: MI.getOpcode()) ||
2695 !MI.readsRegister(Reg, &TRI))
2696 return false;
2697
2698 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2699 return false;
2700
2701 const MachineOperand *SrcC =
2702 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2703 assert(SrcC);
2704 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2705 return false;
2706
2707 MFMA = &MI;
2708 return true;
2709 };
2710
2711 MFMA = nullptr;
2712 int WaitStatesSinceUse = getWaitStatesSince(IsHazard: IsSMFMAReadAsCFn,
2713 Limit: MaxWarWaitStates);
2714 if (!MFMA)
2715 continue;
2716
2717 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
2718 int NeedWaitStates = MaxWaitStates;
2719 switch (HazardDefLatency) {
2720 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2721 break;
2722 case 4: assert(ST.hasGFX940Insts());
2723 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2724 break;
2725 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2726 break;
2727 case 16: [[fallthrough]];
2728 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2729 break;
2730 }
2731
2732 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2733 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2734 }
2735
2736 return WaitStatesNeeded;
2737}
2738
2739bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2740 if (!SU->isInstr())
2741 return false;
2742
2743 const MachineInstr *MAI = nullptr;
2744
2745 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2746 MAI = nullptr;
2747 if (SIInstrInfo::isMFMA(MI))
2748 MAI = &MI;
2749 return MAI != nullptr;
2750 };
2751
2752 MachineInstr *MI = SU->getInstr();
2753 if (IsMFMAFn(*MI)) {
2754 int W = getWaitStatesSince(IsHazard: IsMFMAFn, Limit: 16);
2755 if (MAI)
2756 return W < (int)TSchedModel.computeInstrLatency(MI: MAI);
2757 }
2758
2759 return false;
2760}
2761
2762bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2763 if (!ST.hasVALUMaskWriteHazard())
2764 return false;
2765 assert(!ST.hasExtendedWaitCounts());
2766
2767 if (!ST.isWave64() || !SIInstrInfo::isSALU(MI: *MI))
2768 return false;
2769
2770 // The hazard sequence is three instructions:
2771 // 1. VALU reads SGPR as mask
2772 // 2. SALU writes SGPR
2773 // 3. SALU reads SGPR
2774 // The hazard can expire if the distance between 2 and 3 is sufficient.
2775 // In practice the hazard expires in fewer than 10% of cases, so we always
2776 // assume it exists whenever 1 and 2 are present rather than searching for 3.
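// For example (illustrative):
//   v_cndmask_b32_e64 v0, v1, v2, s[0:1]   ; 1. VALU reads s[0:1] as mask
//   s_mov_b64 s[0:1], exec                 ; 2. SALU writes s[0:1] (this MI)
//   s_and_b64 s[2:3], s[0:1], s[4:5]       ; 3. SALU reads s[0:1]
// The s_waitcnt_depctr sa_sdst(0) inserted after 2 breaks the sequence.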
2777
2778 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2779 if (!SDSTOp || !SDSTOp->isReg())
2780 return false;
2781
2782 const Register HazardReg = SDSTOp->getReg();
2783 if (HazardReg == AMDGPU::EXEC ||
2784 HazardReg == AMDGPU::EXEC_LO ||
2785 HazardReg == AMDGPU::EXEC_HI ||
2786 HazardReg == AMDGPU::M0)
2787 return false;
2788
2789 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2790 switch (I.getOpcode()) {
2791 case AMDGPU::V_ADDC_U32_e32:
2792 case AMDGPU::V_ADDC_U32_dpp:
2793 case AMDGPU::V_CNDMASK_B16_e32:
2794 case AMDGPU::V_CNDMASK_B16_dpp:
2795 case AMDGPU::V_CNDMASK_B32_e32:
2796 case AMDGPU::V_CNDMASK_B32_dpp:
2797 case AMDGPU::V_DIV_FMAS_F32_e64:
2798 case AMDGPU::V_DIV_FMAS_F64_e64:
2799 case AMDGPU::V_SUBB_U32_e32:
2800 case AMDGPU::V_SUBB_U32_dpp:
2801 case AMDGPU::V_SUBBREV_U32_e32:
2802 case AMDGPU::V_SUBBREV_U32_dpp:
2803 // These implicitly read VCC as mask source.
2804 return HazardReg == AMDGPU::VCC ||
2805 HazardReg == AMDGPU::VCC_LO ||
2806 HazardReg == AMDGPU::VCC_HI;
2807 case AMDGPU::V_ADDC_U32_e64:
2808 case AMDGPU::V_ADDC_U32_e64_dpp:
2809 case AMDGPU::V_CNDMASK_B16_e64:
2810 case AMDGPU::V_CNDMASK_B16_e64_dpp:
2811 case AMDGPU::V_CNDMASK_B32_e64:
2812 case AMDGPU::V_CNDMASK_B32_e64_dpp:
2813 case AMDGPU::V_SUBB_U32_e64:
2814 case AMDGPU::V_SUBB_U32_e64_dpp:
2815 case AMDGPU::V_SUBBREV_U32_e64:
2816 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2817 // Only check mask register overlaps.
2818 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2819 assert(SSRCOp);
2820 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2821 }
2822 default:
2823 return false;
2824 }
2825 };
2826
2827 const MachineRegisterInfo &MRI = MF.getRegInfo();
2828 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2829 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2830 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2831 AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2832 return true;
2833
2834 // A VALU access to any SGPR or literal constant other than HazardReg
2835 // mitigates the hazard. There is no need to check HazardReg here as this
2836 // is only called when IsHazardFn has not already matched.
2837 if (!SIInstrInfo::isVALU(MI: I))
2838 return false;
2839 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2840 const MachineOperand &Op = I.getOperand(i: OpNo);
2841 if (Op.isReg()) {
2842 Register OpReg = Op.getReg();
2843 // Only consider uses
2844 if (!Op.isUse())
2845 continue;
2846 // Ignore EXEC
2847 if (OpReg == AMDGPU::EXEC ||
2848 OpReg == AMDGPU::EXEC_LO ||
2849 OpReg == AMDGPU::EXEC_HI)
2850 continue;
2851 // Ignore all implicit uses except VCC
2852 if (Op.isImplicit()) {
2853 if (OpReg == AMDGPU::VCC ||
2854 OpReg == AMDGPU::VCC_LO ||
2855 OpReg == AMDGPU::VCC_HI)
2856 return true;
2857 continue;
2858 }
2859 if (TRI.isSGPRReg(MRI, Reg: OpReg))
2860 return true;
2861 } else {
2862 const MCInstrDesc &InstDesc = I.getDesc();
2863 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2864 if (!TII.isInlineConstant(MO: Op, OpInfo))
2865 return true;
2866 }
2867 }
2868 return false;
2869 };
2870
2871 // Check for hazard
2872 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2873 std::numeric_limits<int>::max())
2874 return false;
2875
2876 auto NextMI = std::next(x: MI->getIterator());
2877
2878 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2879 BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2880 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2881 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2882
2883 // SALU write may be s_getpc in a bundle.
2884 if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2885 // Update offsets of any references in the bundle.
2886 while (NextMI != MI->getParent()->end() &&
2887 NextMI->isBundledWithPred()) {
2888 for (auto &Operand : NextMI->operands()) {
2889 if (Operand.isGlobal())
2890 Operand.setOffset(Operand.getOffset() + 4);
2891 }
2892 NextMI++;
2893 }
2894 }
2895
2896 return true;
2897}
2898
