1 | //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements hazard recognizers for scheduling on GCN processors. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "GCNHazardRecognizer.h" |
14 | #include "GCNSubtarget.h" |
15 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
16 | #include "SIMachineFunctionInfo.h" |
17 | #include "llvm/CodeGen/MachineFunction.h" |
18 | #include "llvm/CodeGen/ScheduleDAG.h" |
19 | #include "llvm/TargetParser/TargetParser.h" |
20 | |
21 | using namespace llvm; |
22 | |
23 | namespace { |
24 | |
25 | struct MFMAPaddingRatioParser : public cl::parser<unsigned> { |
26 | MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {} |
27 | |
28 | bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) { |
29 | if (Arg.getAsInteger(Radix: 0, Result&: Value)) |
30 | return O.error(Message: "'" + Arg + "' value invalid for uint argument!" ); |
31 | |
32 | if (Value > 100) |
33 | return O.error(Message: "'" + Arg + "' value must be in the range [0, 100]!" ); |
34 | |
35 | return false; |
36 | } |
37 | }; |
38 | |
39 | } // end anonymous namespace |
40 | |
41 | static cl::opt<unsigned, false, MFMAPaddingRatioParser> |
42 | MFMAPaddingRatio("amdgpu-mfma-padding-ratio" , cl::init(Val: 0), cl::Hidden, |
43 | cl::desc("Fill a percentage of the latency between " |
44 | "neighboring MFMA with s_nops." )); |
45 | |
46 | //===----------------------------------------------------------------------===// |
47 | // Hazard Recognizer Implementation |
48 | //===----------------------------------------------------------------------===// |
49 | |
50 | static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, |
51 | const GCNSubtarget &ST); |
52 | |
53 | GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : |
54 | IsHazardRecognizerMode(false), |
55 | CurrCycleInstr(nullptr), |
56 | MF(MF), |
57 | ST(MF.getSubtarget<GCNSubtarget>()), |
58 | TII(*ST.getInstrInfo()), |
59 | TRI(TII.getRegisterInfo()), |
60 | ClauseUses(TRI.getNumRegUnits()), |
61 | ClauseDefs(TRI.getNumRegUnits()) { |
62 | MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::PhysReg: AGPR0) ? 19 : 5; |
63 | TSchedModel.init(&ST); |
64 | RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); |
65 | } |
66 | |
67 | void GCNHazardRecognizer::Reset() { |
68 | EmittedInstrs.clear(); |
69 | } |
70 | |
71 | void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { |
72 | EmitInstruction(MI: SU->getInstr()); |
73 | } |
74 | |
75 | void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { |
76 | CurrCycleInstr = MI; |
77 | } |
78 | |
79 | static bool isDivFMas(unsigned Opcode) { |
80 | return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64; |
81 | } |
82 | |
83 | static bool isSGetReg(unsigned Opcode) { |
84 | return Opcode == AMDGPU::S_GETREG_B32; |
85 | } |
86 | |
87 | static bool isSSetReg(unsigned Opcode) { |
88 | switch (Opcode) { |
89 | case AMDGPU::S_SETREG_B32: |
90 | case AMDGPU::S_SETREG_B32_mode: |
91 | case AMDGPU::S_SETREG_IMM32_B32: |
92 | case AMDGPU::S_SETREG_IMM32_B32_mode: |
93 | return true; |
94 | } |
95 | return false; |
96 | } |
97 | |
98 | static bool isRWLane(unsigned Opcode) { |
99 | return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32; |
100 | } |
101 | |
102 | static bool isRFE(unsigned Opcode) { |
103 | return Opcode == AMDGPU::S_RFE_B64; |
104 | } |
105 | |
106 | static bool isSMovRel(unsigned Opcode) { |
107 | switch (Opcode) { |
108 | case AMDGPU::S_MOVRELS_B32: |
109 | case AMDGPU::S_MOVRELS_B64: |
110 | case AMDGPU::S_MOVRELD_B32: |
111 | case AMDGPU::S_MOVRELD_B64: |
112 | return true; |
113 | default: |
114 | return false; |
115 | } |
116 | } |
117 | |
118 | static bool isDGEMM(unsigned Opcode) { |
119 | return AMDGPU::getMAIIsDGEMM(Opc: Opcode); |
120 | } |
121 | |
122 | static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { |
123 | unsigned Opcode = MI.getOpcode(); |
124 | |
125 | if (!SIInstrInfo::isMAI(MI) || |
126 | isDGEMM(Opcode) || |
127 | Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || |
128 | Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) |
129 | return false; |
130 | |
131 | if (!ST.hasGFX940Insts()) |
132 | return true; |
133 | |
134 | return AMDGPU::getMAIIsGFX940XDL(Opc: Opcode); |
135 | } |
136 | |
137 | static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, |
138 | const MachineInstr &MI) { |
139 | if (TII.isAlwaysGDS(Opcode: MI.getOpcode())) |
140 | return true; |
141 | |
142 | switch (MI.getOpcode()) { |
143 | case AMDGPU::S_SENDMSG: |
144 | case AMDGPU::S_SENDMSGHALT: |
145 | case AMDGPU::S_TTRACEDATA: |
146 | return true; |
147 | // These DS opcodes don't support GDS. |
148 | case AMDGPU::DS_NOP: |
149 | case AMDGPU::DS_PERMUTE_B32: |
150 | case AMDGPU::DS_BPERMUTE_B32: |
151 | return false; |
152 | default: |
153 | if (TII.isDS(Opcode: MI.getOpcode())) { |
154 | int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), |
155 | AMDGPU::OpName::gds); |
156 | if (MI.getOperand(i: GDS).getImm()) |
157 | return true; |
158 | } |
159 | return false; |
160 | } |
161 | } |
162 | |
163 | static bool isPermlane(const MachineInstr &MI) { |
164 | unsigned Opcode = MI.getOpcode(); |
165 | return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || |
166 | Opcode == AMDGPU::V_PERMLANE64_B32 || |
167 | Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || |
168 | Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || |
169 | Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64; |
170 | } |
171 | |
172 | static bool isLdsDma(const MachineInstr &MI) { |
173 | return SIInstrInfo::isVALU(MI) && |
174 | (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)); |
175 | } |
176 | |
177 | static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { |
178 | const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, |
179 | AMDGPU::OpName::simm16); |
180 | return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm())); |
181 | } |
182 | |
183 | ScheduleHazardRecognizer::HazardType |
184 | GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { |
185 | MachineInstr *MI = SU->getInstr(); |
186 | // If we are not in "HazardRecognizerMode" and therefore not being run from |
187 | // the scheduler, track possible stalls from hazards but don't insert noops. |
188 | auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard; |
189 | |
190 | if (MI->isBundle()) |
191 | return NoHazard; |
192 | |
193 | if (SIInstrInfo::isSMRD(MI: *MI) && checkSMRDHazards(SMRD: MI) > 0) |
194 | return HazardType; |
195 | |
196 | if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) |
197 | return HazardType; |
198 | |
199 | if (checkFPAtomicToDenormModeHazard(MI) > 0) |
200 | return HazardType; |
201 | |
202 | if (ST.hasNoDataDepHazard()) |
203 | return NoHazard; |
204 | |
205 | // FIXME: Should flat be considered vmem? |
206 | if ((SIInstrInfo::isVMEM(MI: *MI) || |
207 | SIInstrInfo::isFLAT(MI: *MI)) |
208 | && checkVMEMHazards(VMEM: MI) > 0) |
209 | return HazardType; |
210 | |
211 | if (SIInstrInfo::isVALU(MI: *MI) && checkVALUHazards(VALU: MI) > 0) |
212 | return HazardType; |
213 | |
214 | if (SIInstrInfo::isDPP(MI: *MI) && checkDPPHazards(DPP: MI) > 0) |
215 | return HazardType; |
216 | |
217 | if (isDivFMas(Opcode: MI->getOpcode()) && checkDivFMasHazards(DivFMas: MI) > 0) |
218 | return HazardType; |
219 | |
220 | if (isRWLane(Opcode: MI->getOpcode()) && checkRWLaneHazards(RWLane: MI) > 0) |
221 | return HazardType; |
222 | |
223 | if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) || |
224 | SIInstrInfo::isFLAT(MI: *MI) || SIInstrInfo::isDS(MI: *MI) || |
225 | SIInstrInfo::isEXP(MI: *MI)) && checkMAIVALUHazards(MI) > 0) |
226 | return HazardType; |
227 | |
228 | if (isSGetReg(Opcode: MI->getOpcode()) && checkGetRegHazards(GetRegInstr: MI) > 0) |
229 | return HazardType; |
230 | |
231 | if (isSSetReg(Opcode: MI->getOpcode()) && checkSetRegHazards(SetRegInstr: MI) > 0) |
232 | return HazardType; |
233 | |
234 | if (isRFE(Opcode: MI->getOpcode()) && checkRFEHazards(RFE: MI) > 0) |
235 | return HazardType; |
236 | |
237 | if (((ST.hasReadM0MovRelInterpHazard() && |
238 | (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) || |
239 | MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || |
240 | MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || |
241 | (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) || |
242 | (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) || |
243 | (ST.hasReadM0LdsDirectHazard() && |
244 | MI->readsRegister(AMDGPU::Reg: LDS_DIRECT, /*TRI=*/nullptr))) && |
245 | checkReadM0Hazards(SMovRel: MI) > 0) |
246 | return HazardType; |
247 | |
248 | if (SIInstrInfo::isMAI(MI: *MI) && checkMAIHazards(MI) > 0) |
249 | return HazardType; |
250 | |
251 | if ((SIInstrInfo::isVMEM(MI: *MI) || |
252 | SIInstrInfo::isFLAT(MI: *MI) || |
253 | SIInstrInfo::isDS(MI: *MI)) && checkMAILdStHazards(MI) > 0) |
254 | return HazardType; |
255 | |
256 | if (MI->isInlineAsm() && checkInlineAsmHazards(IA: MI) > 0) |
257 | return HazardType; |
258 | |
259 | return NoHazard; |
260 | } |
261 | |
262 | static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, |
263 | unsigned Quantity) { |
264 | while (Quantity > 0) { |
265 | unsigned Arg = std::min(a: Quantity, b: 8u); |
266 | Quantity -= Arg; |
267 | BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) |
268 | .addImm(Arg - 1); |
269 | } |
270 | } |
271 | |
272 | unsigned |
273 | GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const { |
274 | const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(MI: &MI); |
275 | assert(TSchedModel.getWriteProcResBegin(SC) != |
276 | TSchedModel.getWriteProcResEnd(SC)); |
277 | return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle; |
278 | } |
279 | |
280 | void GCNHazardRecognizer::processBundle() { |
281 | MachineBasicBlock::instr_iterator MI = std::next(x: CurrCycleInstr->getIterator()); |
282 | MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); |
283 | // Check bundled MachineInstr's for hazards. |
284 | for (; MI != E && MI->isInsideBundle(); ++MI) { |
285 | CurrCycleInstr = &*MI; |
286 | unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); |
287 | |
288 | if (IsHazardRecognizerMode) { |
289 | fixHazards(MI: CurrCycleInstr); |
290 | |
291 | insertNoopsInBundle(MI: CurrCycleInstr, TII, Quantity: WaitStates); |
292 | } |
293 | |
294 | // It’s unnecessary to track more than MaxLookAhead instructions. Since we |
295 | // include the bundled MI directly after, only add a maximum of |
296 | // (MaxLookAhead - 1) noops to EmittedInstrs. |
297 | for (unsigned i = 0, e = std::min(a: WaitStates, b: MaxLookAhead - 1); i < e; ++i) |
298 | EmittedInstrs.push_front(x: nullptr); |
299 | |
300 | EmittedInstrs.push_front(x: CurrCycleInstr); |
301 | EmittedInstrs.resize(new_size: MaxLookAhead); |
302 | } |
303 | CurrCycleInstr = nullptr; |
304 | } |
305 | |
306 | void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) { |
307 | assert(IsHazardRecognizerMode); |
308 | |
309 | unsigned NumPreNoops = PreEmitNoops(MI); |
310 | EmitNoops(Quantity: NumPreNoops); |
311 | if (MI->isInsideBundle()) |
312 | insertNoopsInBundle(MI, TII, Quantity: NumPreNoops); |
313 | else |
314 | TII.insertNoops(MBB&: *MI->getParent(), MI: MachineBasicBlock::iterator(MI), |
315 | Quantity: NumPreNoops); |
316 | EmitInstruction(MI); |
317 | AdvanceCycle(); |
318 | } |
319 | |
320 | unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { |
321 | IsHazardRecognizerMode = true; |
322 | CurrCycleInstr = MI; |
323 | unsigned W = PreEmitNoopsCommon(MI); |
324 | fixHazards(MI); |
325 | CurrCycleInstr = nullptr; |
326 | return W; |
327 | } |
328 | |
329 | unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { |
330 | if (MI->isBundle()) |
331 | return 0; |
332 | |
333 | int WaitStates = 0; |
334 | |
335 | if (SIInstrInfo::isSMRD(MI: *MI)) |
336 | return std::max(a: WaitStates, b: checkSMRDHazards(SMRD: MI)); |
337 | |
338 | if (ST.hasNSAtoVMEMBug()) |
339 | WaitStates = std::max(a: WaitStates, b: checkNSAtoVMEMHazard(MI)); |
340 | |
341 | WaitStates = std::max(a: WaitStates, b: checkFPAtomicToDenormModeHazard(MI)); |
342 | |
343 | if (ST.hasNoDataDepHazard()) |
344 | return WaitStates; |
345 | |
346 | if (SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isFLAT(MI: *MI)) |
347 | WaitStates = std::max(a: WaitStates, b: checkVMEMHazards(VMEM: MI)); |
348 | |
349 | if (SIInstrInfo::isVALU(MI: *MI)) |
350 | WaitStates = std::max(a: WaitStates, b: checkVALUHazards(VALU: MI)); |
351 | |
352 | if (SIInstrInfo::isDPP(MI: *MI)) |
353 | WaitStates = std::max(a: WaitStates, b: checkDPPHazards(DPP: MI)); |
354 | |
355 | if (isDivFMas(Opcode: MI->getOpcode())) |
356 | WaitStates = std::max(a: WaitStates, b: checkDivFMasHazards(DivFMas: MI)); |
357 | |
358 | if (isRWLane(Opcode: MI->getOpcode())) |
359 | WaitStates = std::max(a: WaitStates, b: checkRWLaneHazards(RWLane: MI)); |
360 | |
361 | if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) || |
362 | SIInstrInfo::isFLAT(MI: *MI) || SIInstrInfo::isDS(MI: *MI) || |
363 | SIInstrInfo::isEXP(MI: *MI)) && checkMAIVALUHazards(MI) > 0) |
364 | WaitStates = std::max(a: WaitStates, b: checkMAIVALUHazards(MI)); |
365 | |
366 | if (MI->isInlineAsm()) |
367 | return std::max(a: WaitStates, b: checkInlineAsmHazards(IA: MI)); |
368 | |
369 | if (isSGetReg(Opcode: MI->getOpcode())) |
370 | return std::max(a: WaitStates, b: checkGetRegHazards(GetRegInstr: MI)); |
371 | |
372 | if (isSSetReg(Opcode: MI->getOpcode())) |
373 | return std::max(a: WaitStates, b: checkSetRegHazards(SetRegInstr: MI)); |
374 | |
375 | if (isRFE(Opcode: MI->getOpcode())) |
376 | return std::max(a: WaitStates, b: checkRFEHazards(RFE: MI)); |
377 | |
378 | if ((ST.hasReadM0MovRelInterpHazard() && |
379 | (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) || |
380 | MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || |
381 | MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || |
382 | (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) || |
383 | (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) || |
384 | (ST.hasReadM0LdsDirectHazard() && |
385 | MI->readsRegister(AMDGPU::Reg: LDS_DIRECT, /*TRI=*/nullptr))) |
386 | return std::max(a: WaitStates, b: checkReadM0Hazards(SMovRel: MI)); |
387 | |
388 | if (SIInstrInfo::isMAI(MI: *MI)) |
389 | return std::max(a: WaitStates, b: checkMAIHazards(MI)); |
390 | |
391 | if (SIInstrInfo::isVMEM(MI: *MI) || |
392 | SIInstrInfo::isFLAT(MI: *MI) || |
393 | SIInstrInfo::isDS(MI: *MI)) |
394 | return std::max(a: WaitStates, b: checkMAILdStHazards(MI)); |
395 | |
396 | return WaitStates; |
397 | } |
398 | |
399 | void GCNHazardRecognizer::EmitNoop() { |
400 | EmittedInstrs.push_front(x: nullptr); |
401 | } |
402 | |
403 | void GCNHazardRecognizer::AdvanceCycle() { |
404 | // When the scheduler detects a stall, it will call AdvanceCycle() without |
405 | // emitting any instructions. |
406 | if (!CurrCycleInstr) { |
407 | EmittedInstrs.push_front(x: nullptr); |
408 | return; |
409 | } |
410 | |
411 | if (CurrCycleInstr->isBundle()) { |
412 | processBundle(); |
413 | return; |
414 | } |
415 | |
416 | unsigned NumWaitStates = TII.getNumWaitStates(MI: *CurrCycleInstr); |
417 | if (!NumWaitStates) { |
418 | CurrCycleInstr = nullptr; |
419 | return; |
420 | } |
421 | |
422 | // Keep track of emitted instructions |
423 | EmittedInstrs.push_front(x: CurrCycleInstr); |
424 | |
425 | // Add a nullptr for each additional wait state after the first. Make sure |
426 | // not to add more than getMaxLookAhead() items to the list, since we |
427 | // truncate the list to that size right after this loop. |
428 | for (unsigned i = 1, e = std::min(a: NumWaitStates, b: getMaxLookAhead()); |
429 | i < e; ++i) { |
430 | EmittedInstrs.push_front(x: nullptr); |
431 | } |
432 | |
433 | // getMaxLookahead() is the largest number of wait states we will ever need |
434 | // to insert, so there is no point in keeping track of more than that many |
435 | // wait states. |
436 | EmittedInstrs.resize(new_size: getMaxLookAhead()); |
437 | |
438 | CurrCycleInstr = nullptr; |
439 | } |
440 | |
441 | void GCNHazardRecognizer::RecedeCycle() { |
442 | llvm_unreachable("hazard recognizer does not support bottom-up scheduling." ); |
443 | } |
444 | |
445 | //===----------------------------------------------------------------------===// |
446 | // Helper Functions |
447 | //===----------------------------------------------------------------------===// |
448 | |
449 | typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult; |
450 | |
451 | typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn; |
452 | typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn; |
453 | |
454 | // Search for a hazard in a block and its predecessors. |
455 | template <typename StateT> |
456 | static bool |
457 | hasHazard(StateT State, |
458 | function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard, |
459 | function_ref<void(StateT &, const MachineInstr &)> UpdateState, |
460 | const MachineBasicBlock *MBB, |
461 | MachineBasicBlock::const_reverse_instr_iterator I, |
462 | DenseSet<const MachineBasicBlock *> &Visited) { |
463 | for (auto E = MBB->instr_rend(); I != E; ++I) { |
464 | // No need to look at parent BUNDLE instructions. |
465 | if (I->isBundle()) |
466 | continue; |
467 | |
468 | switch (IsHazard(State, *I)) { |
469 | case HazardFound: |
470 | return true; |
471 | case HazardExpired: |
472 | return false; |
473 | default: |
474 | // Continue search |
475 | break; |
476 | } |
477 | |
478 | if (I->isInlineAsm() || I->isMetaInstruction()) |
479 | continue; |
480 | |
481 | UpdateState(State, *I); |
482 | } |
483 | |
484 | for (MachineBasicBlock *Pred : MBB->predecessors()) { |
485 | if (!Visited.insert(V: Pred).second) |
486 | continue; |
487 | |
488 | if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(), |
489 | Visited)) |
490 | return true; |
491 | } |
492 | |
493 | return false; |
494 | } |
495 | |
496 | // Returns a minimum wait states since \p I walking all predecessors. |
497 | // Only scans until \p IsExpired does not return true. |
498 | // Can only be run in a hazard recognizer mode. |
499 | static int getWaitStatesSince( |
500 | GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, |
501 | MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, |
502 | IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited, |
503 | GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) { |
504 | for (auto E = MBB->instr_rend(); I != E; ++I) { |
505 | // Don't add WaitStates for parent BUNDLE instructions. |
506 | if (I->isBundle()) |
507 | continue; |
508 | |
509 | if (IsHazard(*I)) |
510 | return WaitStates; |
511 | |
512 | if (I->isInlineAsm()) |
513 | continue; |
514 | |
515 | WaitStates += GetNumWaitStates(*I); |
516 | |
517 | if (IsExpired(*I, WaitStates)) |
518 | return std::numeric_limits<int>::max(); |
519 | } |
520 | |
521 | int MinWaitStates = std::numeric_limits<int>::max(); |
522 | for (MachineBasicBlock *Pred : MBB->predecessors()) { |
523 | if (!Visited.insert(V: Pred).second) |
524 | continue; |
525 | |
526 | int W = getWaitStatesSince(IsHazard, MBB: Pred, I: Pred->instr_rbegin(), WaitStates, |
527 | IsExpired, Visited, GetNumWaitStates); |
528 | |
529 | MinWaitStates = std::min(a: MinWaitStates, b: W); |
530 | } |
531 | |
532 | return MinWaitStates; |
533 | } |
534 | |
535 | static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, |
536 | const MachineInstr *MI, IsExpiredFn IsExpired) { |
537 | DenseSet<const MachineBasicBlock *> Visited; |
538 | return getWaitStatesSince(IsHazard, MBB: MI->getParent(), |
539 | I: std::next(x: MI->getReverseIterator()), |
540 | WaitStates: 0, IsExpired, Visited); |
541 | } |
542 | |
543 | int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { |
544 | if (IsHazardRecognizerMode) { |
545 | auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) { |
546 | return WaitStates >= Limit; |
547 | }; |
548 | return ::getWaitStatesSince(IsHazard, MI: CurrCycleInstr, IsExpired: IsExpiredFn); |
549 | } |
550 | |
551 | int WaitStates = 0; |
552 | for (MachineInstr *MI : EmittedInstrs) { |
553 | if (MI) { |
554 | if (IsHazard(*MI)) |
555 | return WaitStates; |
556 | |
557 | if (MI->isInlineAsm()) |
558 | continue; |
559 | } |
560 | ++WaitStates; |
561 | |
562 | if (WaitStates >= Limit) |
563 | break; |
564 | } |
565 | return std::numeric_limits<int>::max(); |
566 | } |
567 | |
568 | int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, |
569 | IsHazardFn IsHazardDef, |
570 | int Limit) { |
571 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
572 | |
573 | auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) { |
574 | return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI); |
575 | }; |
576 | |
577 | return getWaitStatesSince(IsHazardFn, Limit); |
578 | } |
579 | |
580 | int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, |
581 | int Limit) { |
582 | auto IsHazardFn = [IsHazard](const MachineInstr &MI) { |
583 | return isSSetReg(Opcode: MI.getOpcode()) && IsHazard(MI); |
584 | }; |
585 | |
586 | return getWaitStatesSince(IsHazard: IsHazardFn, Limit); |
587 | } |
588 | |
589 | //===----------------------------------------------------------------------===// |
590 | // No-op Hazard Detection |
591 | //===----------------------------------------------------------------------===// |
592 | |
593 | static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, |
594 | MCRegister Reg) { |
595 | for (MCRegUnit Unit : TRI.regunits(Reg)) |
596 | BV.set(Unit); |
597 | } |
598 | |
599 | static void addRegsToSet(const SIRegisterInfo &TRI, |
600 | iterator_range<MachineInstr::const_mop_iterator> Ops, |
601 | BitVector &DefSet, BitVector &UseSet) { |
602 | for (const MachineOperand &Op : Ops) { |
603 | if (Op.isReg()) |
604 | addRegUnits(TRI, BV&: Op.isDef() ? DefSet : UseSet, Reg: Op.getReg().asMCReg()); |
605 | } |
606 | } |
607 | |
608 | void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { |
609 | addRegsToSet(TRI, Ops: MI.operands(), DefSet&: ClauseDefs, UseSet&: ClauseUses); |
610 | } |
611 | |
612 | static bool breaksSMEMSoftClause(MachineInstr *MI) { |
613 | return !SIInstrInfo::isSMRD(MI: *MI); |
614 | } |
615 | |
616 | static bool breaksVMEMSoftClause(MachineInstr *MI) { |
617 | return !SIInstrInfo::isVMEM(MI: *MI) && !SIInstrInfo::isFLAT(MI: *MI); |
618 | } |
619 | |
620 | int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { |
621 | // SMEM soft clause are only present on VI+, and only matter if xnack is |
622 | // enabled. |
623 | if (!ST.isXNACKEnabled()) |
624 | return 0; |
625 | |
626 | bool IsSMRD = TII.isSMRD(MI: *MEM); |
627 | |
628 | resetClause(); |
629 | |
630 | // A soft-clause is any group of consecutive SMEM instructions. The |
631 | // instructions in this group may return out of order and/or may be |
632 | // replayed (i.e. the same instruction issued more than once). |
633 | // |
634 | // In order to handle these situations correctly we need to make sure that |
635 | // when a clause has more than one instruction, no instruction in the clause |
636 | // writes to a register that is read by another instruction in the clause |
637 | // (including itself). If we encounter this situation, we need to break the |
638 | // clause by inserting a non SMEM instruction. |
639 | |
640 | for (MachineInstr *MI : EmittedInstrs) { |
641 | // When we hit a non-SMEM instruction then we have passed the start of the |
642 | // clause and we can stop. |
643 | if (!MI) |
644 | break; |
645 | |
646 | if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI)) |
647 | break; |
648 | |
649 | addClauseInst(MI: *MI); |
650 | } |
651 | |
652 | if (ClauseDefs.none()) |
653 | return 0; |
654 | |
655 | // We need to make sure not to put loads and stores in the same clause if they |
656 | // use the same address. For now, just start a new clause whenever we see a |
657 | // store. |
658 | if (MEM->mayStore()) |
659 | return 1; |
660 | |
661 | addClauseInst(MI: *MEM); |
662 | |
663 | // If the set of defs and uses intersect then we cannot add this instruction |
664 | // to the clause, so we have a hazard. |
665 | return ClauseDefs.anyCommon(RHS: ClauseUses) ? 1 : 0; |
666 | } |
667 | |
668 | int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { |
669 | int WaitStatesNeeded = 0; |
670 | |
671 | WaitStatesNeeded = checkSoftClauseHazards(MEM: SMRD); |
672 | |
673 | // This SMRD hazard only affects SI. |
674 | if (!ST.hasSMRDReadVALUDefHazard()) |
675 | return WaitStatesNeeded; |
676 | |
677 | // A read of an SGPR by SMRD instruction requires 4 wait states when the |
678 | // SGPR was written by a VALU instruction. |
679 | int SmrdSgprWaitStates = 4; |
680 | auto IsHazardDefFn = [this](const MachineInstr &MI) { |
681 | return TII.isVALU(MI); |
682 | }; |
683 | auto IsBufferHazardDefFn = [this](const MachineInstr &MI) { |
684 | return TII.isSALU(MI); |
685 | }; |
686 | |
687 | bool IsBufferSMRD = TII.isBufferSMRD(MI: *SMRD); |
688 | |
689 | for (const MachineOperand &Use : SMRD->uses()) { |
690 | if (!Use.isReg()) |
691 | continue; |
692 | int WaitStatesNeededForUse = |
693 | SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn, |
694 | Limit: SmrdSgprWaitStates); |
695 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
696 | |
697 | // This fixes what appears to be undocumented hardware behavior in SI where |
698 | // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor |
699 | // needs some number of nops in between. We don't know how many we need, but |
700 | // let's use 4. This wasn't discovered before probably because the only |
701 | // case when this happens is when we expand a 64-bit pointer into a full |
702 | // descriptor and use s_buffer_load_dword instead of s_load_dword, which was |
703 | // probably never encountered in the closed-source land. |
704 | if (IsBufferSMRD) { |
705 | int WaitStatesNeededForUse = |
706 | SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), |
707 | IsHazardDef: IsBufferHazardDefFn, |
708 | Limit: SmrdSgprWaitStates); |
709 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
710 | } |
711 | } |
712 | |
713 | return WaitStatesNeeded; |
714 | } |
715 | |
716 | int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { |
717 | if (!ST.hasVMEMReadSGPRVALUDefHazard()) |
718 | return 0; |
719 | |
720 | int WaitStatesNeeded = checkSoftClauseHazards(MEM: VMEM); |
721 | |
722 | // A read of an SGPR by a VMEM instruction requires 5 wait states when the |
723 | // SGPR was written by a VALU Instruction. |
724 | const int VmemSgprWaitStates = 5; |
725 | auto IsHazardDefFn = [this](const MachineInstr &MI) { |
726 | return TII.isVALU(MI); |
727 | }; |
728 | for (const MachineOperand &Use : VMEM->uses()) { |
729 | if (!Use.isReg() || TRI.isVectorRegister(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
730 | continue; |
731 | |
732 | int WaitStatesNeededForUse = |
733 | VmemSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn, |
734 | Limit: VmemSgprWaitStates); |
735 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
736 | } |
737 | return WaitStatesNeeded; |
738 | } |
739 | |
740 | int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { |
741 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
742 | const SIInstrInfo *TII = ST.getInstrInfo(); |
743 | |
744 | // Check for DPP VGPR read after VALU VGPR write and EXEC write. |
745 | int DppVgprWaitStates = 2; |
746 | int DppExecWaitStates = 5; |
747 | int WaitStatesNeeded = 0; |
748 | auto IsHazardDefFn = [TII](const MachineInstr &MI) { |
749 | return TII->isVALU(MI); |
750 | }; |
751 | |
752 | for (const MachineOperand &Use : DPP->uses()) { |
753 | if (!Use.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
754 | continue; |
755 | int WaitStatesNeededForUse = |
756 | DppVgprWaitStates - getWaitStatesSinceDef( |
757 | Reg: Use.getReg(), |
758 | IsHazardDef: [](const MachineInstr &) { return true; }, |
759 | Limit: DppVgprWaitStates); |
760 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
761 | } |
762 | |
763 | WaitStatesNeeded = std::max( |
764 | WaitStatesNeeded, |
765 | DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, |
766 | DppExecWaitStates)); |
767 | |
768 | return WaitStatesNeeded; |
769 | } |
770 | |
771 | int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { |
772 | const SIInstrInfo *TII = ST.getInstrInfo(); |
773 | |
774 | // v_div_fmas requires 4 wait states after a write to vcc from a VALU |
775 | // instruction. |
776 | const int DivFMasWaitStates = 4; |
777 | auto IsHazardDefFn = [TII](const MachineInstr &MI) { |
778 | return TII->isVALU(MI); |
779 | }; |
780 | int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn, |
781 | DivFMasWaitStates); |
782 | |
783 | return DivFMasWaitStates - WaitStatesNeeded; |
784 | } |
785 | |
786 | int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) { |
787 | const SIInstrInfo *TII = ST.getInstrInfo(); |
788 | unsigned GetRegHWReg = getHWReg(TII, RegInstr: *GetRegInstr); |
789 | |
790 | const int GetRegWaitStates = 2; |
791 | auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) { |
792 | return GetRegHWReg == getHWReg(TII, RegInstr: MI); |
793 | }; |
794 | int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: GetRegWaitStates); |
795 | |
796 | return GetRegWaitStates - WaitStatesNeeded; |
797 | } |
798 | |
799 | int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) { |
800 | const SIInstrInfo *TII = ST.getInstrInfo(); |
801 | unsigned HWReg = getHWReg(TII, RegInstr: *SetRegInstr); |
802 | |
803 | const int SetRegWaitStates = ST.getSetRegWaitStates(); |
804 | auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) { |
805 | return HWReg == getHWReg(TII, RegInstr: MI); |
806 | }; |
807 | int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: SetRegWaitStates); |
808 | return SetRegWaitStates - WaitStatesNeeded; |
809 | } |
810 | |
811 | int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { |
812 | if (!MI.mayStore()) |
813 | return -1; |
814 | |
815 | const SIInstrInfo *TII = ST.getInstrInfo(); |
816 | unsigned Opcode = MI.getOpcode(); |
817 | const MCInstrDesc &Desc = MI.getDesc(); |
818 | |
819 | int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); |
820 | int VDataRCID = -1; |
821 | if (VDataIdx != -1) |
822 | VDataRCID = Desc.operands()[VDataIdx].RegClass; |
823 | |
824 | if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) { |
825 | // There is no hazard if the instruction does not use vector regs |
826 | // (like wbinvl1) |
827 | if (VDataIdx == -1) |
828 | return -1; |
829 | // For MUBUF/MTBUF instructions this hazard only exists if the |
830 | // instruction is not using a register in the soffset field. |
831 | const MachineOperand *SOffset = |
832 | TII->getNamedOperand(MI, AMDGPU::OpName::soffset); |
833 | // If we have no soffset operand, then assume this field has been |
834 | // hardcoded to zero. |
835 | if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64 && |
836 | (!SOffset || !SOffset->isReg())) |
837 | return VDataIdx; |
838 | } |
839 | |
840 | // MIMG instructions create a hazard if they don't use a 256-bit T# and |
841 | // the store size is greater than 8 bytes and they have more than two bits |
842 | // of their dmask set. |
843 | // All our MIMG definitions use a 256-bit T#, so we can skip checking for them. |
844 | if (TII->isMIMG(MI)) { |
845 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); |
846 | assert(SRsrcIdx != -1 && |
847 | AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256); |
848 | (void)SRsrcIdx; |
849 | } |
850 | |
851 | if (TII->isFLAT(MI)) { |
852 | int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); |
853 | if (AMDGPU::getRegBitWidth(RCID: Desc.operands()[DataIdx].RegClass) > 64) |
854 | return DataIdx; |
855 | } |
856 | |
857 | return -1; |
858 | } |
859 | |
860 | int |
861 | GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, |
862 | const MachineRegisterInfo &MRI) { |
863 | // Helper to check for the hazard where VMEM instructions that store more than |
864 | // 8 bytes can have there store data over written by the next instruction. |
865 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
866 | |
867 | const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1; |
868 | int WaitStatesNeeded = 0; |
869 | |
870 | if (!TRI->isVectorRegister(MRI, Reg: Def.getReg())) |
871 | return WaitStatesNeeded; |
872 | Register Reg = Def.getReg(); |
873 | auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) { |
874 | int DataIdx = createsVALUHazard(MI); |
875 | return DataIdx >= 0 && |
876 | TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg); |
877 | }; |
878 | int WaitStatesNeededForDef = |
879 | VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates); |
880 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
881 | |
882 | return WaitStatesNeeded; |
883 | } |
884 | |
885 | int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { |
886 | int WaitStatesNeeded = 0; |
887 | |
888 | if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(MI: *VALU)) { |
889 | const int TransDefWaitstates = 1; |
890 | |
891 | auto IsTransDefFn = [this, VALU](const MachineInstr &MI) { |
892 | if (!SIInstrInfo::isTRANS(MI)) |
893 | return false; |
894 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
895 | const SIInstrInfo *TII = ST.getInstrInfo(); |
896 | Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg(); |
897 | |
898 | for (const MachineOperand &Use : VALU->explicit_uses()) { |
899 | if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) |
900 | return true; |
901 | } |
902 | |
903 | return false; |
904 | }; |
905 | |
906 | int WaitStatesNeededForDef = |
907 | TransDefWaitstates - |
908 | getWaitStatesSince(IsHazard: IsTransDefFn, Limit: TransDefWaitstates); |
909 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
910 | } |
911 | |
912 | if (ST.hasDstSelForwardingHazard()) { |
913 | const int Shift16DefWaitstates = 1; |
914 | |
915 | auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) { |
916 | if (!SIInstrInfo::isVALU(MI)) |
917 | return false; |
918 | const SIInstrInfo *TII = ST.getInstrInfo(); |
919 | if (SIInstrInfo::isSDWA(MI)) { |
920 | if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) |
921 | if (DstSel->getImm() == AMDGPU::SDWA::DWORD) |
922 | return false; |
923 | } else { |
924 | if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) || |
925 | !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers) |
926 | ->getImm() & |
927 | SISrcMods::DST_OP_SEL)) |
928 | return false; |
929 | } |
930 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
931 | if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { |
932 | Register Def = Dst->getReg(); |
933 | |
934 | for (const MachineOperand &Use : VALU->explicit_uses()) { |
935 | if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) |
936 | return true; |
937 | } |
938 | } |
939 | |
940 | return false; |
941 | }; |
942 | |
943 | int WaitStatesNeededForDef = |
944 | Shift16DefWaitstates - |
945 | getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates); |
946 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
947 | } |
948 | |
949 | if (ST.hasVDecCoExecHazard()) { |
950 | const int VALUWriteSGPRVALUReadWaitstates = 2; |
951 | const int VALUWriteEXECRWLane = 4; |
952 | const int VALUWriteVGPRReadlaneRead = 1; |
953 | |
954 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
955 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
956 | Register UseReg; |
957 | auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) { |
958 | if (!SIInstrInfo::isVALU(MI)) |
959 | return false; |
960 | return MI.modifiesRegister(UseReg, TRI); |
961 | }; |
962 | |
963 | for (const MachineOperand &Use : VALU->explicit_uses()) { |
964 | if (!Use.isReg()) |
965 | continue; |
966 | |
967 | UseReg = Use.getReg(); |
968 | if (TRI->isSGPRReg(MRI, Reg: UseReg)) { |
969 | int WaitStatesNeededForDef = |
970 | VALUWriteSGPRVALUReadWaitstates - |
971 | getWaitStatesSince(IsVALUDefSGPRFn, |
972 | VALUWriteSGPRVALUReadWaitstates); |
973 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
974 | } |
975 | } |
976 | |
977 | if (VALU->readsRegister(AMDGPU::VCC, TRI)) { |
978 | UseReg = AMDGPU::VCC; |
979 | int WaitStatesNeededForDef = |
980 | VALUWriteSGPRVALUReadWaitstates - |
981 | getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates); |
982 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
983 | } |
984 | |
985 | switch (VALU->getOpcode()) { |
986 | case AMDGPU::V_READLANE_B32: |
987 | case AMDGPU::V_READFIRSTLANE_B32: { |
988 | MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0); |
989 | UseReg = Src->getReg(); |
990 | int WaitStatesNeededForDef = |
991 | VALUWriteVGPRReadlaneRead - |
992 | getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead); |
993 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
994 | } |
995 | [[fallthrough]]; |
996 | case AMDGPU::V_WRITELANE_B32: { |
997 | UseReg = AMDGPU::EXEC; |
998 | int WaitStatesNeededForDef = |
999 | VALUWriteEXECRWLane - |
1000 | getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane); |
1001 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
1002 | break; |
1003 | } |
1004 | default: |
1005 | break; |
1006 | } |
1007 | } |
1008 | |
1009 | // This checks for the hazard where VMEM instructions that store more than |
1010 | // 8 bytes can have there store data over written by the next instruction. |
1011 | if (!ST.has12DWordStoreHazard()) |
1012 | return WaitStatesNeeded; |
1013 | |
1014 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1015 | |
1016 | for (const MachineOperand &Def : VALU->defs()) { |
1017 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def, MRI)); |
1018 | } |
1019 | |
1020 | return WaitStatesNeeded; |
1021 | } |
1022 | |
1023 | int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { |
1024 | // This checks for hazards associated with inline asm statements. |
1025 | // Since inline asms can contain just about anything, we use this |
1026 | // to call/leverage other check*Hazard routines. Note that |
1027 | // this function doesn't attempt to address all possible inline asm |
1028 | // hazards (good luck), but is a collection of what has been |
1029 | // problematic thus far. |
1030 | |
1031 | // see checkVALUHazards() |
1032 | if (!ST.has12DWordStoreHazard()) |
1033 | return 0; |
1034 | |
1035 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1036 | int WaitStatesNeeded = 0; |
1037 | |
1038 | for (const MachineOperand &Op : |
1039 | llvm::drop_begin(RangeOrContainer: IA->operands(), N: InlineAsm::MIOp_FirstOperand)) { |
1040 | if (Op.isReg() && Op.isDef()) { |
1041 | WaitStatesNeeded = |
1042 | std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def: Op, MRI)); |
1043 | } |
1044 | } |
1045 | |
1046 | return WaitStatesNeeded; |
1047 | } |
1048 | |
1049 | int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { |
1050 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1051 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1052 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1053 | |
1054 | const MachineOperand *LaneSelectOp = |
1055 | TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1); |
1056 | |
1057 | if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, Reg: LaneSelectOp->getReg())) |
1058 | return 0; |
1059 | |
1060 | Register LaneSelectReg = LaneSelectOp->getReg(); |
1061 | auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); }; |
1062 | |
1063 | const int RWLaneWaitStates = 4; |
1064 | int WaitStatesSince = getWaitStatesSinceDef(Reg: LaneSelectReg, IsHazardDef: IsHazardFn, |
1065 | Limit: RWLaneWaitStates); |
1066 | return RWLaneWaitStates - WaitStatesSince; |
1067 | } |
1068 | |
1069 | int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { |
1070 | if (!ST.hasRFEHazards()) |
1071 | return 0; |
1072 | |
1073 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1074 | |
1075 | const int RFEWaitStates = 1; |
1076 | |
1077 | auto IsHazardFn = [TII](const MachineInstr &MI) { |
1078 | return getHWReg(TII, RegInstr: MI) == AMDGPU::Hwreg::ID_TRAPSTS; |
1079 | }; |
1080 | int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: RFEWaitStates); |
1081 | return RFEWaitStates - WaitStatesNeeded; |
1082 | } |
1083 | |
1084 | int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { |
1085 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1086 | const int ReadM0WaitStates = 1; |
1087 | auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); }; |
1088 | return ReadM0WaitStates - |
1089 | getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); |
1090 | } |
1091 | |
1092 | void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { |
1093 | fixVMEMtoScalarWriteHazards(MI); |
1094 | fixVcmpxPermlaneHazards(MI); |
1095 | fixSMEMtoVectorWriteHazards(MI); |
1096 | fixVcmpxExecWARHazard(MI); |
1097 | fixLdsBranchVmemWARHazard(MI); |
1098 | if (ST.hasLdsDirect()) { |
1099 | fixLdsDirectVALUHazard(MI); |
1100 | fixLdsDirectVMEMHazard(MI); |
1101 | } |
1102 | fixVALUPartialForwardingHazard(MI); |
1103 | fixVALUTransUseHazard(MI); |
1104 | fixWMMAHazards(MI); |
1105 | fixShift64HighRegBug(MI); |
1106 | fixVALUMaskWriteHazard(MI); |
1107 | } |
1108 | |
1109 | bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { |
1110 | if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(MI: *MI)) |
1111 | return false; |
1112 | |
1113 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1114 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1115 | auto IsHazardFn = [TII, TRI](const MachineInstr &MI) { |
1116 | return (TII->isVOPC(MI) || |
1117 | ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) && |
1118 | MI.modifiesRegister(AMDGPU::EXEC, TRI); |
1119 | }; |
1120 | |
1121 | auto IsExpiredFn = [](const MachineInstr &MI, int) { |
1122 | unsigned Opc = MI.getOpcode(); |
1123 | return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 && |
1124 | Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa; |
1125 | }; |
1126 | |
1127 | if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == |
1128 | std::numeric_limits<int>::max()) |
1129 | return false; |
1130 | |
1131 | // V_NOP will be discarded by SQ. |
1132 | // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* |
1133 | // which is always a VGPR and available. |
1134 | auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); |
1135 | Register Reg = Src0->getReg(); |
1136 | bool IsUndef = Src0->isUndef(); |
1137 | BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), |
1138 | TII->get(AMDGPU::V_MOV_B32_e32)) |
1139 | .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) |
1140 | .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill); |
1141 | |
1142 | return true; |
1143 | } |
1144 | |
1145 | bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { |
1146 | if (!ST.hasVMEMtoScalarWriteHazard()) |
1147 | return false; |
1148 | assert(!ST.hasExtendedWaitCounts()); |
1149 | |
1150 | if (!SIInstrInfo::isSALU(MI: *MI) && !SIInstrInfo::isSMRD(MI: *MI)) |
1151 | return false; |
1152 | |
1153 | if (MI->getNumDefs() == 0) |
1154 | return false; |
1155 | |
1156 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1157 | |
1158 | auto IsHazardFn = [TRI, MI](const MachineInstr &I) { |
1159 | if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I) && |
1160 | !SIInstrInfo::isFLAT(MI: I)) |
1161 | return false; |
1162 | |
1163 | for (const MachineOperand &Def : MI->defs()) { |
1164 | const MachineOperand *Op = |
1165 | I.findRegisterUseOperand(Def.getReg(), TRI, false); |
1166 | if (!Op) |
1167 | continue; |
1168 | return true; |
1169 | } |
1170 | return false; |
1171 | }; |
1172 | |
1173 | auto IsExpiredFn = [](const MachineInstr &MI, int) { |
1174 | return SIInstrInfo::isVALU(MI) || |
1175 | (MI.getOpcode() == AMDGPU::S_WAITCNT && |
1176 | !MI.getOperand(0).getImm()) || |
1177 | (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1178 | AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0); |
1179 | }; |
1180 | |
1181 | if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == |
1182 | std::numeric_limits<int>::max()) |
1183 | return false; |
1184 | |
1185 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1186 | BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), |
1187 | TII->get(AMDGPU::S_WAITCNT_DEPCTR)) |
1188 | .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); |
1189 | return true; |
1190 | } |
1191 | |
1192 | bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { |
1193 | if (!ST.hasSMEMtoVectorWriteHazard()) |
1194 | return false; |
1195 | assert(!ST.hasExtendedWaitCounts()); |
1196 | |
1197 | if (!SIInstrInfo::isVALU(MI: *MI)) |
1198 | return false; |
1199 | |
1200 | unsigned SDSTName; |
1201 | switch (MI->getOpcode()) { |
1202 | case AMDGPU::V_READLANE_B32: |
1203 | case AMDGPU::V_READFIRSTLANE_B32: |
1204 | SDSTName = AMDGPU::OpName::vdst; |
1205 | break; |
1206 | default: |
1207 | SDSTName = AMDGPU::OpName::sdst; |
1208 | break; |
1209 | } |
1210 | |
1211 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1212 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1213 | const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU()); |
1214 | const MachineOperand *SDST = TII->getNamedOperand(MI&: *MI, OperandName: SDSTName); |
1215 | if (!SDST) { |
1216 | for (const auto &MO : MI->implicit_operands()) { |
1217 | if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(MO.getReg()))) { |
1218 | SDST = &MO; |
1219 | break; |
1220 | } |
1221 | } |
1222 | } |
1223 | |
1224 | if (!SDST) |
1225 | return false; |
1226 | |
1227 | const Register SDSTReg = SDST->getReg(); |
1228 | auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) { |
1229 | return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI); |
1230 | }; |
1231 | |
1232 | auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) { |
1233 | if (TII->isSALU(MI)) { |
1234 | switch (MI.getOpcode()) { |
1235 | case AMDGPU::S_SETVSKIP: |
1236 | case AMDGPU::S_VERSION: |
1237 | case AMDGPU::S_WAITCNT_VSCNT: |
1238 | case AMDGPU::S_WAITCNT_VMCNT: |
1239 | case AMDGPU::S_WAITCNT_EXPCNT: |
1240 | // These instructions cannot not mitigate the hazard. |
1241 | return false; |
1242 | case AMDGPU::S_WAITCNT_LGKMCNT: |
1243 | // Reducing lgkmcnt count to 0 always mitigates the hazard. |
1244 | return (MI.getOperand(1).getImm() == 0) && |
1245 | (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL); |
1246 | case AMDGPU::S_WAITCNT: { |
1247 | const int64_t Imm = MI.getOperand(i: 0).getImm(); |
1248 | AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(Version: IV, Encoded: Imm); |
1249 | // DsCnt corresponds to LGKMCnt here. |
1250 | return (Decoded.DsCnt == 0); |
1251 | } |
1252 | default: |
1253 | // SOPP instructions cannot mitigate the hazard. |
1254 | if (TII->isSOPP(MI)) |
1255 | return false; |
1256 | // At this point the SALU can be assumed to mitigate the hazard |
1257 | // because either: |
1258 | // (a) it is independent of the at risk SMEM (breaking chain), |
1259 | // or |
1260 | // (b) it is dependent on the SMEM, in which case an appropriate |
1261 | // s_waitcnt lgkmcnt _must_ exist between it and the at risk |
1262 | // SMEM instruction. |
1263 | return true; |
1264 | } |
1265 | } |
1266 | return false; |
1267 | }; |
1268 | |
1269 | if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == |
1270 | std::numeric_limits<int>::max()) |
1271 | return false; |
1272 | |
1273 | BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), |
1274 | TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL) |
1275 | .addImm(0); |
1276 | return true; |
1277 | } |
1278 | |
1279 | bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { |
1280 | if (!ST.hasVcmpxExecWARHazard()) |
1281 | return false; |
1282 | assert(!ST.hasExtendedWaitCounts()); |
1283 | |
1284 | if (!SIInstrInfo::isVALU(MI: *MI)) |
1285 | return false; |
1286 | |
1287 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1288 | if (!MI->modifiesRegister(AMDGPU::EXEC, TRI)) |
1289 | return false; |
1290 | |
1291 | auto IsHazardFn = [TRI](const MachineInstr &I) { |
1292 | if (SIInstrInfo::isVALU(MI: I)) |
1293 | return false; |
1294 | return I.readsRegister(AMDGPU::EXEC, TRI); |
1295 | }; |
1296 | |
1297 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1298 | auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) { |
1299 | if (SIInstrInfo::isVALU(MI)) { |
1300 | if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) |
1301 | return true; |
1302 | for (auto MO : MI.implicit_operands()) |
1303 | if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(MO.getReg()))) |
1304 | return true; |
1305 | } |
1306 | if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1307 | AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0) |
1308 | return true; |
1309 | return false; |
1310 | }; |
1311 | |
1312 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1313 | std::numeric_limits<int>::max()) |
1314 | return false; |
1315 | |
1316 | BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), |
1317 | TII->get(AMDGPU::S_WAITCNT_DEPCTR)) |
1318 | .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); |
1319 | return true; |
1320 | } |
1321 | |
1322 | static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, |
1323 | const GCNSubtarget &ST) { |
1324 | if (!ST.hasLdsBranchVmemWARHazard()) |
1325 | return false; |
1326 | |
1327 | // Check if the necessary condition for the hazard is met: both LDS and VMEM |
1328 | // instructions need to appear in the same function. |
1329 | bool HasLds = false; |
1330 | bool HasVmem = false; |
1331 | for (auto &MBB : MF) { |
1332 | for (auto &MI : MBB) { |
1333 | HasLds |= SIInstrInfo::isDS(MI); |
1334 | HasVmem |= |
1335 | SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI); |
1336 | if (HasLds && HasVmem) |
1337 | return true; |
1338 | } |
1339 | } |
1340 | return false; |
1341 | } |
1342 | |
1343 | static bool isStoreCountWaitZero(const MachineInstr &I) { |
1344 | return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && |
1345 | I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && |
1346 | !I.getOperand(1).getImm(); |
1347 | } |
1348 | |
1349 | bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { |
1350 | if (!RunLdsBranchVmemWARHazardFixup) |
1351 | return false; |
1352 | |
1353 | assert(ST.hasLdsBranchVmemWARHazard()); |
1354 | assert(!ST.hasExtendedWaitCounts()); |
1355 | |
1356 | auto IsHazardInst = [](const MachineInstr &MI) { |
1357 | if (SIInstrInfo::isDS(MI)) |
1358 | return 1; |
1359 | if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) |
1360 | return 2; |
1361 | return 0; |
1362 | }; |
1363 | |
1364 | auto InstType = IsHazardInst(*MI); |
1365 | if (!InstType) |
1366 | return false; |
1367 | |
1368 | auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { |
1369 | return IsHazardInst(I) || isStoreCountWaitZero(I); |
1370 | }; |
1371 | |
1372 | auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { |
1373 | if (!I.isBranch()) |
1374 | return false; |
1375 | |
1376 | auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) { |
1377 | auto InstType2 = IsHazardInst(I); |
1378 | return InstType2 && InstType != InstType2; |
1379 | }; |
1380 | |
1381 | auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) { |
1382 | auto InstType2 = IsHazardInst(I); |
1383 | if (InstType == InstType2) |
1384 | return true; |
1385 | |
1386 | return isStoreCountWaitZero(I); |
1387 | }; |
1388 | |
1389 | return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) != |
1390 | std::numeric_limits<int>::max(); |
1391 | }; |
1392 | |
1393 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1394 | std::numeric_limits<int>::max()) |
1395 | return false; |
1396 | |
1397 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1398 | BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), |
1399 | TII->get(AMDGPU::S_WAITCNT_VSCNT)) |
1400 | .addReg(AMDGPU::SGPR_NULL, RegState::Undef) |
1401 | .addImm(0); |
1402 | |
1403 | return true; |
1404 | } |
1405 | |
1406 | bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) { |
1407 | if (!SIInstrInfo::isLDSDIR(MI: *MI)) |
1408 | return false; |
1409 | |
1410 | const int NoHazardWaitStates = 15; |
1411 | const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); |
1412 | const Register VDSTReg = VDST->getReg(); |
1413 | |
1414 | bool VisitedTrans = false; |
1415 | auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) { |
1416 | if (!SIInstrInfo::isVALU(MI: I)) |
1417 | return false; |
1418 | VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(MI: I); |
1419 | // Cover both WAR and WAW |
1420 | return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); |
1421 | }; |
1422 | auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) { |
1423 | if (WaitStates >= NoHazardWaitStates) |
1424 | return true; |
1425 | // Instructions which cause va_vdst==0 expire hazard |
1426 | return SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isFLAT(MI: I) || |
1427 | SIInstrInfo::isDS(MI: I) || SIInstrInfo::isEXP(MI: I); |
1428 | }; |
1429 | auto GetWaitStatesFn = [](const MachineInstr &MI) { |
1430 | return SIInstrInfo::isVALU(MI) ? 1 : 0; |
1431 | }; |
1432 | |
1433 | DenseSet<const MachineBasicBlock *> Visited; |
1434 | auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(), |
1435 | std::next(x: MI->getReverseIterator()), 0, |
1436 | IsExpiredFn, Visited, GetWaitStatesFn); |
1437 | |
1438 | // Transcendentals can execute in parallel to other VALUs. |
1439 | // This makes va_vdst count unusable with a mixture of VALU and TRANS. |
1440 | if (VisitedTrans) |
1441 | Count = 0; |
1442 | |
1443 | MachineOperand *WaitVdstOp = |
1444 | TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst); |
1445 | WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates)); |
1446 | |
1447 | return true; |
1448 | } |
1449 | |
1450 | bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { |
1451 | if (!SIInstrInfo::isLDSDIR(MI: *MI)) |
1452 | return false; |
1453 | |
1454 | const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); |
1455 | const Register VDSTReg = VDST->getReg(); |
1456 | |
1457 | auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) { |
1458 | if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isFLAT(MI: I) && |
1459 | !SIInstrInfo::isDS(MI: I)) |
1460 | return false; |
1461 | return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); |
1462 | }; |
1463 | bool LdsdirCanWait = ST.hasLdsWaitVMSRC(); |
1464 | // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT |
1465 | // according to the type of VMEM instruction. |
1466 | auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) { |
1467 | return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || |
1468 | (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || |
1469 | (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1470 | AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) || |
1471 | (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) && |
1472 | !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm()); |
1473 | }; |
1474 | |
1475 | if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == |
1476 | std::numeric_limits<int>::max()) |
1477 | return false; |
1478 | |
1479 | if (LdsdirCanWait) { |
1480 | TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0); |
1481 | } else { |
1482 | BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), |
1483 | TII.get(AMDGPU::S_WAITCNT_DEPCTR)) |
1484 | .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); |
1485 | } |
1486 | |
1487 | return true; |
1488 | } |
1489 | |
1490 | bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { |
1491 | if (!ST.hasVALUPartialForwardingHazard()) |
1492 | return false; |
1493 | assert(!ST.hasExtendedWaitCounts()); |
1494 | |
1495 | if (!ST.isWave64() || !SIInstrInfo::isVALU(MI: *MI)) |
1496 | return false; |
1497 | |
1498 | SmallSetVector<Register, 4> SrcVGPRs; |
1499 | |
1500 | for (const MachineOperand &Use : MI->explicit_uses()) { |
1501 | if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
1502 | SrcVGPRs.insert(X: Use.getReg()); |
1503 | } |
1504 | |
1505 | // Only applies with >= 2 unique VGPR sources |
1506 | if (SrcVGPRs.size() <= 1) |
1507 | return false; |
1508 | |
1509 | // Look for the following pattern: |
1510 | // Va <- VALU [PreExecPos] |
1511 | // intv1 |
1512 | // Exec <- SALU [ExecPos] |
1513 | // intv2 |
1514 | // Vb <- VALU [PostExecPos] |
1515 | // intv3 |
1516 | // MI Va, Vb (WaitState = 0) |
1517 | // |
1518 | // Where: |
1519 | // intv1 + intv2 <= 2 VALUs |
1520 | // intv3 <= 4 VALUs |
1521 | // |
1522 | // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. |
1523 | |
1524 | const int Intv1plus2MaxVALUs = 2; |
1525 | const int Intv3MaxVALUs = 4; |
1526 | const int IntvMaxVALUs = 6; |
1527 | const int NoHazardVALUWaitStates = IntvMaxVALUs + 2; |
1528 | |
1529 | struct StateType { |
1530 | SmallDenseMap<Register, int, 4> DefPos; |
1531 | int ExecPos = std::numeric_limits<int>::max(); |
1532 | int VALUs = 0; |
1533 | }; |
1534 | |
1535 | StateType State; |
1536 | |
1537 | // This overloads expiry testing with all the hazard detection |
1538 | auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { |
1539 | // Too many VALU states have passed |
1540 | if (State.VALUs > NoHazardVALUWaitStates) |
1541 | return HazardExpired; |
1542 | |
1543 | // Instructions which cause va_vdst==0 expire hazard |
1544 | if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || |
1545 | SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || |
1546 | (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1547 | AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0)) |
1548 | return HazardExpired; |
1549 | |
1550 | // Track registers writes |
1551 | bool Changed = false; |
1552 | if (SIInstrInfo::isVALU(MI: I)) { |
1553 | for (Register Src : SrcVGPRs) { |
1554 | if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Src, &TRI)) { |
1555 | State.DefPos[Src] = State.VALUs; |
1556 | Changed = true; |
1557 | } |
1558 | } |
1559 | } else if (SIInstrInfo::isSALU(MI: I)) { |
1560 | if (State.ExecPos == std::numeric_limits<int>::max()) { |
1561 | if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) { |
1562 | State.ExecPos = State.VALUs; |
1563 | Changed = true; |
1564 | } |
1565 | } |
1566 | } |
1567 | |
1568 | // Early expiration: too many VALUs in intv3 |
1569 | if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty()) |
1570 | return HazardExpired; |
1571 | |
1572 | // Only evaluate state if something changed |
1573 | if (!Changed) |
1574 | return NoHazardFound; |
1575 | |
1576 | // Determine positions of VALUs pre/post exec change |
1577 | if (State.ExecPos == std::numeric_limits<int>::max()) |
1578 | return NoHazardFound; |
1579 | |
1580 | int PreExecPos = std::numeric_limits<int>::max(); |
1581 | int PostExecPos = std::numeric_limits<int>::max(); |
1582 | |
1583 | for (auto Entry : State.DefPos) { |
1584 | int DefVALUs = Entry.second; |
1585 | if (DefVALUs != std::numeric_limits<int>::max()) { |
1586 | if (DefVALUs >= State.ExecPos) |
1587 | PreExecPos = std::min(a: PreExecPos, b: DefVALUs); |
1588 | else |
1589 | PostExecPos = std::min(a: PostExecPos, b: DefVALUs); |
1590 | } |
1591 | } |
1592 | |
1593 | // Need a VALUs post exec change |
1594 | if (PostExecPos == std::numeric_limits<int>::max()) |
1595 | return NoHazardFound; |
1596 | |
1597 | // Too many VALUs in intv3? |
1598 | int Intv3VALUs = PostExecPos; |
1599 | if (Intv3VALUs > Intv3MaxVALUs) |
1600 | return HazardExpired; |
1601 | |
1602 | // Too many VALUs in intv2? |
1603 | int Intv2VALUs = (State.ExecPos - PostExecPos) - 1; |
1604 | if (Intv2VALUs > Intv1plus2MaxVALUs) |
1605 | return HazardExpired; |
1606 | |
1607 | // Need a VALUs pre exec change |
1608 | if (PreExecPos == std::numeric_limits<int>::max()) |
1609 | return NoHazardFound; |
1610 | |
1611 | // Too many VALUs in intv1? |
1612 | int Intv1VALUs = PreExecPos - State.ExecPos; |
1613 | if (Intv1VALUs > Intv1plus2MaxVALUs) |
1614 | return HazardExpired; |
1615 | |
1616 | // Too many VALUs in intv1 + intv2 |
1617 | if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs) |
1618 | return HazardExpired; |
1619 | |
1620 | return HazardFound; |
1621 | }; |
1622 | auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { |
1623 | if (SIInstrInfo::isVALU(MI)) |
1624 | State.VALUs += 1; |
1625 | }; |
1626 | |
1627 | DenseSet<const MachineBasicBlock *> Visited; |
1628 | if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(), |
1629 | I: std::next(x: MI->getReverseIterator()), Visited)) |
1630 | return false; |
1631 | |
1632 | BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), |
1633 | TII.get(AMDGPU::S_WAITCNT_DEPCTR)) |
1634 | .addImm(0x0fff); |
1635 | |
1636 | return true; |
1637 | } |
1638 | |
1639 | bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { |
1640 | if (!ST.hasVALUTransUseHazard()) |
1641 | return false; |
1642 | assert(!ST.hasExtendedWaitCounts()); |
1643 | |
1644 | if (!SIInstrInfo::isVALU(MI: *MI)) |
1645 | return false; |
1646 | |
1647 | SmallSet<Register, 4> SrcVGPRs; |
1648 | |
1649 | for (const MachineOperand &Use : MI->explicit_uses()) { |
1650 | if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
1651 | SrcVGPRs.insert(V: Use.getReg()); |
1652 | } |
1653 | |
1654 | // Look for the following pattern: |
1655 | // Va <- TRANS VALU |
1656 | // intv |
1657 | // MI Va (WaitState = 0) |
1658 | // |
1659 | // Where: |
1660 | // intv <= 5 VALUs / 1 TRANS |
1661 | // |
1662 | // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. |
1663 | |
1664 | const int IntvMaxVALUs = 5; |
1665 | const int IntvMaxTRANS = 1; |
1666 | |
1667 | struct StateType { |
1668 | int VALUs = 0; |
1669 | int TRANS = 0; |
1670 | }; |
1671 | |
1672 | StateType State; |
1673 | |
1674 | // This overloads expiry testing with all the hazard detection |
1675 | auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { |
1676 | // Too many VALU states have passed |
1677 | if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS) |
1678 | return HazardExpired; |
1679 | |
1680 | // Instructions which cause va_vdst==0 expire hazard |
1681 | if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || |
1682 | SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || |
1683 | (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1684 | I.getOperand(0).getImm() == 0x0fff)) |
1685 | return HazardExpired; |
1686 | |
1687 | // Track registers writes |
1688 | if (SIInstrInfo::isTRANS(MI: I)) { |
1689 | for (Register Src : SrcVGPRs) { |
1690 | if (I.modifiesRegister(Src, &TRI)) { |
1691 | return HazardFound; |
1692 | } |
1693 | } |
1694 | } |
1695 | |
1696 | return NoHazardFound; |
1697 | }; |
1698 | auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { |
1699 | if (SIInstrInfo::isVALU(MI)) |
1700 | State.VALUs += 1; |
1701 | if (SIInstrInfo::isTRANS(MI)) |
1702 | State.TRANS += 1; |
1703 | }; |
1704 | |
1705 | DenseSet<const MachineBasicBlock *> Visited; |
1706 | if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(), |
1707 | I: std::next(x: MI->getReverseIterator()), Visited)) |
1708 | return false; |
1709 | |
1710 | // Hazard is observed - insert a wait on va_dst counter to ensure hazard is |
1711 | // avoided. |
1712 | BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), |
1713 | TII.get(AMDGPU::S_WAITCNT_DEPCTR)) |
1714 | .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); |
1715 | |
1716 | return true; |
1717 | } |
1718 | |
1719 | bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { |
1720 | if (!SIInstrInfo::isWMMA(MI: *MI) && !SIInstrInfo::isSWMMAC(MI: *MI)) |
1721 | return false; |
1722 | |
1723 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1724 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1725 | |
1726 | auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { |
1727 | if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I)) |
1728 | return false; |
1729 | |
1730 | // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps |
1731 | // with the dest(matrix D) of the previous wmma. |
1732 | const Register CurSrc0Reg = |
1733 | TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg(); |
1734 | const Register CurSrc1Reg = |
1735 | TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); |
1736 | |
1737 | const Register PrevDstReg = |
1738 | TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); |
1739 | |
1740 | if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) || |
1741 | TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) { |
1742 | return true; |
1743 | } |
1744 | |
1745 | // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) |
1746 | // but Index can't overlap with PrevDstReg. |
1747 | if (AMDGPU::isGFX12Plus(ST)) { |
1748 | if (SIInstrInfo::isSWMMAC(MI: *MI)) { |
1749 | const Register CurIndex = |
1750 | TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); |
1751 | if (TRI->regsOverlap(PrevDstReg, CurIndex)) |
1752 | return true; |
1753 | } |
1754 | return false; |
1755 | } |
1756 | |
1757 | return false; |
1758 | }; |
1759 | |
1760 | auto IsExpiredFn = [](const MachineInstr &I, int) { |
1761 | return SIInstrInfo::isVALU(MI: I); |
1762 | }; |
1763 | |
1764 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1765 | std::numeric_limits<int>::max()) |
1766 | return false; |
1767 | |
1768 | BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); |
1769 | |
1770 | return true; |
1771 | } |
1772 | |
1773 | bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { |
1774 | if (!ST.hasShift64HighRegBug()) |
1775 | return false; |
1776 | assert(!ST.hasExtendedWaitCounts()); |
1777 | |
1778 | switch (MI->getOpcode()) { |
1779 | default: |
1780 | return false; |
1781 | case AMDGPU::V_LSHLREV_B64_e64: |
1782 | case AMDGPU::V_LSHRREV_B64_e64: |
1783 | case AMDGPU::V_ASHRREV_I64_e64: |
1784 | break; |
1785 | } |
1786 | |
1787 | MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0); |
1788 | if (!Amt->isReg()) |
1789 | return false; |
1790 | |
1791 | Register AmtReg = Amt->getReg(); |
1792 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1793 | // Check if this is a last VGPR in the allocation block. |
1794 | if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7) |
1795 | return false; |
1796 | |
1797 | if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1)) |
1798 | return false; |
1799 | |
1800 | MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); |
1801 | bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg); |
1802 | bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI); |
1803 | bool Overlapped = OverlappedSrc || OverlappedDst; |
1804 | |
1805 | assert(!OverlappedDst || !OverlappedSrc || |
1806 | Src1->getReg() == MI->getOperand(0).getReg()); |
1807 | assert(ST.needsAlignedVGPRs()); |
1808 | static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); |
1809 | |
1810 | Register NewReg; |
1811 | for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass |
1812 | : AMDGPU::VGPR_32RegClass) { |
1813 | if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) { |
1814 | NewReg = Reg; |
1815 | break; |
1816 | } |
1817 | } |
1818 | |
1819 | Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1) |
1820 | : NewReg; |
1821 | Register NewAmtLo; |
1822 | |
1823 | if (Overlapped) |
1824 | NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0); |
1825 | |
1826 | DebugLoc DL = MI->getDebugLoc(); |
1827 | MachineBasicBlock *MBB = MI->getParent(); |
1828 | // Insert a full wait count because found register might be pending a wait. |
1829 | BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT)) |
1830 | .addImm(0); |
1831 | |
1832 | // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them. |
1833 | if (Overlapped) |
1834 | runOnInstruction( |
1835 | BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo) |
1836 | .addDef(AmtReg - 1) |
1837 | .addReg(AmtReg - 1, RegState::Undef) |
1838 | .addReg(NewAmtLo, RegState::Undef)); |
1839 | runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt) |
1840 | .addDef(AmtReg) |
1841 | .addReg(AmtReg, RegState::Undef) |
1842 | .addReg(NewAmt, RegState::Undef)); |
1843 | |
1844 | // Instructions emitted after the current instruction will be processed by the |
1845 | // parent loop of the hazard recognizer in a natural way. |
1846 | BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32), |
1847 | AmtReg) |
1848 | .addDef(NewAmt) |
1849 | .addReg(NewAmt) |
1850 | .addReg(AmtReg); |
1851 | if (Overlapped) |
1852 | BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32), |
1853 | AmtReg - 1) |
1854 | .addDef(NewAmtLo) |
1855 | .addReg(NewAmtLo) |
1856 | .addReg(AmtReg - 1); |
1857 | |
1858 | // Re-running hazard recognizer on the modified instruction is not necessary, |
1859 | // inserted V_SWAP_B32 has already both read and write new registers so |
1860 | // hazards related to these register has already been handled. |
1861 | Amt->setReg(NewAmt); |
1862 | Amt->setIsKill(false); |
1863 | // We do not update liveness, so verifier may see it as undef. |
1864 | Amt->setIsUndef(); |
1865 | if (OverlappedDst) |
1866 | MI->getOperand(i: 0).setReg(NewReg); |
1867 | if (OverlappedSrc) { |
1868 | Src1->setReg(NewReg); |
1869 | Src1->setIsKill(false); |
1870 | Src1->setIsUndef(); |
1871 | } |
1872 | |
1873 | return true; |
1874 | } |
1875 | |
1876 | int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { |
1877 | int NSAtoVMEMWaitStates = 1; |
1878 | |
1879 | if (!ST.hasNSAtoVMEMBug()) |
1880 | return 0; |
1881 | |
1882 | if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isMTBUF(MI: *MI)) |
1883 | return 0; |
1884 | |
1885 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1886 | const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset); |
1887 | if (!Offset || (Offset->getImm() & 6) == 0) |
1888 | return 0; |
1889 | |
1890 | auto IsHazardFn = [TII](const MachineInstr &I) { |
1891 | if (!SIInstrInfo::isMIMG(MI: I)) |
1892 | return false; |
1893 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode()); |
1894 | return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && |
1895 | TII->getInstSizeInBytes(I) >= 16; |
1896 | }; |
1897 | |
1898 | return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: 1); |
1899 | } |
1900 | |
1901 | int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { |
1902 | int FPAtomicToDenormModeWaitStates = 3; |
1903 | |
1904 | if (!ST.hasFPAtomicToDenormModeHazard()) |
1905 | return 0; |
1906 | assert(!ST.hasExtendedWaitCounts()); |
1907 | |
1908 | if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) |
1909 | return 0; |
1910 | |
1911 | auto IsHazardFn = [](const MachineInstr &I) { |
1912 | if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isFLAT(MI: I)) |
1913 | return false; |
1914 | return SIInstrInfo::isFPAtomic(MI: I); |
1915 | }; |
1916 | |
1917 | auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) { |
1918 | if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) |
1919 | return true; |
1920 | |
1921 | switch (MI.getOpcode()) { |
1922 | case AMDGPU::S_WAITCNT: |
1923 | case AMDGPU::S_WAITCNT_VSCNT: |
1924 | case AMDGPU::S_WAITCNT_VMCNT: |
1925 | case AMDGPU::S_WAITCNT_EXPCNT: |
1926 | case AMDGPU::S_WAITCNT_LGKMCNT: |
1927 | case AMDGPU::S_WAIT_IDLE: |
1928 | return true; |
1929 | default: |
1930 | break; |
1931 | } |
1932 | |
1933 | return false; |
1934 | }; |
1935 | |
1936 | return FPAtomicToDenormModeWaitStates - |
1937 | ::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn); |
1938 | } |
1939 | |
1940 | int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { |
1941 | assert(SIInstrInfo::isMAI(*MI)); |
1942 | |
1943 | return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); |
1944 | } |
1945 | |
1946 | int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) { |
1947 | // Early exit if no padding is requested. |
1948 | if (MFMAPaddingRatio == 0) |
1949 | return 0; |
1950 | |
1951 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1952 | if (!SIInstrInfo::isMFMA(MI: *MI) || MFI->getOccupancy() < 2) |
1953 | return 0; |
1954 | |
1955 | int NeighborMFMALatency = 0; |
1956 | auto IsNeighboringMFMA = [&NeighborMFMALatency, |
1957 | this](const MachineInstr &MI) { |
1958 | if (!SIInstrInfo::isMFMA(MI)) |
1959 | return false; |
1960 | |
1961 | NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI); |
1962 | return true; |
1963 | }; |
1964 | |
1965 | const int MaxMFMAPipelineWaitStates = 16; |
1966 | int WaitStatesSinceNeighborMFMA = |
1967 | getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates); |
1968 | |
1969 | int NeighborMFMAPaddingNeeded = |
1970 | (NeighborMFMALatency * MFMAPaddingRatio / 100) - |
1971 | WaitStatesSinceNeighborMFMA; |
1972 | |
1973 | return std::max(a: 0, b: NeighborMFMAPaddingNeeded); |
1974 | } |
1975 | |
1976 | int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { |
1977 | int WaitStatesNeeded = 0; |
1978 | unsigned Opc = MI->getOpcode(); |
1979 | |
1980 | auto IsVALUFn = [](const MachineInstr &MI) { |
1981 | return SIInstrInfo::isVALU(MI) || MI.isInlineAsm(); |
1982 | }; |
1983 | |
1984 | if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write |
1985 | const int LegacyVALUWritesVGPRWaitStates = 2; |
1986 | const int VALUWritesExecWaitStates = 4; |
1987 | const int MaxWaitStates = 4; |
1988 | |
1989 | int WaitStatesNeededForUse = VALUWritesExecWaitStates - |
1990 | getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates); |
1991 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
1992 | |
1993 | if (WaitStatesNeeded < MaxWaitStates) { |
1994 | for (const MachineOperand &Use : MI->explicit_uses()) { |
1995 | const int MaxWaitStates = 2; |
1996 | |
1997 | if (!Use.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
1998 | continue; |
1999 | |
2000 | int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates - |
2001 | getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates); |
2002 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2003 | |
2004 | if (WaitStatesNeeded == MaxWaitStates) |
2005 | break; |
2006 | } |
2007 | } |
2008 | } |
2009 | |
2010 | for (const MachineOperand &Op : MI->explicit_operands()) { |
2011 | if (!Op.isReg() || !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg())) |
2012 | continue; |
2013 | |
2014 | if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64) |
2015 | continue; |
2016 | |
2017 | const int MFMAWritesAGPROverlappedSrcABWaitStates = 4; |
2018 | const int MFMAWritesAGPROverlappedSrcCWaitStates = 2; |
2019 | const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4; |
2020 | const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10; |
2021 | const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18; |
2022 | const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1; |
2023 | const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; |
2024 | const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; |
2025 | const int MaxWaitStates = 18; |
2026 | Register Reg = Op.getReg(); |
2027 | unsigned HazardDefLatency = 0; |
2028 | |
2029 | auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency, |
2030 | this](const MachineInstr &MI) { |
2031 | if (!SIInstrInfo::isMFMA(MI)) |
2032 | return false; |
2033 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2034 | if (DstReg == Reg) |
2035 | return false; |
2036 | HazardDefLatency = |
2037 | std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI)); |
2038 | return TRI.regsOverlap(DstReg, Reg); |
2039 | }; |
2040 | |
2041 | int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, |
2042 | MaxWaitStates); |
2043 | int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; |
2044 | int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); |
2045 | int OpNo = Op.getOperandNo(); |
2046 | if (OpNo == SrcCIdx) { |
2047 | NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; |
2048 | } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) { |
2049 | switch (HazardDefLatency) { |
2050 | case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates; |
2051 | break; |
2052 | case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates; |
2053 | break; |
2054 | case 16: [[fallthrough]]; |
2055 | default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates; |
2056 | break; |
2057 | } |
2058 | } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { |
2059 | switch (HazardDefLatency) { |
2060 | case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates; |
2061 | break; |
2062 | case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates; |
2063 | break; |
2064 | case 16: [[fallthrough]]; |
2065 | default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates; |
2066 | break; |
2067 | } |
2068 | } |
2069 | |
2070 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2071 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2072 | |
2073 | if (WaitStatesNeeded == MaxWaitStates) |
2074 | return WaitStatesNeeded; // Early exit. |
2075 | |
2076 | auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) { |
2077 | if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) |
2078 | return false; |
2079 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2080 | return TRI.regsOverlap(Reg, DstReg); |
2081 | }; |
2082 | |
2083 | const int AccVGPRWriteMFMAReadSrcCWaitStates = 1; |
2084 | const int AccVGPRWriteMFMAReadSrcABWaitStates = 3; |
2085 | const int AccVGPRWriteAccVgprReadWaitStates = 3; |
2086 | NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates; |
2087 | if (OpNo == SrcCIdx) |
2088 | NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates; |
2089 | else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) |
2090 | NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates; |
2091 | |
2092 | WaitStatesNeededForUse = NeedWaitStates - |
2093 | getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates); |
2094 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2095 | |
2096 | if (WaitStatesNeeded == MaxWaitStates) |
2097 | return WaitStatesNeeded; // Early exit. |
2098 | } |
2099 | |
2100 | if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { |
2101 | const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0; |
2102 | const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; |
2103 | const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; |
2104 | const int MaxWaitStates = 13; |
2105 | Register DstReg = MI->getOperand(i: 0).getReg(); |
2106 | unsigned HazardDefLatency = 0; |
2107 | |
2108 | auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency, |
2109 | this](const MachineInstr &MI) { |
2110 | if (!SIInstrInfo::isMFMA(MI)) |
2111 | return false; |
2112 | Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); |
2113 | HazardDefLatency = |
2114 | std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI)); |
2115 | return TRI.regsOverlap(Reg, DstReg); |
2116 | }; |
2117 | |
2118 | int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates); |
2119 | int NeedWaitStates; |
2120 | switch (HazardDefLatency) { |
2121 | case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates; |
2122 | break; |
2123 | case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates; |
2124 | break; |
2125 | case 16: [[fallthrough]]; |
2126 | default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates; |
2127 | break; |
2128 | } |
2129 | |
2130 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince; |
2131 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2132 | } |
2133 | |
2134 | // Pad neighboring MFMA with noops for better inter-wave performance. |
2135 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI)); |
2136 | |
2137 | return WaitStatesNeeded; |
2138 | } |
2139 | |
/// Map an instruction's pass count to a wait-state count: passes plus one.
static int
GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // passes:       2   4   8   16
  // wait states:  3   5   9   17
  constexpr int ExtraWaitStates = 1;
  return NumPasses + ExtraWaitStates;
}
2148 | |
/// Map an instruction's pass count to a wait-state count: the two are equal.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // passes:       2   4   8   16
  // wait states:  2   4   8   16
  const int WaitStates = NumPasses;
  return WaitStates;
}
2157 | |
/// Map an instruction's pass count to a wait-state count: passes plus two.
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // passes:       2   4   8   16
  // wait states:  4   6   10  18
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
2166 | |
/// Map an instruction's pass count to a wait-state count: passes plus three.
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // passes:       2   4   8   16
  // wait states:  5   7   11  19
  constexpr int ExtraWaitStates = 3;
  return NumPasses + ExtraWaitStates;
}
2174 | |
2175 | int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { |
2176 | int WaitStatesNeeded = 0; |
2177 | unsigned Opc = MI->getOpcode(); |
2178 | |
2179 | auto IsLegacyVALUFn = [](const MachineInstr &MI) { |
2180 | return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI); |
2181 | }; |
2182 | |
2183 | auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) { |
2184 | return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) && |
2185 | !SIInstrInfo::isDOT(MI); |
2186 | }; |
2187 | |
2188 | if (!SIInstrInfo::isMFMA(MI: *MI)) |
2189 | return WaitStatesNeeded; |
2190 | |
2191 | const int VALUWritesExecWaitStates = 4; |
2192 | int WaitStatesNeededForUse = VALUWritesExecWaitStates - |
2193 | getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn, |
2194 | VALUWritesExecWaitStates); |
2195 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2196 | |
2197 | int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); |
2198 | |
2199 | // Loop for both DGEMM and S/HGEMM 2nd instruction. |
2200 | for (const MachineOperand &Use : MI->explicit_uses()) { |
2201 | const int LegacyVALUNotDotWritesVGPRWaitStates = 2; |
2202 | const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; |
2203 | const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; |
2204 | const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; |
2205 | const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; |
2206 | const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9; |
2207 | const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17; |
2208 | const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9; |
2209 | const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4; |
2210 | const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; |
2211 | const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; |
2212 | const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; |
2213 | const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; |
2214 | const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; |
2215 | const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; |
2216 | const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2; |
2217 | const int MaxWaitStates = 19; |
2218 | |
2219 | if (!Use.isReg()) |
2220 | continue; |
2221 | Register Reg = Use.getReg(); |
2222 | bool FullReg; |
2223 | const MachineInstr *MI1; |
2224 | |
2225 | auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1, |
2226 | this](const MachineInstr &MI) { |
2227 | if (!SIInstrInfo::isMFMA(MI)) |
2228 | return false; |
2229 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2230 | FullReg = (DstReg == Reg); |
2231 | MI1 = &MI; |
2232 | return TRI.regsOverlap(DstReg, Reg); |
2233 | }; |
2234 | |
2235 | WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates - |
2236 | getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates); |
2237 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2238 | |
2239 | int NumWaitStates = |
2240 | getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates); |
2241 | if (NumWaitStates == std::numeric_limits<int>::max()) |
2242 | continue; |
2243 | |
2244 | int OpNo = Use.getOperandNo(); |
2245 | unsigned Opc1 = MI1->getOpcode(); |
2246 | int NeedWaitStates = 0; |
2247 | if (OpNo == SrcCIdx) { |
2248 | if (!isDGEMM(Opcode: Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opcode: Opc1))) { |
2249 | NeedWaitStates = 0; |
2250 | } else if (FullReg) { |
2251 | if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || |
2252 | Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) && |
2253 | (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || |
2254 | Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) |
2255 | NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; |
2256 | else if (ST.hasGFX940Insts() && |
2257 | TSchedModel.computeInstrLatency(MI: MI1) == 2) |
2258 | NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates; |
2259 | } else { |
2260 | switch (Opc1) { |
2261 | case AMDGPU::V_MFMA_F64_16X16X4F64_e64: |
2262 | case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: |
2263 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: |
2264 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: |
2265 | if (!isXDL(ST, MI: *MI)) |
2266 | NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates; |
2267 | break; |
2268 | case AMDGPU::V_MFMA_F64_4X4X4F64_e64: |
2269 | case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: |
2270 | if (!isXDL(ST, MI: *MI)) |
2271 | NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; |
2272 | break; |
2273 | default: |
2274 | int NumPasses = TSchedModel.computeInstrLatency(MI: MI1); |
2275 | if (ST.hasGFX940Insts()) { |
2276 | if (isXDL(ST, MI: *MI) && !isXDL(ST, MI: *MI1)) |
2277 | break; |
2278 | |
2279 | NeedWaitStates = |
2280 | isXDL(ST, MI: *MI1) |
2281 | ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( |
2282 | NumPasses) |
2283 | : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( |
2284 | NumPasses); |
2285 | break; |
2286 | } |
2287 | |
2288 | switch (NumPasses) { |
2289 | case 2: |
2290 | NeedWaitStates = |
2291 | isDGEMM(Opcode: Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates |
2292 | : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; |
2293 | break; |
2294 | case 8: |
2295 | NeedWaitStates = |
2296 | isDGEMM(Opcode: Opc) |
2297 | ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates |
2298 | : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; |
2299 | break; |
2300 | case 16: |
2301 | NeedWaitStates = |
2302 | isDGEMM(Opcode: Opc) |
2303 | ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates |
2304 | : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; |
2305 | break; |
2306 | default: |
2307 | llvm_unreachable("unexpected number of passes" ); |
2308 | } |
2309 | } |
2310 | } |
2311 | } else { |
2312 | switch (Opc1) { |
2313 | case AMDGPU::V_MFMA_F64_16X16X4F64_e64: |
2314 | case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: |
2315 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: |
2316 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: |
2317 | NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; |
2318 | break; |
2319 | case AMDGPU::V_MFMA_F64_4X4X4F64_e64: |
2320 | case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: |
2321 | NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates; |
2322 | break; |
2323 | default: |
2324 | int NumPasses = TSchedModel.computeInstrLatency(MI: MI1); |
2325 | |
2326 | if (ST.hasGFX940Insts()) { |
2327 | NeedWaitStates = |
2328 | isXDL(ST, MI: *MI1) |
2329 | ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates( |
2330 | NumPasses) |
2331 | : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates( |
2332 | NumPasses); |
2333 | break; |
2334 | } |
2335 | |
2336 | switch (NumPasses) { |
2337 | case 2: |
2338 | NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; |
2339 | break; |
2340 | case 4: |
2341 | llvm_unreachable("unexpected number of passes for mfma" ); |
2342 | case 8: |
2343 | NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; |
2344 | break; |
2345 | case 16: [[fallthrough]]; |
2346 | default: |
2347 | NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; |
2348 | } |
2349 | } |
2350 | } |
2351 | if (WaitStatesNeeded >= NeedWaitStates) |
2352 | continue; |
2353 | |
2354 | WaitStatesNeededForUse = NeedWaitStates - NumWaitStates; |
2355 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2356 | |
2357 | if (WaitStatesNeeded == MaxWaitStates) |
2358 | break; |
2359 | } |
2360 | |
2361 | // Pad neighboring MFMA with noops for better inter-wave performance. |
2362 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI)); |
2363 | |
2364 | return WaitStatesNeeded; |
2365 | } |
2366 | |
2367 | int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { |
2368 | // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards() |
2369 | if (!ST.hasMAIInsts() || ST.hasGFX90AInsts()) |
2370 | return 0; |
2371 | |
2372 | int WaitStatesNeeded = 0; |
2373 | |
2374 | auto IsAccVgprReadFn = [](const MachineInstr &MI) { |
2375 | return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64; |
2376 | }; |
2377 | |
2378 | for (const MachineOperand &Op : MI->explicit_uses()) { |
2379 | if (!Op.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg())) |
2380 | continue; |
2381 | |
2382 | Register Reg = Op.getReg(); |
2383 | |
2384 | const int AccVgprReadLdStWaitStates = 2; |
2385 | const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1; |
2386 | const int MaxWaitStates = 2; |
2387 | |
2388 | int WaitStatesNeededForUse = AccVgprReadLdStWaitStates - |
2389 | getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates); |
2390 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2391 | |
2392 | if (WaitStatesNeeded == MaxWaitStates) |
2393 | return WaitStatesNeeded; // Early exit. |
2394 | |
2395 | auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) { |
2396 | if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 && |
2397 | MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) |
2398 | return false; |
2399 | auto IsVALUFn = [](const MachineInstr &MI) { |
2400 | return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI); |
2401 | }; |
2402 | return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: 2 /*MaxWaitStates*/) < |
2403 | std::numeric_limits<int>::max(); |
2404 | }; |
2405 | |
2406 | WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates - |
2407 | getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates); |
2408 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2409 | } |
2410 | |
2411 | return WaitStatesNeeded; |
2412 | } |
2413 | |
/// Map an instruction's pass count to a wait-state count: passes plus two.
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // passes:       2   4   8   16
  // wait states:  4   6   10  18
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
2421 | |
/// Map an instruction's pass count to a wait-state count: passes plus three.
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // passes:       2   4   8   16
  // wait states:  5   7   11  19
  constexpr int ExtraWaitStates = 3;
  return NumPasses + ExtraWaitStates;
}
2429 | |
/// Map an instruction's pass count to a wait-state count: passes plus three.
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // passes:       2   4   8   16
  // wait states:  5   7   11  19
  constexpr int ExtraWaitStates = 3;
  return NumPasses + ExtraWaitStates;
}
2437 | |
/// Map an instruction's pass count to a wait-state count: passes plus two.
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // passes:       2   4   8   16
  // wait states:  4   6   10  18
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
2445 | |
2446 | int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { |
2447 | if (!ST.hasGFX90AInsts()) |
2448 | return 0; |
2449 | |
2450 | auto IsDGEMMFn = [](const MachineInstr &MI) -> bool { |
2451 | return isDGEMM(Opcode: MI.getOpcode()); |
2452 | }; |
2453 | |
2454 | // This is checked in checkMAIHazards90A() |
2455 | if (SIInstrInfo::isMFMA(MI: *MI)) |
2456 | return 0; |
2457 | |
2458 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
2459 | |
2460 | int WaitStatesNeeded = 0; |
2461 | |
2462 | bool IsMem = SIInstrInfo::isVMEM(MI: *MI) || |
2463 | SIInstrInfo::isFLAT(MI: *MI) || |
2464 | SIInstrInfo::isDS(MI: *MI); |
2465 | bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(MI: *MI); |
2466 | bool IsVALU = SIInstrInfo::isVALU(MI: *MI); |
2467 | |
2468 | const MachineInstr *MFMA = nullptr; |
2469 | unsigned Reg; |
2470 | auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) { |
2471 | if (!SIInstrInfo::isMFMA(MI) || |
2472 | !TRI.regsOverlap(MI.getOperand(i: 0).getReg(), Reg)) |
2473 | return false; |
2474 | MFMA = &MI; |
2475 | return true; |
2476 | }; |
2477 | |
2478 | const MachineInstr *DOT = nullptr; |
2479 | auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) { |
2480 | if (!SIInstrInfo::isDOT(MI) || |
2481 | !TRI.regsOverlap(MI.getOperand(i: 0).getReg(), Reg)) |
2482 | return false; |
2483 | DOT = &MI; |
2484 | return true; |
2485 | }; |
2486 | |
2487 | bool DGEMMAfterVALUWrite = false; |
2488 | auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) { |
2489 | // Found DGEMM on reverse traversal to def. |
2490 | if (isDGEMM(Opcode: MI.getOpcode())) |
2491 | DGEMMAfterVALUWrite = true; |
2492 | |
    // Only a hazard if the register is defined by a VALU and a DGEMM is found
    // after the def.
2495 | if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite) |
2496 | return false; |
2497 | |
2498 | return true; |
2499 | }; |
2500 | |
2501 | int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), |
2502 | AMDGPU::OpName::src2); |
2503 | |
2504 | if (IsMemOrExport || IsVALU) { |
2505 | const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5; |
2506 | const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11; |
2507 | const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19; |
2508 | const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9; |
2509 | const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18; |
2510 | const int DMFMA4x4WriteVgprVALUReadWaitStates = 6; |
2511 | const int DMFMA16x16WriteVgprVALUReadWaitStates = 11; |
2512 | const int DotWriteSameDotReadSrcAB = 3; |
2513 | const int DotWriteDifferentVALURead = 3; |
2514 | const int DMFMABetweenVALUWriteVMEMRead = 2; |
2515 | const int MaxWaitStates = 19; |
2516 | |
2517 | for (const MachineOperand &Use : MI->explicit_uses()) { |
2518 | if (!Use.isReg()) |
2519 | continue; |
2520 | Reg = Use.getReg(); |
2521 | |
2522 | DOT = nullptr; |
2523 | int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn, |
2524 | Limit: MaxWaitStates); |
2525 | if (DOT) { |
2526 | int NeedWaitStates = 0; |
2527 | if (DOT->getOpcode() == MI->getOpcode()) { |
2528 | if (&Use - &MI->getOperand(i: 0) != SrcCIdx) |
2529 | NeedWaitStates = DotWriteSameDotReadSrcAB; |
2530 | } else { |
2531 | NeedWaitStates = DotWriteDifferentVALURead; |
2532 | } |
2533 | |
2534 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2535 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2536 | } |
2537 | |
2538 | // Workaround for HW data hazard bug observed only in GFX90A. When there |
2539 | // is a DGEMM instruction in-between a VALU and a VMEM instruction it |
2540 | // causes the SQ to incorrectly not insert two wait states between the two |
2541 | // instructions needed to avoid data hazard. |
2542 | if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) { |
2543 | DGEMMAfterVALUWrite = false; |
2544 | if (TRI.isVectorRegister(MRI, Reg)) { |
2545 | int WaitStatesNeededForUse = |
2546 | DMFMABetweenVALUWriteVMEMRead - |
2547 | getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard, |
2548 | Limit: DMFMABetweenVALUWriteVMEMRead); |
2549 | |
2550 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2551 | } |
2552 | } |
2553 | |
2554 | MFMA = nullptr; |
2555 | WaitStatesSinceDef = |
2556 | getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates); |
2557 | if (!MFMA) |
2558 | continue; |
2559 | |
2560 | unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA); |
2561 | int NumPasses = HazardDefLatency; |
2562 | int NeedWaitStates = MaxWaitStates; |
2563 | |
2564 | if (isDGEMM(Opcode: MFMA->getOpcode())) { |
2565 | switch (HazardDefLatency) { |
2566 | case 4: |
2567 | NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates |
2568 | : DMFMA4x4WriteVgprVALUReadWaitStates; |
2569 | break; |
2570 | case 8: |
2571 | case 16: |
2572 | NeedWaitStates = IsMemOrExport |
2573 | ? DMFMA16x16WriteVgprMemExpReadWaitStates |
2574 | : DMFMA16x16WriteVgprVALUReadWaitStates; |
2575 | break; |
2576 | default: |
2577 | llvm_unreachable("unexpected dgemm" ); |
2578 | } |
2579 | } else if (ST.hasGFX940Insts()) { |
2580 | NeedWaitStates = |
2581 | isXDL(ST, MI: *MFMA) |
2582 | ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses) |
2583 | : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates( |
2584 | NumPasses); |
2585 | } else { |
2586 | switch (HazardDefLatency) { |
2587 | case 2: |
2588 | NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; |
2589 | break; |
2590 | case 8: |
2591 | NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; |
2592 | break; |
2593 | case 16: |
2594 | NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates; |
2595 | break; |
2596 | default: |
2597 | llvm_unreachable("unexpected number of passes for mfma" ); |
2598 | } |
2599 | } |
2600 | |
2601 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2602 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2603 | |
2604 | if (WaitStatesNeeded == MaxWaitStates) |
2605 | break; |
2606 | } |
2607 | } |
2608 | |
2609 | unsigned Opc = MI->getOpcode(); |
2610 | const int DMFMAToFMA64WaitStates = 2; |
2611 | if ((Opc == AMDGPU::V_FMA_F64_e64 || |
2612 | Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 || |
2613 | Opc == AMDGPU::V_FMAC_F64_dpp) && |
2614 | WaitStatesNeeded < DMFMAToFMA64WaitStates) { |
2615 | int WaitStatesNeededForUse = DMFMAToFMA64WaitStates - |
2616 | getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates); |
2617 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2618 | } |
2619 | |
2620 | if (!IsVALU && !IsMemOrExport) |
2621 | return WaitStatesNeeded; |
2622 | |
2623 | for (const MachineOperand &Def : MI->defs()) { |
2624 | const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; |
2625 | const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; |
2626 | const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; |
2627 | const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; |
2628 | const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3; |
2629 | const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; |
2630 | const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; |
2631 | const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; |
2632 | const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11; |
2633 | const int DotWriteDifferentVALUWrite = 3; |
2634 | const int MaxWaitStates = 19; |
2635 | const int MaxWarWaitStates = 15; |
2636 | |
2637 | Reg = Def.getReg(); |
2638 | |
2639 | DOT = nullptr; |
2640 | int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn, |
2641 | Limit: MaxWaitStates); |
2642 | if (DOT && DOT->getOpcode() != MI->getOpcode()) |
2643 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: DotWriteDifferentVALUWrite - |
2644 | WaitStatesSinceDef); |
2645 | |
2646 | MFMA = nullptr; |
2647 | WaitStatesSinceDef = |
2648 | getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates); |
2649 | if (MFMA) { |
2650 | int NeedWaitStates = MaxWaitStates; |
2651 | int NumPasses = TSchedModel.computeInstrLatency(MI: MFMA); |
2652 | |
2653 | if (isDGEMM(Opcode: MFMA->getOpcode())) { |
2654 | switch (NumPasses) { |
2655 | case 4: |
2656 | NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; |
2657 | break; |
2658 | case 8: |
2659 | case 16: |
2660 | NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates; |
2661 | break; |
2662 | default: |
2663 | llvm_unreachable("unexpected number of cycles for dgemm" ); |
2664 | } |
2665 | } else if (ST.hasGFX940Insts()) { |
2666 | NeedWaitStates = |
2667 | isXDL(ST, MI: *MFMA) |
2668 | ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses) |
2669 | : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses); |
2670 | } else { |
2671 | switch (NumPasses) { |
2672 | case 2: |
2673 | NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; |
2674 | break; |
2675 | case 8: |
2676 | NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; |
2677 | break; |
2678 | case 16: |
2679 | NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates; |
2680 | break; |
2681 | default: |
2682 | llvm_unreachable("Unexpected number of passes for mfma" ); |
2683 | } |
2684 | } |
2685 | |
2686 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2687 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2688 | |
2689 | if (WaitStatesNeeded == MaxWaitStates) |
2690 | break; |
2691 | } |
2692 | |
2693 | auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) { |
2694 | if (!SIInstrInfo::isMFMA(MI) || isDGEMM(Opcode: MI.getOpcode()) || |
2695 | !MI.readsRegister(Reg, &TRI)) |
2696 | return false; |
2697 | |
2698 | if (ST.hasGFX940Insts() && !isXDL(ST, MI)) |
2699 | return false; |
2700 | |
2701 | const MachineOperand *SrcC = |
2702 | TII.getNamedOperand(MI, AMDGPU::OpName::src2); |
2703 | assert(SrcC); |
2704 | if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg)) |
2705 | return false; |
2706 | |
2707 | MFMA = &MI; |
2708 | return true; |
2709 | }; |
2710 | |
2711 | MFMA = nullptr; |
2712 | int WaitStatesSinceUse = getWaitStatesSince(IsHazard: IsSMFMAReadAsCFn, |
2713 | Limit: MaxWarWaitStates); |
2714 | if (!MFMA) |
2715 | continue; |
2716 | |
2717 | unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA); |
2718 | int NeedWaitStates = MaxWaitStates; |
2719 | switch (HazardDefLatency) { |
2720 | case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; |
2721 | break; |
2722 | case 4: assert(ST.hasGFX940Insts()); |
2723 | NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates; |
2724 | break; |
2725 | case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; |
2726 | break; |
2727 | case 16: [[fallthrough]]; |
2728 | default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates; |
2729 | break; |
2730 | } |
2731 | |
2732 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse; |
2733 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2734 | } |
2735 | |
2736 | return WaitStatesNeeded; |
2737 | } |
2738 | |
2739 | bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { |
2740 | if (!SU->isInstr()) |
2741 | return false; |
2742 | |
2743 | const MachineInstr *MAI = nullptr; |
2744 | |
2745 | auto IsMFMAFn = [&MAI](const MachineInstr &MI) { |
2746 | MAI = nullptr; |
2747 | if (SIInstrInfo::isMFMA(MI)) |
2748 | MAI = &MI; |
2749 | return MAI != nullptr; |
2750 | }; |
2751 | |
2752 | MachineInstr *MI = SU->getInstr(); |
2753 | if (IsMFMAFn(*MI)) { |
2754 | int W = getWaitStatesSince(IsHazard: IsMFMAFn, Limit: 16); |
2755 | if (MAI) |
2756 | return W < (int)TSchedModel.computeInstrLatency(MI: MAI); |
2757 | } |
2758 | |
2759 | return false; |
2760 | } |
2761 | |
2762 | bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { |
2763 | if (!ST.hasVALUMaskWriteHazard()) |
2764 | return false; |
2765 | assert(!ST.hasExtendedWaitCounts()); |
2766 | |
2767 | if (!ST.isWave64() || !SIInstrInfo::isSALU(MI: *MI)) |
2768 | return false; |
2769 | |
2770 | // The hazard sequence is three instructions: |
2771 | // 1. VALU reads SGPR as mask |
2772 | // 2. SALU writes SGPR |
2773 | // 3. SALU reads SGPR |
2774 | // The hazard can expire if the distance between 2 and 3 is sufficient. |
2775 | // In practice this happens <10% of the time, hence this always assumes |
2776 | // the hazard exists if 1 and 2 are present to avoid searching. |
2777 | |
2778 | const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); |
2779 | if (!SDSTOp || !SDSTOp->isReg()) |
2780 | return false; |
2781 | |
2782 | const Register HazardReg = SDSTOp->getReg(); |
2783 | if (HazardReg == AMDGPU::EXEC || |
2784 | HazardReg == AMDGPU::EXEC_LO || |
2785 | HazardReg == AMDGPU::EXEC_HI || |
2786 | HazardReg == AMDGPU::M0) |
2787 | return false; |
2788 | |
2789 | auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { |
2790 | switch (I.getOpcode()) { |
2791 | case AMDGPU::V_ADDC_U32_e32: |
2792 | case AMDGPU::V_ADDC_U32_dpp: |
2793 | case AMDGPU::V_CNDMASK_B16_e32: |
2794 | case AMDGPU::V_CNDMASK_B16_dpp: |
2795 | case AMDGPU::V_CNDMASK_B32_e32: |
2796 | case AMDGPU::V_CNDMASK_B32_dpp: |
2797 | case AMDGPU::V_DIV_FMAS_F32_e64: |
2798 | case AMDGPU::V_DIV_FMAS_F64_e64: |
2799 | case AMDGPU::V_SUBB_U32_e32: |
2800 | case AMDGPU::V_SUBB_U32_dpp: |
2801 | case AMDGPU::V_SUBBREV_U32_e32: |
2802 | case AMDGPU::V_SUBBREV_U32_dpp: |
2803 | // These implicitly read VCC as mask source. |
2804 | return HazardReg == AMDGPU::VCC || |
2805 | HazardReg == AMDGPU::VCC_LO || |
2806 | HazardReg == AMDGPU::VCC_HI; |
2807 | case AMDGPU::V_ADDC_U32_e64: |
2808 | case AMDGPU::V_ADDC_U32_e64_dpp: |
2809 | case AMDGPU::V_CNDMASK_B16_e64: |
2810 | case AMDGPU::V_CNDMASK_B16_e64_dpp: |
2811 | case AMDGPU::V_CNDMASK_B32_e64: |
2812 | case AMDGPU::V_CNDMASK_B32_e64_dpp: |
2813 | case AMDGPU::V_SUBB_U32_e64: |
2814 | case AMDGPU::V_SUBB_U32_e64_dpp: |
2815 | case AMDGPU::V_SUBBREV_U32_e64: |
2816 | case AMDGPU::V_SUBBREV_U32_e64_dpp: { |
2817 | // Only check mask register overlaps. |
2818 | const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); |
2819 | assert(SSRCOp); |
2820 | return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); |
2821 | } |
2822 | default: |
2823 | return false; |
2824 | } |
2825 | }; |
2826 | |
2827 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
2828 | auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { |
2829 | // s_waitcnt_depctr sa_sdst(0) mitigates hazard. |
2830 | if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
2831 | AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) |
2832 | return true; |
2833 | |
2834 | // VALU access to any SGPR or literal constant other than HazardReg |
2835 | // mitigates hazard. No need to check HazardReg here as this will |
2836 | // only be called when !IsHazardFn. |
2837 | if (!SIInstrInfo::isVALU(MI: I)) |
2838 | return false; |
2839 | for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { |
2840 | const MachineOperand &Op = I.getOperand(i: OpNo); |
2841 | if (Op.isReg()) { |
2842 | Register OpReg = Op.getReg(); |
2843 | // Only consider uses |
2844 | if (!Op.isUse()) |
2845 | continue; |
2846 | // Ignore EXEC |
2847 | if (OpReg == AMDGPU::EXEC || |
2848 | OpReg == AMDGPU::EXEC_LO || |
2849 | OpReg == AMDGPU::EXEC_HI) |
2850 | continue; |
2851 | // Ignore all implicit uses except VCC |
2852 | if (Op.isImplicit()) { |
2853 | if (OpReg == AMDGPU::VCC || |
2854 | OpReg == AMDGPU::VCC_LO || |
2855 | OpReg == AMDGPU::VCC_HI) |
2856 | return true; |
2857 | continue; |
2858 | } |
2859 | if (TRI.isSGPRReg(MRI, Reg: OpReg)) |
2860 | return true; |
2861 | } else { |
2862 | const MCInstrDesc &InstDesc = I.getDesc(); |
2863 | const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; |
2864 | if (!TII.isInlineConstant(MO: Op, OpInfo)) |
2865 | return true; |
2866 | } |
2867 | } |
2868 | return false; |
2869 | }; |
2870 | |
2871 | // Check for hazard |
2872 | if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == |
2873 | std::numeric_limits<int>::max()) |
2874 | return false; |
2875 | |
2876 | auto NextMI = std::next(x: MI->getIterator()); |
2877 | |
2878 | // Add s_waitcnt_depctr sa_sdst(0) after SALU write. |
2879 | BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), |
2880 | TII.get(AMDGPU::S_WAITCNT_DEPCTR)) |
2881 | .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); |
2882 | |
2883 | // SALU write may be s_getpc in a bundle. |
2884 | if (MI->getOpcode() == AMDGPU::S_GETPC_B64) { |
2885 | // Update offsets of any references in the bundle. |
2886 | while (NextMI != MI->getParent()->end() && |
2887 | NextMI->isBundledWithPred()) { |
2888 | for (auto &Operand : NextMI->operands()) { |
2889 | if (Operand.isGlobal()) |
2890 | Operand.setOffset(Operand.getOffset() + 4); |
2891 | } |
2892 | NextMI++; |
2893 | } |
2894 | } |
2895 | |
2896 | return true; |
2897 | } |
2898 | |