1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
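///
/// For example (illustrative only), for a sequence such as
///   global_load_dword v0, v[2:3], off
///   v_add_f32_e32 v1, v0, v0   ; uses the loaded value
/// the pass inserts "s_waitcnt vmcnt(0)" before the add so that the load has
/// completed before its result is read.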
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
28#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29#include "SIMachineFunctionInfo.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/PostOrderIterator.h"
33#include "llvm/ADT/Sequence.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/CodeGen/MachineLoopInfo.h"
36#include "llvm/CodeGen/MachinePostDominators.h"
37#include "llvm/InitializePasses.h"
38#include "llvm/Support/DebugCounter.h"
39#include "llvm/TargetParser/TargetParser.h"
40using namespace llvm;
41
42#define DEBUG_TYPE "si-insert-waitcnts"
43
44DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
45 "Force emit s_waitcnt expcnt(0) instrs");
46DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
47 "Force emit s_waitcnt lgkmcnt(0) instrs");
48DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
49 "Force emit s_waitcnt vmcnt(0) instrs");
50
51static cl::opt<bool> ForceEmitZeroFlag(
52 "amdgpu-waitcnt-forcezero",
53 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
54 cl::init(Val: false), cl::Hidden);
55
56namespace {
// Class of object that encapsulates the latest instruction counter score
// associated with each operand. Used to determine whether an s_waitcnt
// instruction needs to be emitted.
60
61enum InstCounterType {
62 LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
64 EXP_CNT, //
65 STORE_CNT, // VScnt in gfx10/gfx11.
66 NUM_NORMAL_INST_CNTS,
67 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
68 BVH_CNT, // gfx12+ only.
69 KM_CNT, // gfx12+ only.
70 NUM_EXTENDED_INST_CNTS,
71 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
72};
73} // namespace
74
75namespace llvm {
76template <> struct enum_iteration_traits<InstCounterType> {
77 static constexpr bool is_iterable = true;
78};
79} // namespace llvm
80
81namespace {
// Return an iterator range over all counters from LOAD_CNT (the first counter)
// up to \c MaxCounter (exclusive; the default value yields an enumeration over
// all counters).
85auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
86 return enum_seq(Begin: LOAD_CNT, End: MaxCounter);
87}
88
89using RegInterval = std::pair<int, int>;
90
91struct HardwareLimits {
92 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
93 unsigned ExpcntMax;
94 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
95 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
96 unsigned SamplecntMax; // gfx12+ only.
97 unsigned BvhcntMax; // gfx12+ only.
98 unsigned KmcntMax; // gfx12+ only.
99};
100
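// Hardware encodings of the first and last VGPR and SGPR, used by
// getRegInterval() to map physical registers onto indices in the scoring
// tables.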
101struct RegisterEncoding {
102 unsigned VGPR0;
103 unsigned VGPRL;
104 unsigned SGPR0;
105 unsigned SGPRL;
106};
107
108enum WaitEventType {
109 VMEM_ACCESS, // vector-memory read & write
110 VMEM_READ_ACCESS, // vector-memory read
111 VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
112 VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
113 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
114 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
115 LDS_ACCESS, // lds read & write
116 GDS_ACCESS, // gds read & write
117 SQ_MESSAGE, // send message
118 SMEM_ACCESS, // scalar-memory read & write
119 EXP_GPR_LOCK, // export holding on its data src
120 GDS_GPR_LOCK, // GDS holding on its data and addr src
121 EXP_POS_ACCESS, // write to export position
122 EXP_PARAM_ACCESS, // write to export parameter
123 VMW_GPR_LOCK, // vector-memory write holding on its data src
124 EXP_LDS_ACCESS, // read by ldsdir counting as export
125 NUM_WAIT_EVENTS,
126};
127
128// The mapping is:
129// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
130// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
131// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
132// We reserve a fixed number of VGPR slots in the scoring tables for
133// special tokens like SCMEM_LDS (needed for buffer load to LDS).
134enum RegisterMapping {
135 SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
136 AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
137 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
138 NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
  // Artificial register slots to track LDS writes into specific LDS locations
  // if the location is known. When slots are exhausted or the location is
  // unknown, use the first slot. The first slot is also always updated in
  // addition to the known location's slot, to properly generate waits if the
  // dependent instruction's location is unknown.
144 EXTRA_VGPR_LDS = 0,
145 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
146};
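// For example (illustrative, using the values above), VGPR v3 maps to slot 3,
// AGPR a3 maps to slot 3 + AGPR_OFFSET = 259, and SGPR s5 maps to
// NUM_ALL_VGPRS + 5 = 526; see getRegInterval().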
147
148// Enumerate different types of result-returning VMEM operations. Although
149// s_waitcnt orders them all with a single vmcnt counter, in the absence of
150// s_waitcnt only instructions of the same VmemType are guaranteed to write
151// their results in order -- so there is no need to insert an s_waitcnt between
152// two instructions of the same type that write the same vgpr.
153enum VmemType {
154 // BUF instructions and MIMG instructions without a sampler.
155 VMEM_NOSAMPLER,
156 // MIMG instructions with a sampler.
157 VMEM_SAMPLER,
158 // BVH instructions
159 VMEM_BVH,
160 NUM_VMEM_TYPES
161};
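// For example, two sampler (VMEM_SAMPLER) loads writing the same VGPR need no
// intervening wait, but a buffer load (VMEM_NOSAMPLER) followed by a sampler
// load writing the same VGPR does; see hasOtherPendingVmemTypes().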
162
163// Maps values of InstCounterType to the instruction that waits on that
164// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
165// returns true.
166static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
167 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
168 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
169 AMDGPU::S_WAIT_KMCNT};
170
171static bool updateVMCntOnly(const MachineInstr &Inst) {
172 return SIInstrInfo::isVMEM(MI: Inst) || SIInstrInfo::isFLATGlobal(MI: Inst) ||
173 SIInstrInfo::isFLATScratch(MI: Inst);
174}
175
176#ifndef NDEBUG
177static bool isNormalMode(InstCounterType MaxCounter) {
178 return MaxCounter == NUM_NORMAL_INST_CNTS;
179}
180#endif // NDEBUG
181
182VmemType getVmemType(const MachineInstr &Inst) {
183 assert(updateVMCntOnly(Inst));
184 if (!SIInstrInfo::isMIMG(MI: Inst) && !SIInstrInfo::isVIMAGE(MI: Inst) &&
185 !SIInstrInfo::isVSAMPLE(MI: Inst))
186 return VMEM_NOSAMPLER;
187 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Inst.getOpcode());
188 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
189 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode);
190 return BaseInfo->BVH ? VMEM_BVH
191 : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
192}
193
194unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
195 switch (T) {
196 case LOAD_CNT:
197 return Wait.LoadCnt;
198 case EXP_CNT:
199 return Wait.ExpCnt;
200 case DS_CNT:
201 return Wait.DsCnt;
202 case STORE_CNT:
203 return Wait.StoreCnt;
204 case SAMPLE_CNT:
205 return Wait.SampleCnt;
206 case BVH_CNT:
207 return Wait.BvhCnt;
208 case KM_CNT:
209 return Wait.KmCnt;
210 default:
211 llvm_unreachable("bad InstCounterType");
212 }
213}
214
215void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
216 unsigned &WC = getCounterRef(Wait, T);
217 WC = std::min(a: WC, b: Count);
218}
219
220void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
221 getCounterRef(Wait, T) = ~0u;
222}
223
224unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
225 return getCounterRef(Wait, T);
226}
227
228// Mapping from event to counter according to the table masks.
229InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
230 for (auto T : inst_counter_types()) {
231 if (masks[T] & (1 << E))
232 return T;
233 }
234 llvm_unreachable("event type has no associated counter");
235}
236
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change a
// wait counter, so that we know whether multiple event types are in flight
// within the brackets. When multiple event types are pending, the counter may
// get decremented out of order, so a conservative "s_waitcnt 0" must be
// emitted before use.
245class WaitcntBrackets {
246public:
247 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
248 HardwareLimits Limits, RegisterEncoding Encoding,
249 const unsigned *WaitEventMaskForInst,
250 InstCounterType SmemAccessCounter)
251 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
252 Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
253 SmemAccessCounter(SmemAccessCounter) {}
254
255 unsigned getWaitCountMax(InstCounterType T) const {
256 switch (T) {
257 case LOAD_CNT:
258 return Limits.LoadcntMax;
259 case DS_CNT:
260 return Limits.DscntMax;
261 case EXP_CNT:
262 return Limits.ExpcntMax;
263 case STORE_CNT:
264 return Limits.StorecntMax;
265 case SAMPLE_CNT:
266 return Limits.SamplecntMax;
267 case BVH_CNT:
268 return Limits.BvhcntMax;
269 case KM_CNT:
270 return Limits.KmcntMax;
271 default:
272 break;
273 }
274 return 0;
275 }
276
277 unsigned getScoreLB(InstCounterType T) const {
278 assert(T < NUM_INST_CNTS);
279 return ScoreLBs[T];
280 }
281
282 unsigned getScoreUB(InstCounterType T) const {
283 assert(T < NUM_INST_CNTS);
284 return ScoreUBs[T];
285 }
286
287 unsigned getScoreRange(InstCounterType T) const {
288 return getScoreUB(T) - getScoreLB(T);
289 }
290
291 unsigned getRegScore(int GprNo, InstCounterType T) const {
292 if (GprNo < NUM_ALL_VGPRS) {
293 return VgprScores[T][GprNo];
294 }
295 assert(T == SmemAccessCounter);
296 return SgprScores[GprNo - NUM_ALL_VGPRS];
297 }
298
299 bool merge(const WaitcntBrackets &Other);
300
301 RegInterval getRegInterval(const MachineInstr *MI,
302 const MachineRegisterInfo *MRI,
303 const SIRegisterInfo *TRI, unsigned OpNo) const;
304
305 bool counterOutOfOrder(InstCounterType T) const;
306 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
307 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
308 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
309 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
310 void applyWaitcnt(InstCounterType T, unsigned Count);
311 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
312 const MachineRegisterInfo *MRI, WaitEventType E,
313 MachineInstr &MI);
314
315 unsigned hasPendingEvent() const { return PendingEvents; }
316 unsigned hasPendingEvent(WaitEventType E) const {
317 return PendingEvents & (1 << E);
318 }
319 unsigned hasPendingEvent(InstCounterType T) const {
320 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
321 assert((HasPending != 0) == (getScoreRange(T) != 0));
322 return HasPending;
323 }
324
325 bool hasMixedPendingEvents(InstCounterType T) const {
326 unsigned Events = hasPendingEvent(T);
327 // Return true if more than one bit is set in Events.
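    // (Events & (Events - 1)) clears the lowest set bit, so the result is
    // nonzero exactly when two or more event bits are set.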
328 return Events & (Events - 1);
329 }
330
331 bool hasPendingFlat() const {
332 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
333 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
334 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
335 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
336 }
337
338 void setPendingFlat() {
339 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
340 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
341 }
342
343 // Return true if there might be pending writes to the specified vgpr by VMEM
344 // instructions with types different from V.
345 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
346 assert(GprNo < NUM_ALL_VGPRS);
347 return VgprVmemTypes[GprNo] & ~(1 << V);
348 }
349
350 void clearVgprVmemTypes(int GprNo) {
351 assert(GprNo < NUM_ALL_VGPRS);
352 VgprVmemTypes[GprNo] = 0;
353 }
354
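  // At a function boundary (entry, or return from a call), conservatively
  // assume the maximum number of stores may still be outstanding so that a
  // later wait on STORE_CNT/VScnt is not considered redundant.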
355 void setStateOnFunctionEntryOrReturn() {
356 setScoreUB(T: STORE_CNT, Val: getScoreUB(T: STORE_CNT) + getWaitCountMax(T: STORE_CNT));
357 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
358 }
359
360 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
361 return LDSDMAStores;
362 }
363
364 void print(raw_ostream &);
365 void dump() { print(dbgs()); }
366
367private:
368 struct MergeInfo {
369 unsigned OldLB;
370 unsigned OtherLB;
371 unsigned MyShift;
372 unsigned OtherShift;
373 };
374 static bool mergeScore(const MergeInfo &M, unsigned &Score,
375 unsigned OtherScore);
376
377 void setScoreLB(InstCounterType T, unsigned Val) {
378 assert(T < NUM_INST_CNTS);
379 ScoreLBs[T] = Val;
380 }
381
382 void setScoreUB(InstCounterType T, unsigned Val) {
383 assert(T < NUM_INST_CNTS);
384 ScoreUBs[T] = Val;
385
386 if (T != EXP_CNT)
387 return;
388
389 if (getScoreRange(T: EXP_CNT) > getWaitCountMax(T: EXP_CNT))
390 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(T: EXP_CNT);
391 }
392
393 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
394 if (GprNo < NUM_ALL_VGPRS) {
395 VgprUB = std::max(a: VgprUB, b: GprNo);
396 VgprScores[T][GprNo] = Val;
397 } else {
398 assert(T == SmemAccessCounter);
399 SgprUB = std::max(a: SgprUB, b: GprNo - NUM_ALL_VGPRS);
400 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
401 }
402 }
403
404 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
405 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
406 unsigned OpNo, unsigned Val);
407
408 const GCNSubtarget *ST = nullptr;
409 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
410 HardwareLimits Limits = {};
411 RegisterEncoding Encoding = {};
412 const unsigned *WaitEventMaskForInst;
413 InstCounterType SmemAccessCounter;
414 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
415 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
416 unsigned PendingEvents = 0;
417 // Remember the last flat memory operation.
418 unsigned LastFlat[NUM_INST_CNTS] = {0};
419 // wait_cnt scores for every vgpr.
420 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
421 int VgprUB = -1;
422 int SgprUB = -1;
423 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
424 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
425 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
426 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
427 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
428 // write to each vgpr.
429 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
430 // Store representative LDS DMA operations. The only useful info here is
431 // alias info. One store is kept per unique AAInfo.
432 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
433};
434
435// This abstracts the logic for generating and updating S_WAIT* instructions
436// away from the analysis that determines where they are needed. This was
437// done because the set of counters and instructions for waiting on them
438// underwent a major shift with gfx12, sufficiently so that having this
439// abstraction allows the main analysis logic to be simpler than it would
440// otherwise have had to become.
441class WaitcntGenerator {
442protected:
443 const GCNSubtarget *ST = nullptr;
444 const SIInstrInfo *TII = nullptr;
445 AMDGPU::IsaVersion IV;
446 InstCounterType MaxCounter;
447
448public:
449 WaitcntGenerator() {}
450 WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
451 : ST(ST), TII(ST->getInstrInfo()),
452 IV(AMDGPU::getIsaVersion(GPU: ST->getCPU())), MaxCounter(MaxCounter) {}
453
454 // Edits an existing sequence of wait count instructions according
455 // to an incoming Waitcnt value, which is itself updated to reflect
456 // any new wait count instructions which may need to be generated by
457 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
458 // were made.
459 //
  // This editing will usually merely update operands, but it may also
461 // delete instructions if the incoming Wait value indicates they are not
462 // needed. It may also remove existing instructions for which a wait
463 // is needed if it can be determined that it is better to generate new
464 // instructions later, as can happen on gfx12.
465 virtual bool
466 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
467 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
468 MachineBasicBlock::instr_iterator It) const = 0;
469
470 // Transform a soft waitcnt into a normal one.
471 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
472
473 // Generates new wait count instructions according to the value of
474 // Wait, returning true if any new instructions were created.
475 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
476 MachineBasicBlock::instr_iterator It,
477 AMDGPU::Waitcnt Wait) = 0;
478
479 // Returns an array of bit masks which can be used to map values in
480 // WaitEventType to corresponding counter values in InstCounterType.
481 virtual const unsigned *getWaitEventMask() const = 0;
482
483 // Returns a new waitcnt with all counters except VScnt set to 0. If
484 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
485 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
486
487 virtual ~WaitcntGenerator() = default;
488
489 // Create a mask value from the initializer list of wait event types.
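  // For example, eventMask({SMEM_ACCESS, SQ_MESSAGE}) yields
  // (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE).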
490 static constexpr unsigned
491 eventMask(std::initializer_list<WaitEventType> Events) {
492 unsigned Mask = 0;
493 for (auto &E : Events)
494 Mask |= 1 << E;
495
496 return Mask;
497 }
498};
499
500class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
501public:
502 WaitcntGeneratorPreGFX12() {}
503 WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
504 : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
505
506 bool
507 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
508 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
509 MachineBasicBlock::instr_iterator It) const override;
510
511 bool createNewWaitcnt(MachineBasicBlock &Block,
512 MachineBasicBlock::instr_iterator It,
513 AMDGPU::Waitcnt Wait) override;
514
515 const unsigned *getWaitEventMask() const override {
516 assert(ST);
517
518 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
519 eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
520 VMEM_BVH_READ_ACCESS}),
521 eventMask(Events: {SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
522 eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
523 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
524 eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
525 0,
526 0,
527 0};
528
529 return WaitEventMaskForInstPreGFX12;
530 }
531
532 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
533};
534
535class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
536public:
537 WaitcntGeneratorGFX12Plus() {}
538 WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
539 : WaitcntGenerator(ST, MaxCounter) {}
540
541 bool
542 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
543 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
544 MachineBasicBlock::instr_iterator It) const override;
545
546 bool createNewWaitcnt(MachineBasicBlock &Block,
547 MachineBasicBlock::instr_iterator It,
548 AMDGPU::Waitcnt Wait) override;
549
550 const unsigned *getWaitEventMask() const override {
551 assert(ST);
552
553 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
554 eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS}),
555 eventMask(Events: {LDS_ACCESS, GDS_ACCESS}),
556 eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
557 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
558 eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
559 eventMask(Events: {VMEM_SAMPLER_READ_ACCESS}),
560 eventMask(Events: {VMEM_BVH_READ_ACCESS}),
561 eventMask(Events: {SMEM_ACCESS, SQ_MESSAGE})};
562
563 return WaitEventMaskForInstGFX12Plus;
564 }
565
566 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
567};
568
569class SIInsertWaitcnts : public MachineFunctionPass {
570private:
571 const GCNSubtarget *ST = nullptr;
572 const SIInstrInfo *TII = nullptr;
573 const SIRegisterInfo *TRI = nullptr;
574 const MachineRegisterInfo *MRI = nullptr;
575
576 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
577 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
578 MachineLoopInfo *MLI;
579 MachinePostDominatorTree *PDT;
580 AliasAnalysis *AA = nullptr;
581
582 struct BlockInfo {
583 std::unique_ptr<WaitcntBrackets> Incoming;
584 bool Dirty = true;
585 };
586
587 InstCounterType SmemAccessCounter;
588
589 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
590
  // ForceEmitZeroWaitcnts: force all waitcnt instructions to be emitted as
  // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
593 bool ForceEmitZeroWaitcnts;
594 bool ForceEmitWaitcnt[NUM_INST_CNTS];
595
596 bool OptNone;
597
  // In any given run of this pass, WCG will point to one of these two
  // generator objects, which must have been re-initialised before use with a
  // value constructed for the current subtarget.
601 WaitcntGeneratorPreGFX12 WCGPreGFX12;
602 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
603
604 WaitcntGenerator *WCG = nullptr;
605
606 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
607 // message.
608 DenseSet<MachineInstr *> ReleaseVGPRInsts;
609
610 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
611
612public:
613 static char ID;
614
615 SIInsertWaitcnts() : MachineFunctionPass(ID) {
616 (void)ForceExpCounter;
617 (void)ForceLgkmCounter;
618 (void)ForceVMCounter;
619 }
620
621 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
622 bool isPreheaderToFlush(MachineBasicBlock &MBB,
623 WaitcntBrackets &ScoreBrackets);
624 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
625 bool runOnMachineFunction(MachineFunction &MF) override;
626
627 StringRef getPassName() const override {
628 return "SI insert wait instructions";
629 }
630
631 void getAnalysisUsage(AnalysisUsage &AU) const override {
632 AU.setPreservesCFG();
633 AU.addRequired<MachineLoopInfo>();
634 AU.addRequired<MachinePostDominatorTree>();
635 AU.addUsedIfAvailable<AAResultsWrapperPass>();
636 AU.addPreserved<AAResultsWrapperPass>();
637 MachineFunctionPass::getAnalysisUsage(AU);
638 }
639
640 bool isForceEmitWaitcnt() const {
641 for (auto T : inst_counter_types())
642 if (ForceEmitWaitcnt[T])
643 return true;
644 return false;
645 }
646
647 void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, query the debug counters and adjust as needed.
650#ifndef NDEBUG
651 if (DebugCounter::isCounterSet(ID: ForceExpCounter) &&
652 DebugCounter::shouldExecute(CounterName: ForceExpCounter)) {
653 ForceEmitWaitcnt[EXP_CNT] = true;
654 } else {
655 ForceEmitWaitcnt[EXP_CNT] = false;
656 }
657
658 if (DebugCounter::isCounterSet(ID: ForceLgkmCounter) &&
659 DebugCounter::shouldExecute(CounterName: ForceLgkmCounter)) {
660 ForceEmitWaitcnt[DS_CNT] = true;
661 ForceEmitWaitcnt[KM_CNT] = true;
662 } else {
663 ForceEmitWaitcnt[DS_CNT] = false;
664 ForceEmitWaitcnt[KM_CNT] = false;
665 }
666
667 if (DebugCounter::isCounterSet(ID: ForceVMCounter) &&
668 DebugCounter::shouldExecute(CounterName: ForceVMCounter)) {
669 ForceEmitWaitcnt[LOAD_CNT] = true;
670 ForceEmitWaitcnt[SAMPLE_CNT] = true;
671 ForceEmitWaitcnt[BVH_CNT] = true;
672 } else {
673 ForceEmitWaitcnt[LOAD_CNT] = false;
674 ForceEmitWaitcnt[SAMPLE_CNT] = false;
675 ForceEmitWaitcnt[BVH_CNT] = false;
676 }
677#endif // NDEBUG
678 }
679
680 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
681 // FLAT instruction.
682 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
683 // Maps VMEM access types to their corresponding WaitEventType.
684 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
685 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
686
687 assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
688 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
689 // these should use VM_CNT.
690 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(MI: Inst))
691 return VMEM_ACCESS;
692 if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(MI: Inst)) {
693 // FLAT and SCRATCH instructions may access scratch. Other VMEM
694 // instructions do not.
695 if (SIInstrInfo::isFLAT(MI: Inst) && mayAccessScratchThroughFlat(MI: Inst))
696 return SCRATCH_WRITE_ACCESS;
697 return VMEM_WRITE_ACCESS;
698 }
699 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(MI: Inst))
700 return VMEM_READ_ACCESS;
701 return VmemReadMapping[getVmemType(Inst)];
702 }
703
704 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
705 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
706 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
707 bool generateWaitcntInstBefore(MachineInstr &MI,
708 WaitcntBrackets &ScoreBrackets,
709 MachineInstr *OldWaitcntInstr,
710 bool FlushVmCnt);
711 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
712 MachineBasicBlock::instr_iterator It,
713 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
714 MachineInstr *OldWaitcntInstr);
715 void updateEventWaitcntAfter(MachineInstr &Inst,
716 WaitcntBrackets *ScoreBrackets);
717 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
718 WaitcntBrackets &ScoreBrackets);
719};
720
721} // end anonymous namespace
722
723RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
724 const MachineRegisterInfo *MRI,
725 const SIRegisterInfo *TRI,
726 unsigned OpNo) const {
727 const MachineOperand &Op = MI->getOperand(i: OpNo);
728 if (!TRI->isInAllocatableClass(Op.getReg()))
729 return {-1, -1};
730
  // A use via a partial-write (PW) operand does not need a waitcnt.
  // A partial write is not a WAW hazard.
733 assert(!Op.getSubReg() || !Op.isUndef());
734
735 RegInterval Result;
736
737 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
738 AMDGPU::HWEncoding::REG_IDX_MASK;
739
740 if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) {
741 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
742 Result.first = Reg - Encoding.VGPR0;
743 if (TRI->isAGPR(MRI: *MRI, Reg: Op.getReg()))
744 Result.first += AGPR_OFFSET;
745 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
746 } else if (TRI->isSGPRReg(MRI: *MRI, Reg: Op.getReg())) {
747 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
748 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
749 assert(Result.first >= NUM_ALL_VGPRS &&
750 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
751 }
752 // TODO: Handle TTMP
753 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
754 else
755 return {-1, -1};
756
757 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
758 unsigned Size = TRI->getRegSizeInBits(*RC);
759 Result.second = Result.first + ((Size + 16) / 32);
760
761 return Result;
762}
763
764void WaitcntBrackets::setExpScore(const MachineInstr *MI,
765 const SIInstrInfo *TII,
766 const SIRegisterInfo *TRI,
767 const MachineRegisterInfo *MRI, unsigned OpNo,
768 unsigned Val) {
769 RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
770 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
771 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
772 setRegScore(GprNo: RegNo, T: EXP_CNT, Val);
773 }
774}
775
776void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
777 const SIRegisterInfo *TRI,
778 const MachineRegisterInfo *MRI,
779 WaitEventType E, MachineInstr &Inst) {
780 InstCounterType T = eventCounter(masks: WaitEventMaskForInst, E);
781
782 unsigned UB = getScoreUB(T);
783 unsigned CurrScore = UB + 1;
784 if (CurrScore == 0)
785 report_fatal_error(reason: "InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not. For example, vm_cnt is
  // affected by a buffer store and lgkm_cnt by a send-message even when no
  // register score changes.
789 PendingEvents |= 1 << E;
790 setScoreUB(T, Val: CurrScore);
791
792 if (T == EXP_CNT) {
793 // Put score on the source vgprs. If this is a store, just use those
794 // specific register(s).
795 if (TII->isDS(MI: Inst) && (Inst.mayStore() || Inst.mayLoad())) {
796 int AddrOpIdx =
797 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
798 // All GDS operations must protect their address register (same as
799 // export.)
800 if (AddrOpIdx != -1) {
801 setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: AddrOpIdx, Val: CurrScore);
802 }
803
804 if (Inst.mayStore()) {
805 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
806 setExpScore(
807 &Inst, TII, TRI, MRI,
808 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
809 CurrScore);
810 }
811 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
812 setExpScore(&Inst, TII, TRI, MRI,
813 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
814 AMDGPU::OpName::data1),
815 CurrScore);
816 }
817 } else if (SIInstrInfo::isAtomicRet(MI: Inst) && !SIInstrInfo::isGWS(MI: Inst) &&
818 Inst.getOpcode() != AMDGPU::DS_APPEND &&
819 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
820 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
821 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
822 const MachineOperand &Op = Inst.getOperand(i: I);
823 if (Op.isReg() && !Op.isDef() &&
824 TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) {
825 setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: I, Val: CurrScore);
826 }
827 }
828 }
829 } else if (TII->isFLAT(MI: Inst)) {
830 if (Inst.mayStore()) {
831 setExpScore(
832 &Inst, TII, TRI, MRI,
833 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
834 CurrScore);
835 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
836 setExpScore(
837 &Inst, TII, TRI, MRI,
838 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
839 CurrScore);
840 }
841 } else if (TII->isMIMG(MI: Inst)) {
842 if (Inst.mayStore()) {
843 setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore);
844 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
845 setExpScore(
846 &Inst, TII, TRI, MRI,
847 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
848 CurrScore);
849 }
850 } else if (TII->isMTBUF(MI: Inst)) {
851 if (Inst.mayStore()) {
852 setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore);
853 }
854 } else if (TII->isMUBUF(MI: Inst)) {
855 if (Inst.mayStore()) {
856 setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore);
857 } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
858 setExpScore(
859 &Inst, TII, TRI, MRI,
860 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
861 CurrScore);
862 }
863 } else if (TII->isLDSDIR(MI: Inst)) {
864 // LDSDIR instructions attach the score to the destination.
865 setExpScore(
866 &Inst, TII, TRI, MRI,
867 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
868 CurrScore);
869 } else {
870 if (TII->isEXP(MI: Inst)) {
871 // For export the destination registers are really temps that
872 // can be used as the actual source after export patching, so
873 // we need to treat them like sources and set the EXP_CNT
874 // score.
875 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
876 MachineOperand &DefMO = Inst.getOperand(i: I);
877 if (DefMO.isReg() && DefMO.isDef() &&
878 TRI->isVGPR(MRI: *MRI, Reg: DefMO.getReg())) {
879 setRegScore(
880 GprNo: TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
881 T: EXP_CNT, Val: CurrScore);
882 }
883 }
884 }
885 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
886 MachineOperand &MO = Inst.getOperand(i: I);
887 if (MO.isReg() && !MO.isDef() &&
888 TRI->isVectorRegister(MRI: *MRI, Reg: MO.getReg())) {
889 setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: I, Val: CurrScore);
890 }
891 }
892 }
893#if 0 // TODO: check if this is handled by MUBUF code above.
894 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
895 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
896 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
897 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
898 unsigned OpNo;//TODO: find the OpNo for this operand;
899 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo);
900 for (int RegNo = Interval.first; RegNo < Interval.second;
901 ++RegNo) {
902 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
903 }
904#endif
905 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
906 // Match the score to the destination registers.
907 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
908 auto &Op = Inst.getOperand(i: I);
909 if (!Op.isReg() || !Op.isDef())
910 continue;
911 RegInterval Interval = getRegInterval(MI: &Inst, MRI, TRI, OpNo: I);
912 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
913 if (Interval.first >= NUM_ALL_VGPRS)
914 continue;
915 if (updateVMCntOnly(Inst)) {
          // updateVMCntOnly should only leave us with VGPRs:
          // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have
          // VGPR/AGPR defs, which is required for a sane index into
          // `VgprVmemTypes` below.
919 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
920 VmemType V = getVmemType(Inst);
921 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
922 VgprVmemTypes[RegNo] |= 1 << V;
923 }
924 }
925 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
926 setRegScore(GprNo: RegNo, T, Val: CurrScore);
927 }
928 }
929 if (Inst.mayStore() &&
930 (TII->isDS(MI: Inst) || TII->mayWriteLDSThroughDMA(MI: Inst))) {
      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
      // they have written can be accessed. A load from LDS to VMEM does not
      // need a wait.
933 unsigned Slot = 0;
934 for (const auto *MemOp : Inst.memoperands()) {
935 if (!MemOp->isStore() ||
936 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
937 continue;
938 // Comparing just AA info does not guarantee memoperands are equal
939 // in general, but this is so for LDS DMA in practice.
940 auto AAI = MemOp->getAAInfo();
        // Alias scope information gives a way to definitively identify the
        // original memory object; in practice it is produced by the module LDS
        // lowering pass. If no scope is available we will not be able to
        // disambiguate LDS aliasing, because after module lowering all LDS is
        // squashed into a single big object. Do not waste one of the limited
        // LDSDMAStores slots on something we will not be able to use anyway.
948 if (!AAI || !AAI.Scope)
949 break;
950 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
951 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
952 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
953 Slot = I + 1;
954 break;
955 }
956 }
957 }
958 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
959 break;
960 LDSDMAStores.push_back(Elt: &Inst);
961 Slot = LDSDMAStores.size();
962 break;
963 }
964 setRegScore(GprNo: SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, Val: CurrScore);
965 if (Slot)
966 setRegScore(GprNo: SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, Val: CurrScore);
967 }
968 }
969}
970
971void WaitcntBrackets::print(raw_ostream &OS) {
972 OS << '\n';
973 for (auto T : inst_counter_types(MaxCounter)) {
974 unsigned SR = getScoreRange(T);
975
976 switch (T) {
977 case LOAD_CNT:
978 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
979 << SR << "): ";
980 break;
981 case DS_CNT:
982 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
983 << SR << "): ";
984 break;
985 case EXP_CNT:
986 OS << " EXP_CNT(" << SR << "): ";
987 break;
988 case STORE_CNT:
989 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
990 << SR << "): ";
991 break;
992 case SAMPLE_CNT:
993 OS << " SAMPLE_CNT(" << SR << "): ";
994 break;
995 case BVH_CNT:
996 OS << " BVH_CNT(" << SR << "): ";
997 break;
998 case KM_CNT:
999 OS << " KM_CNT(" << SR << "): ";
1000 break;
1001 default:
1002 OS << " UNKNOWN(" << SR << "): ";
1003 break;
1004 }
1005
1006 if (SR != 0) {
1007 // Print vgpr scores.
1008 unsigned LB = getScoreLB(T);
1009
1010 for (int J = 0; J <= VgprUB; J++) {
1011 unsigned RegScore = getRegScore(GprNo: J, T);
1012 if (RegScore <= LB)
1013 continue;
1014 unsigned RelScore = RegScore - LB - 1;
1015 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
1016 OS << RelScore << ":v" << J << " ";
1017 } else {
1018 OS << RelScore << ":ds ";
1019 }
1020 }
1021 // Also need to print sgpr scores for lgkm_cnt.
1022 if (T == SmemAccessCounter) {
1023 for (int J = 0; J <= SgprUB; J++) {
1024 unsigned RegScore = getRegScore(GprNo: J + NUM_ALL_VGPRS, T);
1025 if (RegScore <= LB)
1026 continue;
1027 unsigned RelScore = RegScore - LB - 1;
1028 OS << RelScore << ":s" << J << " ";
1029 }
1030 }
1031 }
1032 OS << '\n';
1033 }
1034 OS << '\n';
1035}
1036
/// Simplify the waitcnt, in the sense of removing redundant counts: any count
/// already guaranteed by the current score brackets is reset to ~0u (no wait).
1039void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1040 simplifyWaitcnt(T: LOAD_CNT, Count&: Wait.LoadCnt);
1041 simplifyWaitcnt(T: EXP_CNT, Count&: Wait.ExpCnt);
1042 simplifyWaitcnt(T: DS_CNT, Count&: Wait.DsCnt);
1043 simplifyWaitcnt(T: STORE_CNT, Count&: Wait.StoreCnt);
1044 simplifyWaitcnt(T: SAMPLE_CNT, Count&: Wait.SampleCnt);
1045 simplifyWaitcnt(T: BVH_CNT, Count&: Wait.BvhCnt);
1046 simplifyWaitcnt(T: KM_CNT, Count&: Wait.KmCnt);
1047}
1048
1049void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1050 unsigned &Count) const {
1051 // The number of outstanding events for this type, T, can be calculated
1052 // as (UB - LB). If the current Count is greater than or equal to the number
1053 // of outstanding events, then the wait for this counter is redundant.
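  // For example, if only three loads are outstanding, a wait of loadcnt(3) or
  // more is already satisfied and is dropped (reset to ~0u); loadcnt(2) or
  // less is kept.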
1054 if (Count >= getScoreRange(T))
1055 Count = ~0u;
1056}
1057
1058void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
1059 AMDGPU::Waitcnt &Wait) const {
1060 unsigned ScoreToWait = getRegScore(GprNo: RegNo, T);
1061
1062 // If the score of src_operand falls within the bracket, we need an
1063 // s_waitcnt instruction.
1064 const unsigned LB = getScoreLB(T);
1065 const unsigned UB = getScoreUB(T);
1066 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1067 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1068 !ST->hasFlatLgkmVMemCountInOrder()) {
1069 // If there is a pending FLAT operation, and this is a VMem or LGKM
1070 // waitcnt and the target can report early completion, then we need
1071 // to force a waitcnt 0.
1072 addWait(Wait, T, Count: 0);
1073 } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are multiple
      // event types in the bracket, so emit a wait with a conservative value
      // of 0 for this counter.
1077 addWait(Wait, T, Count: 0);
1078 } else {
1079 // If a counter has been maxed out avoid overflow by waiting for
1080 // MAX(CounterType) - 1 instead.
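      // For example (illustrative values): with UB == 10 and ScoreToWait == 7
      // there are three younger accesses in flight, so waiting until the
      // counter drops to 3 guarantees the scored access has completed.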
1081 unsigned NeededWait = std::min(a: UB - ScoreToWait, b: getWaitCountMax(T) - 1);
1082 addWait(Wait, T, Count: NeededWait);
1083 }
1084 }
1085}
1086
1087void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1088 applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1089 applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt);
1090 applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1091 applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1092 applyWaitcnt(T: SAMPLE_CNT, Count: Wait.SampleCnt);
1093 applyWaitcnt(T: BVH_CNT, Count: Wait.BvhCnt);
1094 applyWaitcnt(T: KM_CNT, Count: Wait.KmCnt);
1095}
1096
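// Apply a wait on counter T: raise the lower bound so that everything with a
// score of at most (UB - Count) is treated as complete. For example
// (illustrative), applying loadcnt(2) with UB == 10 raises the lower bound to
// 8; a wait of 0 additionally clears all pending events for T. A nonzero wait
// is ignored if the counter may complete out of order.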
1097void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1098 const unsigned UB = getScoreUB(T);
1099 if (Count >= UB)
1100 return;
1101 if (Count != 0) {
1102 if (counterOutOfOrder(T))
1103 return;
1104 setScoreLB(T, Val: std::max(a: getScoreLB(T), b: UB - Count));
1105 } else {
1106 setScoreLB(T, Val: UB);
1107 PendingEvents &= ~WaitEventMaskForInst[T];
1108 }
1109}
1110
1111// Where there are multiple types of event in the bracket of a counter,
1112// the decrement may go out of order.
1113bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always complete out of order.
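  // (For example, with the legacy LGKM counter an SMEM load and an LDS access
  // in flight together may retire in either order.)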
1115 if (T == SmemAccessCounter && hasPendingEvent(E: SMEM_ACCESS))
1116 return true;
1117 return hasMixedPendingEvents(T);
1118}
1119
1120INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1121 false)
1122INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
1123INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
1124INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1125 false)
1126
1127char SIInsertWaitcnts::ID = 0;
1128
1129char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1130
1131FunctionPass *llvm::createSIInsertWaitcntsPass() {
1132 return new SIInsertWaitcnts();
1133}
1134
1135static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
1136 unsigned NewEnc) {
1137 int OpIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: OpName);
1138 assert(OpIdx >= 0);
1139
1140 MachineOperand &MO = MI.getOperand(i: OpIdx);
1141
1142 if (NewEnc == MO.getImm())
1143 return false;
1144
1145 MO.setImm(NewEnc);
1146 return true;
1147}
1148
1149/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1150/// and if so, which counter it is waiting on.
1151static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1152 switch (Opcode) {
1153 case AMDGPU::S_WAIT_LOADCNT:
1154 return LOAD_CNT;
1155 case AMDGPU::S_WAIT_EXPCNT:
1156 return EXP_CNT;
1157 case AMDGPU::S_WAIT_STORECNT:
1158 return STORE_CNT;
1159 case AMDGPU::S_WAIT_SAMPLECNT:
1160 return SAMPLE_CNT;
1161 case AMDGPU::S_WAIT_BVHCNT:
1162 return BVH_CNT;
1163 case AMDGPU::S_WAIT_DSCNT:
1164 return DS_CNT;
1165 case AMDGPU::S_WAIT_KMCNT:
1166 return KM_CNT;
1167 default:
1168 return {};
1169 }
1170}
1171
1172bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1173 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Waitcnt->getOpcode());
1174 if (Opcode == Waitcnt->getOpcode())
1175 return false;
1176
1177 Waitcnt->setDesc(TII->get(Opcode));
1178 return true;
1179}
1180
1181/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1182/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1183/// from \p Wait that were added by previous passes. Currently this pass
1184/// conservatively assumes that these preexisting waits are required for
1185/// correctness.
1186bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1187 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1188 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1189 assert(ST);
1190 assert(isNormalMode(MaxCounter));
1191
1192 bool Modified = false;
1193 MachineInstr *WaitcntInstr = nullptr;
1194 MachineInstr *WaitcntVsCntInstr = nullptr;
1195
1196 for (auto &II :
1197 make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
1198 if (II.isMetaInstruction())
1199 continue;
1200
1201 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
1202 bool IsSoft = Opcode != II.getOpcode();
1203
1204 // Update required wait count. If this is a soft waitcnt (= it was added
1205 // by an earlier pass), it may be entirely removed.
1206 if (Opcode == AMDGPU::S_WAITCNT) {
1207 unsigned IEnc = II.getOperand(i: 0).getImm();
1208 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(Version: IV, Encoded: IEnc);
1209 if (IsSoft)
1210 ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1211 Wait = Wait.combined(Other: OldWait);
1212
1213 // Merge consecutive waitcnt of the same type by erasing multiples.
1214 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
1215 II.eraseFromParent();
1216 Modified = true;
1217 } else
1218 WaitcntInstr = &II;
1219 } else {
1220 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1221 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1222
1223 unsigned OldVSCnt =
1224 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1225 if (IsSoft)
1226 ScoreBrackets.simplifyWaitcnt(T: InstCounterType::STORE_CNT, Count&: OldVSCnt);
1227 Wait.StoreCnt = std::min(a: Wait.StoreCnt, b: OldVSCnt);
1228
1229 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
1230 II.eraseFromParent();
1231 Modified = true;
1232 } else
1233 WaitcntVsCntInstr = &II;
1234 }
1235 }
1236
1237 if (WaitcntInstr) {
1238 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1239 AMDGPU::encodeWaitcnt(IV, Wait));
1240 Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntInstr);
1241
1242 ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1243 ScoreBrackets.applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt);
1244 ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1245 Wait.LoadCnt = ~0u;
1246 Wait.ExpCnt = ~0u;
1247 Wait.DsCnt = ~0u;
1248
1249 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1250 ? dbgs()
1251 << "applyPreexistingWaitcnt\n"
1252 << "New Instr at block end: " << *WaitcntInstr << '\n'
1253 : dbgs() << "applyPreexistingWaitcnt\n"
1254 << "Old Instr: " << *It
1255 << "New Instr: " << *WaitcntInstr << '\n');
1256 }
1257
1258 if (WaitcntVsCntInstr) {
1259 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1260 AMDGPU::OpName::simm16, Wait.StoreCnt);
1261 Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntVsCntInstr);
1262
1263 ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1264 Wait.StoreCnt = ~0u;
1265
1266 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1267 ? dbgs() << "applyPreexistingWaitcnt\n"
1268 << "New Instr at block end: " << *WaitcntVsCntInstr
1269 << '\n'
1270 : dbgs() << "applyPreexistingWaitcnt\n"
1271 << "Old Instr: " << *It
1272 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1273 }
1274
1275 return Modified;
1276}
1277
1278/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1279/// required counters in \p Wait
1280bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1281 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1282 AMDGPU::Waitcnt Wait) {
1283 assert(ST);
1284 assert(isNormalMode(MaxCounter));
1285
1286 bool Modified = false;
1287 const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
1288
  // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1290 // single instruction while VScnt has its own instruction.
1291 if (Wait.hasWaitExceptStoreCnt()) {
1292 unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
1293 [[maybe_unused]] auto SWaitInst =
1294 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1295 Modified = true;
1296
1297 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1298 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1299 dbgs() << "New Instr: " << *SWaitInst << '\n');
1300 }
1301
1302 if (Wait.hasWaitStoreCnt()) {
1303 assert(ST->hasVscnt());
1304
1305 [[maybe_unused]] auto SWaitInst =
1306 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1307 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1308 .addImm(Wait.StoreCnt);
1309 Modified = true;
1310
1311 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1312 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1313 dbgs() << "New Instr: " << *SWaitInst << '\n');
1314 }
1315
1316 return Modified;
1317}
1318
1319AMDGPU::Waitcnt
1320WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1321 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1322}
1323
1324AMDGPU::Waitcnt
1325WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1326 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
1327}
1328
1329/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1330/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1331/// were added by previous passes. Currently this pass conservatively
1332/// assumes that these preexisting waits are required for correctness.
1333bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1334 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1335 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1336 assert(ST);
1337 assert(!isNormalMode(MaxCounter));
1338
1339 bool Modified = false;
1340 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1341 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1342 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1343
1344 for (auto &II :
1345 make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
1346 if (II.isMetaInstruction())
1347 continue;
1348
1349 MachineInstr **UpdatableInstr;
1350
1351 // Update required wait count. If this is a soft waitcnt (= it was added
1352 // by an earlier pass), it may be entirely removed.
1353
1354 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
1355 bool IsSoft = Opcode != II.getOpcode();
1356
1357 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1358 unsigned OldEnc =
1359 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1360 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(Version: IV, LoadcntDscnt: OldEnc);
1361 if (IsSoft)
1362 ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1363 Wait = Wait.combined(Other: OldWait);
1364 UpdatableInstr = &CombinedLoadDsCntInstr;
1365 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1366 unsigned OldEnc =
1367 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1368 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(Version: IV, StorecntDscnt: OldEnc);
1369 if (IsSoft)
1370 ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
1371 Wait = Wait.combined(Other: OldWait);
1372 UpdatableInstr = &CombinedStoreDsCntInstr;
1373 } else {
1374 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1375 assert(CT.has_value());
1376 unsigned OldCnt =
1377 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1378 if (IsSoft)
1379 ScoreBrackets.simplifyWaitcnt(T: CT.value(), Count&: OldCnt);
1380 addWait(Wait, T: CT.value(), Count: OldCnt);
1381 UpdatableInstr = &WaitInstrs[CT.value()];
1382 }
1383
1384 // Merge consecutive waitcnt of the same type by erasing multiples.
1385 if (!*UpdatableInstr) {
1386 *UpdatableInstr = &II;
1387 } else {
1388 II.eraseFromParent();
1389 Modified = true;
1390 }
1391 }
1392
1393 if (CombinedLoadDsCntInstr) {
1394 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1395 // to be waited for. Otherwise, let the instruction be deleted so
1396 // the appropriate single counter wait instruction can be inserted
1397 // instead, when new S_WAIT_*CNT instructions are inserted by
1398 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1399 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1400 // the loop below that deals with single counter instructions.
1401 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1402 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
1403 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1404 AMDGPU::OpName::simm16, NewEnc);
1405 Modified |= promoteSoftWaitCnt(Waitcnt: CombinedLoadDsCntInstr);
1406 ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
1407 ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1408 Wait.LoadCnt = ~0u;
1409 Wait.DsCnt = ~0u;
1410
1411 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1412 ? dbgs() << "applyPreexistingWaitcnt\n"
1413 << "New Instr at block end: "
1414 << *CombinedLoadDsCntInstr << '\n'
1415 : dbgs() << "applyPreexistingWaitcnt\n"
1416 << "Old Instr: " << *It << "New Instr: "
1417 << *CombinedLoadDsCntInstr << '\n');
1418 } else {
1419 CombinedLoadDsCntInstr->eraseFromParent();
1420 Modified = true;
1421 }
1422 }
1423
1424 if (CombinedStoreDsCntInstr) {
1425 // Similarly for S_WAIT_STORECNT_DSCNT.
1426 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1427 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
1428 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1429 AMDGPU::OpName::simm16, NewEnc);
1430 Modified |= promoteSoftWaitCnt(Waitcnt: CombinedStoreDsCntInstr);
1431 ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
1432 ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
1433 Wait.StoreCnt = ~0u;
1434 Wait.DsCnt = ~0u;
1435
1436 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1437 ? dbgs() << "applyPreexistingWaitcnt\n"
1438 << "New Instr at block end: "
1439 << *CombinedStoreDsCntInstr << '\n'
1440 : dbgs() << "applyPreexistingWaitcnt\n"
1441 << "Old Instr: " << *It << "New Instr: "
1442 << *CombinedStoreDsCntInstr << '\n');
1443 } else {
1444 CombinedStoreDsCntInstr->eraseFromParent();
1445 Modified = true;
1446 }
1447 }
1448
1449 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1450 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1451 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1452 // instructions so that createNewWaitcnt() will create new combined
1453 // instructions to replace them.
1454
1455 if (Wait.DsCnt != ~0u) {
    // Addresses of entries in WaitInstrs; the instructions those entries point
    // to should be erased if they are present.
1458 SmallVector<MachineInstr **, 2> WaitsToErase;
1459
1460 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1461 // both) need to be waited for, ensure that there are no existing
1462 // individual wait count instructions for these.
1463
1464 if (Wait.LoadCnt != ~0u) {
1465 WaitsToErase.push_back(Elt: &WaitInstrs[LOAD_CNT]);
1466 WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
1467 } else if (Wait.StoreCnt != ~0u) {
1468 WaitsToErase.push_back(Elt: &WaitInstrs[STORE_CNT]);
1469 WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
1470 }
1471
1472 for (MachineInstr **WI : WaitsToErase) {
1473 if (!*WI)
1474 continue;
1475
1476 (*WI)->eraseFromParent();
1477 *WI = nullptr;
1478 Modified = true;
1479 }
1480 }
1481
1482 for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
1483 if (!WaitInstrs[CT])
1484 continue;
1485
1486 unsigned NewCnt = getWait(Wait, T: CT);
1487 if (NewCnt != ~0u) {
1488 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1489 AMDGPU::OpName::simm16, NewCnt);
1490 Modified |= promoteSoftWaitCnt(Waitcnt: WaitInstrs[CT]);
1491
1492 ScoreBrackets.applyWaitcnt(T: CT, Count: NewCnt);
1493 setNoWait(Wait, T: CT);
1494
1495 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1496 ? dbgs() << "applyPreexistingWaitcnt\n"
1497 << "New Instr at block end: " << *WaitInstrs[CT]
1498 << '\n'
1499 : dbgs() << "applyPreexistingWaitcnt\n"
1500 << "Old Instr: " << *It
1501 << "New Instr: " << *WaitInstrs[CT] << '\n');
1502 } else {
1503 WaitInstrs[CT]->eraseFromParent();
1504 Modified = true;
1505 }
1506 }
1507
1508 return Modified;
1509}
1510
1511/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1512bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1513 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1514 AMDGPU::Waitcnt Wait) {
1515 assert(ST);
1516 assert(!isNormalMode(MaxCounter));
1517
1518 bool Modified = false;
1519 const DebugLoc &DL = Block.findDebugLoc(MBBI: It);
1520
1521 // Check for opportunities to use combined wait instructions.
1522 if (Wait.DsCnt != ~0u) {
1523 MachineInstr *SWaitInst = nullptr;
1524
1525 if (Wait.LoadCnt != ~0u) {
1526 unsigned Enc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
1527
1528 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1529 .addImm(Enc);
1530
1531 Wait.LoadCnt = ~0u;
1532 Wait.DsCnt = ~0u;
1533 } else if (Wait.StoreCnt != ~0u) {
1534 unsigned Enc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
1535
1536 SWaitInst =
1537 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1538 .addImm(Enc);
1539
1540 Wait.StoreCnt = ~0u;
1541 Wait.DsCnt = ~0u;
1542 }
1543
1544 if (SWaitInst) {
1545 Modified = true;
1546
1547 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1548 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1549 dbgs() << "New Instr: " << *SWaitInst << '\n');
1550 }
1551 }
1552
1553 // Generate an instruction for any remaining counter that needs
1554 // waiting for.
1555
1556 for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
1557 unsigned Count = getWait(Wait, T: CT);
1558 if (Count == ~0u)
1559 continue;
1560
1561 [[maybe_unused]] auto SWaitInst =
1562 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1563 .addImm(Count);
1564
1565 Modified = true;
1566
1567 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1568 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1569 dbgs() << "New Instr: " << *SWaitInst << '\n');
1570 }
1571
1572 return Modified;
1573}
1574
1575static bool readsVCCZ(const MachineInstr &MI) {
1576 unsigned Opc = MI.getOpcode();
1577 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1578 !MI.getOperand(1).isUndef();
1579}
1580
1581/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1582static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1583 // Currently all conventions wait, but this may not always be the case.
1584 //
1585 // TODO: If IPRA is enabled, and the callee is marked isSafeForNoCSROpt, it
1586 // may make sense to omit the wait and do it in the caller.
1587 return true;
1588}
1589
1590 /// \returns true if the callee is expected to wait for any outstanding memory
1591 /// operations before returning.
1592static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
1593 return true;
1594}
1595
1596 /// Generate the s_waitcnt instruction to be placed before \p MI.
1597 /// Results of instructions of a given type are returned in order,
1598 /// but instructions of different types can complete out of order.
1599 /// We rely on this in-order completion
1600 /// and simply assign a score to each memory access instruction.
1601 /// We keep track of the active "score bracket" to determine
1602 /// whether a memory access requires an s_waitcnt
1603 /// and, if so, what the value of each counter should be.
1604 /// The "score bracket" is bounded by the lower-bound and upper-bound
1605 /// scores (*_score_LB and *_score_ub respectively).
1606 /// If \p FlushVmCnt is true, we also want to generate an s_waitcnt to
1607 /// flush the vmcnt counter here.
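///
/// Illustrative example: if three VMEM loads are outstanding and the current
/// instruction needs the result of the oldest of them, the score bracket
/// yields a required count of 2 for that counter, i.e. wait until at most two
/// of those operations are still in flight.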
1608bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1609 WaitcntBrackets &ScoreBrackets,
1610 MachineInstr *OldWaitcntInstr,
1611 bool FlushVmCnt) {
1612 setForceEmitWaitcnt();
1613
1614 if (MI.isMetaInstruction())
1615 return false;
1616
1617 AMDGPU::Waitcnt Wait;
1618
1619 // FIXME: This should have already been handled by the memory legalizer.
1620 // Removing this currently doesn't affect any lit tests, but we need to
1621 // verify that nothing was relying on this. The number of buffer invalidates
1622 // being handled here should not be expanded.
1623 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1624 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1625 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1626 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1627 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1628 Wait.LoadCnt = 0;
1629 }
1630
1631 // All waits must be resolved at call return.
1632 // NOTE: this could be improved with knowledge of all call sites or
1633 // with knowledge of the called routines.
1634 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1635 MI.getOpcode() == AMDGPU::SI_RETURN ||
1636 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1637 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1638 Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1639 }
1640 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1641 // stores. In this case it can be useful to send a message to explicitly
1642 // release all VGPRs before the stores have completed, but it is only safe to
1643 // do this if:
1644 // * there are no outstanding scratch stores
1645 // * we are not in Dynamic VGPR mode
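// The DEALLOC_VGPRS message itself (and any required preceding S_NOP) is
// emitted later, in runOnMachineFunction, for the instructions collected in
// ReleaseVGPRInsts.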
1646 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1647 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1648 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
1649 ScoreBrackets.getScoreRange(T: STORE_CNT) != 0 &&
1650 !ScoreBrackets.hasPendingEvent(E: SCRATCH_WRITE_ACCESS))
1651 ReleaseVGPRInsts.insert(V: &MI);
1652 }
1653 // Resolve vm waits before gs-done.
1654 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1655 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1656 ST->hasLegacyGeometry() &&
1657 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1658 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1659 Wait.LoadCnt = 0;
1660 }
1661#if 0 // TODO: the following blocks of logic when we have fence.
1662 else if (MI.getOpcode() == SC_FENCE) {
1663 const unsigned int group_size =
1664 context->shader_info->GetMaxThreadGroupSize();
1665 // group_size == 0 means thread group size is unknown at compile time
1666 const bool group_is_multi_wave =
1667 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1668 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1669
1670 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1671 SCRegType src_type = Inst->GetSrcType(i);
1672 switch (src_type) {
1673 case SCMEM_LDS:
1674 if (group_is_multi_wave ||
1675 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1676 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1677 ScoreBrackets->getScoreUB(DS_CNT));
1678 // LDS may have to wait for VMcnt after buffer load to LDS
1679 if (target_info->HasBufferLoadToLDS()) {
1680 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1681 ScoreBrackets->getScoreUB(LOAD_CNT));
1682 }
1683 }
1684 break;
1685
1686 case SCMEM_GDS:
1687 if (group_is_multi_wave || fence_is_global) {
1688 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1689 ScoreBrackets->getScoreUB(EXP_CNT));
1690 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1691 ScoreBrackets->getScoreUB(DS_CNT));
1692 }
1693 break;
1694
1695 case SCMEM_UAV:
1696 case SCMEM_TFBUF:
1697 case SCMEM_RING:
1698 case SCMEM_SCATTER:
1699 if (group_is_multi_wave || fence_is_global) {
1700 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1701 ScoreBrackets->getScoreUB(EXP_CNT));
1702 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1703 ScoreBrackets->getScoreUB(LOAD_CNT));
1704 }
1705 break;
1706
1707 case SCMEM_SCRATCH:
1708 default:
1709 break;
1710 }
1711 }
1712 }
1713#endif
1714
1715 // Export & GDS instructions do not read the EXEC mask until after the export
1716 // is granted (which can occur well after the instruction is issued).
1717 // The shader program must wait for all outstanding EXP operations (tracked
1718 // by the export counter) before overwriting the EXEC mask.
1719 else {
1720 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1721 // Export and GDS are tracked individually, either may trigger a waitcnt
1722 // for EXEC.
1723 if (ScoreBrackets.hasPendingEvent(E: EXP_GPR_LOCK) ||
1724 ScoreBrackets.hasPendingEvent(E: EXP_PARAM_ACCESS) ||
1725 ScoreBrackets.hasPendingEvent(E: EXP_POS_ACCESS) ||
1726 ScoreBrackets.hasPendingEvent(E: GDS_GPR_LOCK)) {
1727 Wait.ExpCnt = 0;
1728 }
1729 }
1730
1731 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1732 // The function is going to insert a wait on everything in its prolog.
1733 // We still need to be careful if the call target is a load (e.g. a GOT
1734 // load), and we also need to check the WAW dependency with the saved PC.
1735 Wait = AMDGPU::Waitcnt();
1736
1737 int CallAddrOpIdx =
1738 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1739
1740 if (MI.getOperand(i: CallAddrOpIdx).isReg()) {
1741 RegInterval CallAddrOpInterval =
1742 ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: CallAddrOpIdx);
1743
1744 for (int RegNo = CallAddrOpInterval.first;
1745 RegNo < CallAddrOpInterval.second; ++RegNo)
1746 ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);
1747
1748 int RtnAddrOpIdx =
1749 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1750 if (RtnAddrOpIdx != -1) {
1751 RegInterval RtnAddrOpInterval =
1752 ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: RtnAddrOpIdx);
1753
1754 for (int RegNo = RtnAddrOpInterval.first;
1755 RegNo < RtnAddrOpInterval.second; ++RegNo)
1756 ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);
1757 }
1758 }
1759 } else {
1760 // FIXME: Should not be relying on memoperands.
1761 // Look at the source operands of every instruction to see if
1762 // any of them results from a previous memory operation that affects
1763 // its current usage. If so, an s_waitcnt instruction needs to be
1764 // emitted.
1765 // If the source operand was defined by a load, add the s_waitcnt
1766 // instruction.
1767 //
1768 // Two cases are handled for destination operands:
1769 // 1) If the destination operand was defined by a load, add the s_waitcnt
1770 // instruction to guarantee the right WAW order.
1771 // 2) If a destination operand was used by a recent export/store instruction,
1772 // add an s_waitcnt on exp_cnt to guarantee the WAR order.
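// Illustrative example: a VMEM load defining v0 followed by an instruction
// reading v0 is the RAW case (wait on the load counter); an export reading v1
// followed by an instruction redefining v1 is the WAR case (wait on exp_cnt).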
1773
1774 for (const MachineMemOperand *Memop : MI.memoperands()) {
1775 const Value *Ptr = Memop->getValue();
1776 if (Memop->isStore() && SLoadAddresses.count(Val: Ptr)) {
1777 addWait(Wait, T: SmemAccessCounter, Count: 0);
1778 if (PDT->dominates(A: MI.getParent(), B: SLoadAddresses.find(Val: Ptr)->second))
1779 SLoadAddresses.erase(Val: Ptr);
1780 }
1781 unsigned AS = Memop->getAddrSpace();
1782 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1783 continue;
1784 // No need to wait before load from VMEM to LDS.
1785 if (TII->mayWriteLDSThroughDMA(MI))
1786 continue;
1787
1788 // LOAD_CNT is only relevant to vgpr or LDS.
1789 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1790 bool FoundAliasingStore = false;
1791 // Only objects with alias scope info were added to the LDSDMAScopes array.
1792 // In the absence of the scope info we will not be able to disambiguate
1793 // aliasing here. There is no need to try searching for a corresponding
1794 // store slot. This is conservatively correct because in that case we
1795 // will produce a wait using the first (general) LDS DMA wait slot which
1796 // will wait on all of them anyway.
1797 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1798 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1799 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1800 if (MI.mayAlias(AA, Other: *LDSDMAStores[I], UseTBAA: true)) {
1801 FoundAliasingStore = true;
1802 ScoreBrackets.determineWait(T: LOAD_CNT, RegNo: RegNo + I + 1, Wait);
1803 }
1804 }
1805 }
1806 if (!FoundAliasingStore)
1807 ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait);
1808 if (Memop->isStore()) {
1809 ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait);
1810 }
1811 }
1812
1813 // Loop over use and def operands.
1814 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1815 MachineOperand &Op = MI.getOperand(i: I);
1816 if (!Op.isReg())
1817 continue;
1818
1819 // If the instruction does not read the tied source, skip the operand.
1820 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1821 continue;
1822
1823 RegInterval Interval = ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: I);
1824
1825 const bool IsVGPR = TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg());
1826 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1827 if (IsVGPR) {
1828 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1829 // previous write and this write are the same type of VMEM
1830 // instruction, in which case they're guaranteed to write their
1831 // results in order anyway.
1832 if (Op.isUse() || !updateVMCntOnly(Inst: MI) ||
1833 ScoreBrackets.hasOtherPendingVmemTypes(GprNo: RegNo,
1834 V: getVmemType(Inst: MI))) {
1835 ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait);
1836 ScoreBrackets.determineWait(T: SAMPLE_CNT, RegNo, Wait);
1837 ScoreBrackets.determineWait(T: BVH_CNT, RegNo, Wait);
1838 ScoreBrackets.clearVgprVmemTypes(GprNo: RegNo);
1839 }
1840 if (Op.isDef() || ScoreBrackets.hasPendingEvent(E: EXP_LDS_ACCESS)) {
1841 ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait);
1842 }
1843 ScoreBrackets.determineWait(T: DS_CNT, RegNo, Wait);
1844 } else {
1845 ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);
1846 }
1847 }
1848 }
1849 }
1850 }
1851
1852 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1853 // not, we need to ensure the subtarget is capable of backing off barrier
1854 // instructions in case there are any outstanding memory operations that may
1855 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1856 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1857 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1858 Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
1859 }
1860
1861 // TODO: Remove this work-around, enable the assert for Bug 457939
1862 // after fixing the scheduler. Also, the Shader Compiler code is
1863 // independent of target.
1864 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1865 if (ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
1866 Wait.DsCnt = 0;
1867 }
1868 }
1869
1870 // Verify that the wait is actually needed.
1871 ScoreBrackets.simplifyWaitcnt(Wait);
1872
1873 if (ForceEmitZeroWaitcnts)
1874 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
1875
1876 if (ForceEmitWaitcnt[LOAD_CNT])
1877 Wait.LoadCnt = 0;
1878 if (ForceEmitWaitcnt[EXP_CNT])
1879 Wait.ExpCnt = 0;
1880 if (ForceEmitWaitcnt[DS_CNT])
1881 Wait.DsCnt = 0;
1882 if (ForceEmitWaitcnt[SAMPLE_CNT])
1883 Wait.SampleCnt = 0;
1884 if (ForceEmitWaitcnt[BVH_CNT])
1885 Wait.BvhCnt = 0;
1886 if (ForceEmitWaitcnt[KM_CNT])
1887 Wait.KmCnt = 0;
1888
1889 if (FlushVmCnt) {
1890 if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
1891 Wait.LoadCnt = 0;
1892 if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
1893 Wait.SampleCnt = 0;
1894 if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
1895 Wait.BvhCnt = 0;
1896 }
1897
1898 return generateWaitcnt(Wait, It: MI.getIterator(), Block&: *MI.getParent(), ScoreBrackets,
1899 OldWaitcntInstr);
1900}
1901
1902bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1903 MachineBasicBlock::instr_iterator It,
1904 MachineBasicBlock &Block,
1905 WaitcntBrackets &ScoreBrackets,
1906 MachineInstr *OldWaitcntInstr) {
1907 bool Modified = false;
1908
1909 if (OldWaitcntInstr)
1910 // Try to merge the required wait with preexisting waitcnt instructions.
1911 // Also erase redundant waitcnt.
1912 Modified =
1913 WCG->applyPreexistingWaitcnt(ScoreBrackets, OldWaitcntInstr&: *OldWaitcntInstr, Wait, It);
1914
1915 // Any counts that could have been applied to any existing waitcnt
1916 // instructions will have been done so, now deal with any remaining.
1917 ScoreBrackets.applyWaitcnt(Wait);
1918
1919 // ExpCnt can be merged into VINTERP.
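// Rather than emitting a separate wait, the required export count is folded
// into the VINTERP instruction's waitexp operand if it is smaller than the
// value already present there.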
1920 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1921 SIInstrInfo::isVINTERP(MI: *It)) {
1922 MachineOperand *WaitExp =
1923 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1924 if (Wait.ExpCnt < WaitExp->getImm()) {
1925 WaitExp->setImm(Wait.ExpCnt);
1926 Modified = true;
1927 }
1928 Wait.ExpCnt = ~0u;
1929
1930 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
1931 << "Update Instr: " << *It);
1932 }
1933
1934 if (WCG->createNewWaitcnt(Block, It, Wait))
1935 Modified = true;
1936
1937 return Modified;
1938}
1939
1940// This is a flat memory operation. Check to see if it has memory tokens other
1941// than LDS. Other address spaces supported by flat memory operations involve
1942// global memory.
1943bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1944 assert(TII->isFLAT(MI));
1945
1946 // All flat instructions use the VMEM counter.
1947 assert(TII->usesVM_CNT(MI));
1948
1949 // If there are no memory operands then conservatively assume the flat
1950 // operation may access VMEM.
1951 if (MI.memoperands_empty())
1952 return true;
1953
1954 // See if any memory operand specifies an address space that involves VMEM.
1955 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1956 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1957 // (GDS) address space is not supported by flat operations. Therefore, simply
1958 // return true unless only the LDS address space is found.
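// For example, a flat access whose only memory operand is in the LOCAL (LDS)
// address space is known to stay within LDS and returns false, while one with
// a GLOBAL or FLAT memory operand returns true.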
1959 for (const MachineMemOperand *Memop : MI.memoperands()) {
1960 unsigned AS = Memop->getAddrSpace();
1961 assert(AS != AMDGPUAS::REGION_ADDRESS);
1962 if (AS != AMDGPUAS::LOCAL_ADDRESS)
1963 return true;
1964 }
1965
1966 return false;
1967}
1968
1969// This is a flat memory operation. Check to see if it has memory tokens for
1970// either LDS or FLAT.
1971bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1972 assert(TII->isFLAT(MI));
1973
1974 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1975 if (!TII->usesLGKM_CNT(MI))
1976 return false;
1977
1978 // If in tgsplit mode then there can be no use of LDS.
1979 if (ST->isTgSplitEnabled())
1980 return false;
1981
1982 // If there are no memory operands then conservatively assume the flat
1983 // operation may access LDS.
1984 if (MI.memoperands_empty())
1985 return true;
1986
1987 // See if any memory operand specifies an address space that involves LDS.
1988 for (const MachineMemOperand *Memop : MI.memoperands()) {
1989 unsigned AS = Memop->getAddrSpace();
1990 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1991 return true;
1992 }
1993
1994 return false;
1995}
1996
1997// This is a flat memory operation. Check to see if it has memory tokens for
1998// either scratch or FLAT.
1999bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
2000 const MachineInstr &MI) const {
2001 assert(TII->isFLAT(MI));
2002
2003 // SCRATCH instructions always access scratch.
2004 if (TII->isFLATScratch(MI))
2005 return true;
2006
2007 // GLOBAL instructions never access scratch.
2008 if (TII->isFLATGlobal(MI))
2009 return false;
2010
2011 // If there are no memory operands then conservatively assume the flat
2012 // operation may access scratch.
2013 if (MI.memoperands_empty())
2014 return true;
2015
2016 // See if any memory operand specifies an address space that involves scratch.
2017 return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) {
2018 unsigned AS = Memop->getAddrSpace();
2019 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
2020 });
2021}
2022
2023static bool isCacheInvOrWBInst(MachineInstr &Inst) {
2024 auto Opc = Inst.getOpcode();
2025 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
2026 Opc == AMDGPU::GLOBAL_WBINV;
2027}
2028
2029void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2030 WaitcntBrackets *ScoreBrackets) {
2031 // Now look at the instruction opcode. If it is a memory access
2032 // instruction, update the upper-bound of the appropriate counter's
2033 // bracket and the destination operand scores.
2034 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2035
2036 if (TII->isDS(MI: Inst) && TII->usesLGKM_CNT(MI: Inst)) {
2037 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2038 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2039 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_ACCESS, Inst);
2040 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_GPR_LOCK, Inst);
2041 } else {
2042 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
2043 }
2044 } else if (TII->isFLAT(MI: Inst)) {
2045 // TODO: Track this properly.
2046 if (isCacheInvOrWBInst(Inst))
2047 return;
2048
2049 assert(Inst.mayLoadOrStore());
2050
2051 int FlatASCount = 0;
2052
2053 if (mayAccessVMEMThroughFlat(MI: Inst)) {
2054 ++FlatASCount;
2055 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
2056 Inst);
2057 }
2058
2059 if (mayAccessLDSThroughFlat(MI: Inst)) {
2060 ++FlatASCount;
2061 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
2062 }
2063
2064 // A Flat memory operation must access at least one address space.
2065 assert(FlatASCount);
2066
2067 // This is a flat memory operation that accesses both VMEM and LDS, so note it
2068 // - it will require that both the VM and LGKM be flushed to zero if it is
2069 // pending when a VM or LGKM dependency occurs.
2070 if (FlatASCount > 1)
2071 ScoreBrackets->setPendingFlat();
2072 } else if (SIInstrInfo::isVMEM(MI: Inst) &&
2073 !llvm::AMDGPU::getMUBUFIsBufferInv(Opc: Inst.getOpcode())) {
2074 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
2075 Inst);
2076
2077 if (ST->vmemWriteNeedsExpWaitcnt() &&
2078 (Inst.mayStore() || SIInstrInfo::isAtomicRet(MI: Inst))) {
2079 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: VMW_GPR_LOCK, Inst);
2080 }
2081 } else if (TII->isSMRD(MI: Inst)) {
2082 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
2083 } else if (Inst.isCall()) {
2084 if (callWaitsOnFunctionReturn(MI: Inst)) {
2085 // Act as a wait on everything
2086 ScoreBrackets->applyWaitcnt(
2087 Wait: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2088 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2089 } else {
2090 // May need to wait for anything.
2091 ScoreBrackets->applyWaitcnt(Wait: AMDGPU::Waitcnt());
2092 }
2093 } else if (SIInstrInfo::isLDSDIR(MI: Inst)) {
2094 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_LDS_ACCESS, Inst);
2095 } else if (TII->isVINTERP(MI: Inst)) {
2096 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2097 ScoreBrackets->applyWaitcnt(T: EXP_CNT, Count: Imm);
2098 } else if (SIInstrInfo::isEXP(MI: Inst)) {
2099 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2100 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2101 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_PARAM_ACCESS, Inst);
2102 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2103 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_POS_ACCESS, Inst);
2104 else
2105 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_GPR_LOCK, Inst);
2106 } else {
2107 switch (Inst.getOpcode()) {
2108 case AMDGPU::S_SENDMSG:
2109 case AMDGPU::S_SENDMSG_RTN_B32:
2110 case AMDGPU::S_SENDMSG_RTN_B64:
2111 case AMDGPU::S_SENDMSGHALT:
2112 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SQ_MESSAGE, Inst);
2113 break;
2114 case AMDGPU::S_MEMTIME:
2115 case AMDGPU::S_MEMREALTIME:
2116 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2117 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2118 case AMDGPU::S_BARRIER_LEAVE:
2119 case AMDGPU::S_GET_BARRIER_STATE_M0:
2120 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2121 ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
2122 break;
2123 }
2124 }
2125}
2126
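// Merge a single score from another bracket into this one. A score at or
// below its old lower bound carries no pending event and maps to 0; anything
// above it is rebased by the shift computed in merge(). For example, with
// OldLB = 4 and MyShift = 2, a score of 6 becomes 8 while a score of 3
// becomes 0. Returns true if the merged-in (shifted) score is strictly
// larger, i.e. the merge tightened this entry.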
2127bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2128 unsigned OtherScore) {
2129 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2130 unsigned OtherShifted =
2131 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2132 Score = std::max(a: MyShifted, b: OtherShifted);
2133 return OtherShifted > MyShifted;
2134}
2135
2136 /// Merge the pending events and associated score brackets of \p Other into
2137/// this brackets status.
2138///
2139/// Returns whether the merge resulted in a change that requires tighter waits
2140/// (i.e. the merged brackets strictly dominate the original brackets).
2141bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2142 bool StrictDom = false;
2143
2144 VgprUB = std::max(a: VgprUB, b: Other.VgprUB);
2145 SgprUB = std::max(a: SgprUB, b: Other.SgprUB);
2146
2147 for (auto T : inst_counter_types(MaxCounter)) {
2148 // Merge event flags for this counter
2149 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2150 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2151 if (OtherEvents & ~OldEvents)
2152 StrictDom = true;
2153 PendingEvents |= OtherEvents;
2154
2155 // Merge scores for this counter
2156 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2157 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2158 const unsigned NewUB = ScoreLBs[T] + std::max(a: MyPending, b: OtherPending);
2159 if (NewUB < ScoreLBs[T])
2160 report_fatal_error(reason: "waitcnt score overflow");
2161
2162 MergeInfo M;
2163 M.OldLB = ScoreLBs[T];
2164 M.OtherLB = Other.ScoreLBs[T];
2165 M.MyShift = NewUB - ScoreUBs[T];
2166 M.OtherShift = NewUB - Other.ScoreUBs[T];
2167
2168 ScoreUBs[T] = NewUB;
2169
2170 StrictDom |= mergeScore(M, Score&: LastFlat[T], OtherScore: Other.LastFlat[T]);
2171
2172 for (int J = 0; J <= VgprUB; J++)
2173 StrictDom |= mergeScore(M, Score&: VgprScores[T][J], OtherScore: Other.VgprScores[T][J]);
2174
2175 if (T == SmemAccessCounter) {
2176 for (int J = 0; J <= SgprUB; J++)
2177 StrictDom |= mergeScore(M, Score&: SgprScores[J], OtherScore: Other.SgprScores[J]);
2178 }
2179 }
2180
2181 for (int J = 0; J <= VgprUB; J++) {
2182 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2183 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2184 VgprVmemTypes[J] = NewVmemTypes;
2185 }
2186
2187 return StrictDom;
2188}
2189
2190static bool isWaitInstr(MachineInstr &Inst) {
2191 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Inst.getOpcode());
2192 return Opcode == AMDGPU::S_WAITCNT ||
2193 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2194 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2195 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2196 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2197 counterTypeForInstr(Opcode).has_value();
2198}
2199
2200// Generate s_waitcnt instructions where needed.
2201bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2202 MachineBasicBlock &Block,
2203 WaitcntBrackets &ScoreBrackets) {
2204 bool Modified = false;
2205
2206 LLVM_DEBUG({
2207 dbgs() << "*** Block" << Block.getNumber() << " ***";
2208 ScoreBrackets.dump();
2209 });
2210
2211 // Track the correctness of vccz through this basic block. There are two
2212 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2213 // ST->partialVCCWritesUpdateVCCZ().
2214 bool VCCZCorrect = true;
2215 if (ST->hasReadVCCZBug()) {
2216 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2217 // to vcc and then issued an smem load.
2218 VCCZCorrect = false;
2219 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2220 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2221 // to vcc_lo or vcc_hi.
2222 VCCZCorrect = false;
2223 }
2224
2225 // Walk over the instructions.
2226 MachineInstr *OldWaitcntInstr = nullptr;
2227
2228 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2229 E = Block.instr_end();
2230 Iter != E;) {
2231 MachineInstr &Inst = *Iter;
2232
2233 // Track pre-existing waitcnts that were added in earlier iterations or by
2234 // the memory legalizer.
2235 if (isWaitInstr(Inst)) {
2236 if (!OldWaitcntInstr)
2237 OldWaitcntInstr = &Inst;
2238 ++Iter;
2239 continue;
2240 }
2241
2242 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2243 isPreheaderToFlush(MBB&: Block, ScoreBrackets);
2244
2245 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2246 Modified |= generateWaitcntInstBefore(MI&: Inst, ScoreBrackets, OldWaitcntInstr,
2247 FlushVmCnt);
2248 OldWaitcntInstr = nullptr;
2249
2250 // Restore vccz if it's not known to be correct already.
2251 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(MI: Inst);
2252
2253 // Don't examine operands unless we need to track vccz correctness.
2254 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2255 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2256 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2257 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2258 if (!ST->partialVCCWritesUpdateVCCZ())
2259 VCCZCorrect = false;
2260 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2261 // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
2262 // the vccz bit, so when we detect that an instruction may read from a
2263 // corrupt vccz bit, we need to:
2264 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2265 // operations to complete.
2266 // 2. Restore the correct value of vccz by writing the current value
2267 // of vcc back to vcc.
2268 if (ST->hasReadVCCZBug() &&
2269 ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
2270 // Writes to vcc while there's an outstanding smem read may get
2271 // clobbered as soon as any read completes.
2272 VCCZCorrect = false;
2273 } else {
2274 // Writes to vcc will fix any incorrect value in vccz.
2275 VCCZCorrect = true;
2276 }
2277 }
2278 }
2279
2280 if (TII->isSMRD(MI: Inst)) {
2281 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2282 // No need to handle invariant loads when avoiding WAR conflicts, as
2283 // there cannot be a vector store to the same memory location.
2284 if (!Memop->isInvariant()) {
2285 const Value *Ptr = Memop->getValue();
2286 SLoadAddresses.insert(KV: std::pair(Ptr, Inst.getParent()));
2287 }
2288 }
2289 if (ST->hasReadVCCZBug()) {
2290 // This smem read could complete and clobber vccz at any time.
2291 VCCZCorrect = false;
2292 }
2293 }
2294
2295 updateEventWaitcntAfter(Inst, ScoreBrackets: &ScoreBrackets);
2296
2297#if 0 // TODO: implement resource type check controlled by options with ub = LB.
2298 // If this instruction generates a S_SETVSKIP because it is an
2299 // indexed resource, and we are on Tahiti, then it will also force
2300 // an S_WAITCNT vmcnt(0)
2301 if (RequireCheckResourceType(Inst, context)) {
2302 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
2303 ScoreBrackets->setScoreLB(LOAD_CNT,
2304 ScoreBrackets->getScoreUB(LOAD_CNT));
2305 }
2306#endif
2307
2308 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
2309 AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2310 IncludeVSCnt: Inst.mayStore() && !SIInstrInfo::isAtomicRet(MI: Inst));
2311 ScoreBrackets.simplifyWaitcnt(Wait);
2312 Modified |= generateWaitcnt(Wait, It: std::next(x: Inst.getIterator()), Block,
2313 ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
2314 }
2315
2316 LLVM_DEBUG({
2317 Inst.print(dbgs());
2318 ScoreBrackets.dump();
2319 });
2320
2321 // TODO: Remove this work-around after fixing the scheduler and enable the
2322 // assert above.
2323 if (RestoreVCCZ) {
2324 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2325 // bit is updated, so we can restore the bit by reading the value of
2326 // vcc and then writing it back to the register.
2327 BuildMI(Block, Inst, Inst.getDebugLoc(),
2328 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2329 TRI->getVCC())
2330 .addReg(TRI->getVCC());
2331 VCCZCorrect = true;
2332 Modified = true;
2333 }
2334
2335 ++Iter;
2336 }
2337
2338 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2339 // needed.
2340 AMDGPU::Waitcnt Wait;
2341 if (Block.getFirstTerminator() == Block.end() &&
2342 isPreheaderToFlush(MBB&: Block, ScoreBrackets)) {
2343 if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
2344 Wait.LoadCnt = 0;
2345 if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
2346 Wait.SampleCnt = 0;
2347 if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
2348 Wait.BvhCnt = 0;
2349 }
2350
2351 // Combine or remove any redundant waitcnts at the end of the block.
2352 Modified |= generateWaitcnt(Wait, It: Block.instr_end(), Block, ScoreBrackets,
2353 OldWaitcntInstr);
2354
2355 return Modified;
2356}
2357
2358// Return true if the given machine basic block is a preheader of a loop in
2359// which we want to flush the vmcnt counter, and false otherwise.
2360bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
2361 WaitcntBrackets &ScoreBrackets) {
2362 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(Key: &MBB, Args: false);
2363 if (!IsInserted)
2364 return Iterator->second;
2365
2366 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2367 if (!Succ)
2368 return false;
2369
2370 MachineLoop *Loop = MLI->getLoopFor(BB: Succ);
2371 if (!Loop)
2372 return false;
2373
2374 if (Loop->getLoopPreheader() == &MBB &&
2375 shouldFlushVmCnt(ML: Loop, Brackets&: ScoreBrackets)) {
2376 Iterator->second = true;
2377 return true;
2378 }
2379
2380 return false;
2381}
2382
2383bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2384 return SIInstrInfo::isVMEM(MI) ||
2385 (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
2386}
2387
2388// Return true if it is better to flush the vmcnt counter in the preheader of
2389// the given loop. We currently decide to flush in two situations:
2390// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2391// vgpr containing a value that is loaded outside of the loop. (Only on
2392// targets with no vscnt counter).
2393// 2. The loop contains vmem load(s), but the loaded values are not used in the
2394// loop, and at least one use of a vgpr containing a value that is loaded
2395// outside of the loop.
2396bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2397 WaitcntBrackets &Brackets) {
2398 bool HasVMemLoad = false;
2399 bool HasVMemStore = false;
2400 bool UsesVgprLoadedOutside = false;
2401 DenseSet<Register> VgprUse;
2402 DenseSet<Register> VgprDef;
2403
2404 for (MachineBasicBlock *MBB : ML->blocks()) {
2405 for (MachineInstr &MI : *MBB) {
2406 if (isVMEMOrFlatVMEM(MI)) {
2407 if (MI.mayLoad())
2408 HasVMemLoad = true;
2409 if (MI.mayStore())
2410 HasVMemStore = true;
2411 }
2412 for (unsigned I = 0; I < MI.getNumOperands(); I++) {
2413 MachineOperand &Op = MI.getOperand(i: I);
2414 if (!Op.isReg() || !TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
2415 continue;
2416 RegInterval Interval = Brackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: I);
2417 // Vgpr use
2418 if (Op.isUse()) {
2419 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2420 // If we find a register that is loaded inside the loop, 1. and 2.
2421 // are invalidated and we can exit.
2422 if (VgprDef.contains(V: RegNo))
2423 return false;
2424 VgprUse.insert(V: RegNo);
2425 // If at least one of Op's registers is in the score brackets, the
2426 // value is likely loaded outside of the loop.
2427 if (Brackets.getRegScore(GprNo: RegNo, T: LOAD_CNT) >
2428 Brackets.getScoreLB(T: LOAD_CNT) ||
2429 Brackets.getRegScore(GprNo: RegNo, T: SAMPLE_CNT) >
2430 Brackets.getScoreLB(T: SAMPLE_CNT) ||
2431 Brackets.getRegScore(GprNo: RegNo, T: BVH_CNT) >
2432 Brackets.getScoreLB(T: BVH_CNT)) {
2433 UsesVgprLoadedOutside = true;
2434 break;
2435 }
2436 }
2437 }
2438 // VMem load vgpr def
2439 else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
2440 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2441 // If we find a register that is loaded inside the loop, 1. and 2.
2442 // are invalidated and we can exit.
2443 if (VgprUse.contains(V: RegNo))
2444 return false;
2445 VgprDef.insert(V: RegNo);
2446 }
2447 }
2448 }
2449 }
2450 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2451 return true;
2452 return HasVMemLoad && UsesVgprLoadedOutside;
2453}
2454
2455bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2456 ST = &MF.getSubtarget<GCNSubtarget>();
2457 TII = ST->getInstrInfo();
2458 TRI = &TII->getRegisterInfo();
2459 MRI = &MF.getRegInfo();
2460 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2461 MLI = &getAnalysis<MachineLoopInfo>();
2462 PDT = &getAnalysis<MachinePostDominatorTree>();
2463 if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2464 AA = &AAR->getAAResults();
2465
2466 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST->getCPU());
2467
2468 if (ST->hasExtendedWaitCounts()) {
2469 MaxCounter = NUM_EXTENDED_INST_CNTS;
2470 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
2471 WCG = &WCGGFX12Plus;
2472 } else {
2473 MaxCounter = NUM_NORMAL_INST_CNTS;
2474 WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
2475 WCG = &WCGPreGFX12;
2476 }
2477
2478 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
2479 for (auto T : inst_counter_types())
2480 ForceEmitWaitcnt[T] = false;
2481
2482 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2483
2484 SmemAccessCounter = eventCounter(masks: WaitEventMaskForInst, E: SMEM_ACCESS);
2485
2486 OptNone = MF.getFunction().hasOptNone() ||
2487 MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
2488
2489 HardwareLimits Limits = {};
2490 if (ST->hasExtendedWaitCounts()) {
2491 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(Version: IV);
2492 Limits.DscntMax = AMDGPU::getDscntBitMask(Version: IV);
2493 } else {
2494 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(Version: IV);
2495 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(Version: IV);
2496 }
2497 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(Version: IV);
2498 Limits.StorecntMax = AMDGPU::getStorecntBitMask(Version: IV);
2499 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(Version: IV);
2500 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(Version: IV);
2501 Limits.KmcntMax = AMDGPU::getKmcntBitMask(Version: IV);
2502
2503 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
2504 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2505 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2506 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2507
2508 RegisterEncoding Encoding = {};
2509 Encoding.VGPR0 =
2510 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2511 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
2512 Encoding.SGPR0 =
2513 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2514 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
2515
2516 BlockInfos.clear();
2517 bool Modified = false;
2518
2519 MachineBasicBlock &EntryBB = MF.front();
2520 MachineBasicBlock::iterator I = EntryBB.begin();
2521
2522 if (!MFI->isEntryFunction()) {
2523 // Wait for any outstanding memory operations that the input registers may
2524 // depend on. We can't track them and it's better to do the wait after the
2525 // costly call sequence.
2526
2527 // TODO: Could insert earlier and schedule more liberally with operations
2528 // that only use caller preserved registers.
2529 for (MachineBasicBlock::iterator E = EntryBB.end();
2530 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2531 ;
2532
2533 if (ST->hasExtendedWaitCounts()) {
2534 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2535 .addImm(0);
2536 for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
2537 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
2538 continue;
2539
2540 BuildMI(EntryBB, I, DebugLoc(),
2541 TII->get(instrsForExtendedCounterTypes[CT]))
2542 .addImm(0);
2543 }
2544 } else {
2545 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2546 }
2547
2548 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2549 args&: ST, args&: MaxCounter, args&: Limits, args&: Encoding, args&: WaitEventMaskForInst,
2550 args&: SmemAccessCounter);
2551 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2552 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2553
2554 Modified = true;
2555 }
2556
2557 // Keep iterating over the blocks in reverse post order, inserting and
2558 // updating s_waitcnt where needed, until a fix point is reached.
2559 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2560 BlockInfos.insert(KV: {MBB, BlockInfo()});
2561
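// BlockInfos is ordered in reverse post order, so a dirty successor that
// appears at or before the current block in the map corresponds to a back
// edge and forces another sweep (see the SuccBII <= BII checks below).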
2562 std::unique_ptr<WaitcntBrackets> Brackets;
2563 bool Repeat;
2564 do {
2565 Repeat = false;
2566
2567 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2568 ++BII) {
2569 MachineBasicBlock *MBB = BII->first;
2570 BlockInfo &BI = BII->second;
2571 if (!BI.Dirty)
2572 continue;
2573
2574 if (BI.Incoming) {
2575 if (!Brackets)
2576 Brackets = std::make_unique<WaitcntBrackets>(args&: *BI.Incoming);
2577 else
2578 *Brackets = *BI.Incoming;
2579 } else {
2580 if (!Brackets)
2581 Brackets = std::make_unique<WaitcntBrackets>(
2582 args&: ST, args&: MaxCounter, args&: Limits, args&: Encoding, args&: WaitEventMaskForInst,
2583 args&: SmemAccessCounter);
2584 else
2585 *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
2586 WaitEventMaskForInst, SmemAccessCounter);
2587 }
2588
2589 Modified |= insertWaitcntInBlock(MF, Block&: *MBB, ScoreBrackets&: *Brackets);
2590 BI.Dirty = false;
2591
2592 if (Brackets->hasPendingEvent()) {
2593 BlockInfo *MoveBracketsToSucc = nullptr;
2594 for (MachineBasicBlock *Succ : MBB->successors()) {
2595 auto SuccBII = BlockInfos.find(Key: Succ);
2596 BlockInfo &SuccBI = SuccBII->second;
2597 if (!SuccBI.Incoming) {
2598 SuccBI.Dirty = true;
2599 if (SuccBII <= BII)
2600 Repeat = true;
2601 if (!MoveBracketsToSucc) {
2602 MoveBracketsToSucc = &SuccBI;
2603 } else {
2604 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(args&: *Brackets);
2605 }
2606 } else if (SuccBI.Incoming->merge(Other: *Brackets)) {
2607 SuccBI.Dirty = true;
2608 if (SuccBII <= BII)
2609 Repeat = true;
2610 }
2611 }
2612 if (MoveBracketsToSucc)
2613 MoveBracketsToSucc->Incoming = std::move(Brackets);
2614 }
2615 }
2616 } while (Repeat);
2617
2618 if (ST->hasScalarStores()) {
2619 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2620 bool HaveScalarStores = false;
2621
2622 for (MachineBasicBlock &MBB : MF) {
2623 for (MachineInstr &MI : MBB) {
2624 if (!HaveScalarStores && TII->isScalarStore(MI))
2625 HaveScalarStores = true;
2626
2627 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2628 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2629 EndPgmBlocks.push_back(Elt: &MBB);
2630 }
2631 }
2632
2633 if (HaveScalarStores) {
2634 // If scalar writes are used, the cache must be flushed or else the next
2635 // wave to reuse the same scratch memory can be clobbered.
2636 //
2637 // Insert s_dcache_wb at wave termination points if there were any scalar
2638 // stores, and only if the cache hasn't already been flushed. This could
2639 // be improved by looking across blocks for flushes in postdominating
2640 // blocks from the stores but an explicitly requested flush is probably
2641 // very rare.
2642 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2643 bool SeenDCacheWB = false;
2644
2645 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2646 I != E; ++I) {
2647 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2648 SeenDCacheWB = true;
2649 else if (TII->isScalarStore(MI: *I))
2650 SeenDCacheWB = false;
2651
2652 // FIXME: It would be better to insert this before a waitcnt if any.
2653 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2654 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2655 !SeenDCacheWB) {
2656 Modified = true;
2657 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2658 }
2659 }
2660 }
2661 }
2662 }
2663
2664 // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2665 // instructions.
2666 for (MachineInstr *MI : ReleaseVGPRInsts) {
2667 if (ST->requiresNopBeforeDeallocVGPRs()) {
2668 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP))
2669 .addImm(0);
2670 }
2671 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2672 TII->get(AMDGPU::S_SENDMSG))
2673 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2674 Modified = true;
2675 }
2676 ReleaseVGPRInsts.clear();
2677
2678 return Modified;
2679}
2680
