1 | //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This pass adds instructions to enable whole quad mode (strict or non-strict) |
11 | /// for pixel shaders, and strict whole wavefront mode for all programs. |
12 | /// |
13 | /// The "strict" prefix indicates that inactive lanes do not take part in |
14 | /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will |
15 | /// always be enabled irrespective of control flow decisions. Conversely in |
16 | /// non-strict WQM inactive lanes may control flow decisions. |
17 | /// |
18 | /// Whole quad mode is required for derivative computations, but it interferes |
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary and disabled around stores and atomics.
21 | /// |
22 | /// When necessary, this pass creates a function prolog |
23 | /// |
24 | /// S_MOV_B64 LiveMask, EXEC |
25 | /// S_WQM_B64 EXEC, EXEC |
26 | /// |
27 | /// to enter WQM at the top of the function and surrounds blocks of Exact |
/// instructions with
29 | /// |
30 | /// S_AND_SAVEEXEC_B64 Tmp, LiveMask |
31 | /// ... |
32 | /// S_MOV_B64 EXEC, Tmp |
33 | /// |
34 | /// We also compute when a sequence of instructions requires strict whole |
35 | /// wavefront mode (StrictWWM) and insert instructions to save and restore it: |
36 | /// |
37 | /// S_OR_SAVEEXEC_B64 Tmp, -1 |
38 | /// ... |
39 | /// S_MOV_B64 EXEC, Tmp |
40 | /// |
41 | /// When a sequence of instructions requires strict whole quad mode (StrictWQM) |
42 | /// we use a similar save and restore mechanism and force whole quad mode for |
43 | /// those instructions: |
44 | /// |
45 | /// S_MOV_B64 Tmp, EXEC |
46 | /// S_WQM_B64 EXEC, EXEC |
47 | /// ... |
48 | /// S_MOV_B64 EXEC, Tmp |
49 | /// |
50 | /// In order to avoid excessive switching during sequences of Exact |
51 | /// instructions, the pass first analyzes which instructions must be run in WQM |
52 | /// (aka which instructions produce values that lead to derivative |
53 | /// computations). |
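///
/// For example (an illustrative sketch, not literal output of this pass):
///
///    IMAGE_SAMPLE ...            ; requires WQM (computes derivatives)
///    S_AND_SAVEEXEC_B64 Tmp, LiveMask
///    BUFFER_STORE_DWORD ...      ; must run Exact (visible side effect)
///    S_MOV_B64 EXEC, Tmp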
54 | /// |
55 | /// Basic blocks are always exited in WQM as long as some successor needs WQM. |
56 | /// |
57 | /// There is room for improvement given better control flow analysis: |
58 | /// |
59 | /// (1) at the top level (outside of control flow statements, and as long as |
60 | /// kill hasn't been used), one SGPR can be saved by recovering WQM from |
61 | /// the LiveMask (this is implemented for the entry block). |
62 | /// |
63 | /// (2) when entire regions (e.g. if-else blocks or entire loops) only |
64 | /// consist of exact and don't-care instructions, the switch only has to |
65 | /// be done at the entry and exit points rather than potentially in each |
66 | /// block of the region. |
67 | /// |
68 | //===----------------------------------------------------------------------===// |
69 | |
70 | #include "AMDGPU.h" |
71 | #include "GCNSubtarget.h" |
72 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
73 | #include "llvm/ADT/MapVector.h" |
74 | #include "llvm/ADT/PostOrderIterator.h" |
75 | #include "llvm/CodeGen/LiveIntervals.h" |
76 | #include "llvm/CodeGen/MachineBasicBlock.h" |
77 | #include "llvm/CodeGen/MachineDominators.h" |
78 | #include "llvm/CodeGen/MachineFunctionPass.h" |
79 | #include "llvm/CodeGen/MachineInstr.h" |
80 | #include "llvm/CodeGen/MachinePostDominators.h" |
81 | #include "llvm/IR/CallingConv.h" |
82 | #include "llvm/InitializePasses.h" |
83 | #include "llvm/Support/raw_ostream.h" |
84 | |
85 | using namespace llvm; |
86 | |
87 | #define DEBUG_TYPE "si-wqm" |
88 | |
89 | namespace { |
90 | |
91 | enum { |
92 | StateWQM = 0x1, |
93 | StateStrictWWM = 0x2, |
94 | StateStrictWQM = 0x4, |
95 | StateExact = 0x8, |
96 | StateStrict = StateStrictWWM | StateStrictWQM, |
97 | }; |
98 | |
99 | struct PrintState { |
100 | public: |
101 | int State; |
102 | |
103 | explicit PrintState(int State) : State(State) {} |
104 | }; |
105 | |
106 | #ifndef NDEBUG |
107 | static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { |
108 | |
109 | static const std::pair<char, const char *> Mapping[] = { |
      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112 | char State = PS.State; |
113 | for (auto M : Mapping) { |
114 | if (State & M.first) { |
115 | OS << M.second; |
116 | State &= ~M.first; |
117 | |
118 | if (State) |
119 | OS << '|'; |
120 | } |
121 | } |
122 | assert(State == 0); |
123 | return OS; |
124 | } |
125 | #endif |
126 | |
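// Analysis results kept per machine instruction:
//   Needs    - execution states this instruction itself requires.
//   Disabled - execution states this instruction must never run in.
//   OutNeeds - states required after this instruction (propagated backwards).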
127 | struct InstrInfo { |
128 | char Needs = 0; |
129 | char Disabled = 0; |
130 | char OutNeeds = 0; |
131 | }; |
132 | |
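// Analysis results kept per basic block (same flag encoding):
//   Needs/InNeeds/OutNeeds - states required within, on entry to, and on
//   exit from the block. InitialState records the state chosen at the block
//   start by processBlock; NeedsLowering marks blocks containing kills or
//   strict-mode pseudos that lowerBlock must rewrite.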
133 | struct BlockInfo { |
134 | char Needs = 0; |
135 | char InNeeds = 0; |
136 | char OutNeeds = 0; |
137 | char InitialState = 0; |
138 | bool NeedsLowering = false; |
139 | }; |
140 | |
141 | struct WorkItem { |
142 | MachineBasicBlock *MBB = nullptr; |
143 | MachineInstr *MI = nullptr; |
144 | |
145 | WorkItem() = default; |
146 | WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {} |
147 | WorkItem(MachineInstr *MI) : MI(MI) {} |
148 | }; |
149 | |
150 | class SIWholeQuadMode : public MachineFunctionPass { |
151 | private: |
152 | const SIInstrInfo *TII; |
153 | const SIRegisterInfo *TRI; |
154 | const GCNSubtarget *ST; |
155 | MachineRegisterInfo *MRI; |
156 | LiveIntervals *LIS; |
157 | MachineDominatorTree *MDT; |
158 | MachinePostDominatorTree *PDT; |
159 | |
160 | unsigned AndOpc; |
161 | unsigned AndTermOpc; |
162 | unsigned AndN2Opc; |
163 | unsigned XorOpc; |
164 | unsigned AndSaveExecOpc; |
165 | unsigned AndSaveExecTermOpc; |
166 | unsigned WQMOpc; |
167 | Register Exec; |
168 | Register LiveMaskReg; |
169 | |
170 | DenseMap<const MachineInstr *, InstrInfo> Instructions; |
171 | MapVector<MachineBasicBlock *, BlockInfo> Blocks; |
172 | |
173 | // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction |
174 | DenseMap<const MachineInstr *, char> StateTransition; |
175 | |
176 | SmallVector<MachineInstr *, 2> LiveMaskQueries; |
177 | SmallVector<MachineInstr *, 4> LowerToMovInstrs; |
178 | SmallVector<MachineInstr *, 4> LowerToCopyInstrs; |
179 | SmallVector<MachineInstr *, 4> KillInstrs; |
180 | |
181 | void printInfo(); |
182 | |
183 | void markInstruction(MachineInstr &MI, char Flag, |
184 | std::vector<WorkItem> &Worklist); |
185 | void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, |
186 | unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist); |
187 | void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag, |
188 | std::vector<WorkItem> &Worklist); |
189 | void markInstructionUses(const MachineInstr &MI, char Flag, |
190 | std::vector<WorkItem> &Worklist); |
191 | char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); |
192 | void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); |
193 | void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); |
194 | char analyzeFunction(MachineFunction &MF); |
195 | |
196 | MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, |
197 | MachineBasicBlock::iterator Before); |
198 | MachineBasicBlock::iterator |
199 | prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First, |
200 | MachineBasicBlock::iterator Last, bool PreferLast, |
201 | bool SaveSCC); |
202 | void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
203 | Register SaveWQM); |
204 | void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
205 | Register SavedWQM); |
206 | void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
207 | Register SaveOrig, char StrictStateNeeded); |
208 | void fromStrictMode(MachineBasicBlock &MBB, |
209 | MachineBasicBlock::iterator Before, Register SavedOrig, |
210 | char NonStrictState, char CurrentStrictState); |
211 | |
212 | MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); |
213 | |
214 | MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI, |
215 | bool IsWQM); |
216 | MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI); |
217 | void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry, |
218 | MachineInstr *Exit); |
219 | |
220 | void lowerBlock(MachineBasicBlock &MBB); |
221 | void processBlock(MachineBasicBlock &MBB, bool IsEntry); |
222 | |
223 | void lowerLiveMaskQueries(); |
224 | void lowerCopyInstrs(); |
225 | void lowerKillInstrs(bool IsWQM); |
226 | |
227 | public: |
228 | static char ID; |
229 | |
230 | SIWholeQuadMode() : |
231 | MachineFunctionPass(ID) { } |
232 | |
233 | bool runOnMachineFunction(MachineFunction &MF) override; |
234 | |
  StringRef getPassName() const override { return "SI Whole Quad Mode"; }
236 | |
237 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
238 | AU.addRequired<LiveIntervals>(); |
239 | AU.addPreserved<SlotIndexes>(); |
240 | AU.addPreserved<LiveIntervals>(); |
241 | AU.addPreserved<MachineDominatorTree>(); |
242 | AU.addPreserved<MachinePostDominatorTree>(); |
243 | MachineFunctionPass::getAnalysisUsage(AU); |
244 | } |
245 | |
246 | MachineFunctionProperties getClearedProperties() const override { |
247 | return MachineFunctionProperties().set( |
248 | MachineFunctionProperties::Property::IsSSA); |
249 | } |
250 | }; |
251 | |
252 | } // end anonymous namespace |
253 | |
254 | char SIWholeQuadMode::ID = 0; |
255 | |
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
257 | false) |
258 | INITIALIZE_PASS_DEPENDENCY(LiveIntervals) |
259 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) |
260 | INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) |
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
262 | false) |
263 | |
264 | char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; |
265 | |
266 | FunctionPass *llvm::createSIWholeQuadModePass() { |
267 | return new SIWholeQuadMode; |
268 | } |
269 | |
270 | #ifndef NDEBUG |
271 | LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() { |
272 | for (const auto &BII : Blocks) { |
273 | dbgs() << "\n" |
274 | << printMBBReference(MBB: *BII.first) << ":\n" |
275 | << " InNeeds = " << PrintState(BII.second.InNeeds) |
276 | << ", Needs = " << PrintState(BII.second.Needs) |
277 | << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n" ; |
278 | |
279 | for (const MachineInstr &MI : *BII.first) { |
      auto III = Instructions.find(&MI);
281 | if (III == Instructions.end()) |
282 | continue; |
283 | |
284 | dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs) |
285 | << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n'; |
286 | } |
287 | } |
288 | } |
289 | #endif |
290 | |
291 | void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, |
292 | std::vector<WorkItem> &Worklist) { |
293 | InstrInfo &II = Instructions[&MI]; |
294 | |
295 | assert(!(Flag & StateExact) && Flag != 0); |
296 | |
297 | // Remove any disabled states from the flag. The user that required it gets |
298 | // an undefined value in the helper lanes. For example, this can happen if |
  // the result of an atomic is used by an instruction that requires WQM, where
300 | // ignoring the request for WQM is correct as per the relevant specs. |
301 | Flag &= ~II.Disabled; |
302 | |
303 | // Ignore if the flag is already encompassed by the existing needs, or we |
304 | // just disabled everything. |
305 | if ((II.Needs & Flag) == Flag) |
306 | return; |
307 | |
308 | LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); |
309 | II.Needs |= Flag; |
  Worklist.push_back(&MI);
311 | } |
312 | |
/// Mark all relevant definitions of register \p Reg used by \p UseMI.
314 | void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, |
315 | Register Reg, unsigned SubReg, char Flag, |
316 | std::vector<WorkItem> &Worklist) { |
317 | LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI); |
318 | |
  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
320 | const VNInfo *Value = UseLRQ.valueIn(); |
321 | if (!Value) |
322 | return; |
323 | |
324 | // Note: this code assumes that lane masks on AMDGPU completely |
325 | // cover registers. |
326 | const LaneBitmask UseLanes = |
327 | SubReg ? TRI->getSubRegIndexLaneMask(SubReg) |
328 | : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg) |
329 | : LaneBitmask::getNone()); |
330 | |
331 | // Perform a depth-first iteration of the LiveRange graph marking defs. |
332 | // Stop processing of a given branch when all use lanes have been defined. |
333 | // The first definition stops processing for a physical register. |
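  //
  // For example, if the use reads a 64-bit register whose two 32-bit halves
  // are written by different instructions reached through a phi, both
  // defining instructions get marked; PhiStack remembers where to resume
  // after one predecessor subgraph has been exhausted.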
334 | struct PhiEntry { |
335 | const VNInfo *Phi; |
336 | unsigned PredIdx; |
337 | LaneBitmask DefinedLanes; |
338 | |
339 | PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes) |
340 | : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {} |
341 | }; |
342 | using VisitKey = std::pair<const VNInfo *, LaneBitmask>; |
343 | SmallVector<PhiEntry, 2> PhiStack; |
344 | SmallSet<VisitKey, 4> Visited; |
345 | LaneBitmask DefinedLanes; |
346 | unsigned NextPredIdx = 0; // Only used for processing phi nodes |
347 | do { |
348 | const VNInfo *NextValue = nullptr; |
349 | const VisitKey Key(Value, DefinedLanes); |
350 | |
    if (Visited.insert(Key).second) {
      // First visit of a value: if it is a phi, start processing at the
      // first predecessor.
      NextPredIdx = 0;
354 | } |
355 | |
356 | if (Value->isPHIDef()) { |
357 | // Each predecessor node in the phi must be processed as a subgraph |
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");
360 | |
361 | // Find next predecessor to process |
362 | unsigned Idx = NextPredIdx; |
363 | auto PI = MBB->pred_begin() + Idx; |
364 | auto PE = MBB->pred_end(); |
365 | for (; PI != PE && !NextValue; ++PI, ++Idx) { |
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
368 | NextValue = VN; |
369 | } |
370 | } |
371 | |
      // If there are more predecessors to process, add the phi to the stack.
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
375 | } else { |
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");
378 | |
379 | if (Reg.isVirtual()) { |
380 | // Iterate over all operands to find relevant definitions |
381 | bool HasDef = false; |
382 | for (const MachineOperand &Op : MI->all_defs()) { |
383 | if (Op.getReg() != Reg) |
384 | continue; |
385 | |
386 | // Compute lanes defined and overlap with use |
387 | LaneBitmask OpLanes = |
388 | Op.isUndef() ? LaneBitmask::getAll() |
389 | : TRI->getSubRegIndexLaneMask(Op.getSubReg()); |
390 | LaneBitmask Overlap = (UseLanes & OpLanes); |
391 | |
          // Record whether this instruction defined any lanes of the use.
393 | HasDef |= Overlap.any(); |
394 | |
395 | // Mark any lanes defined |
396 | DefinedLanes |= OpLanes; |
397 | } |
398 | |
399 | // Check if all lanes of use have been defined |
400 | if ((DefinedLanes & UseLanes) != UseLanes) { |
401 | // Definition not complete; need to process input value |
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
405 | NextValue = VN; |
406 | } |
407 | } |
408 | |
409 | // Only mark the instruction if it defines some part of the use |
410 | if (HasDef) |
          markInstruction(*MI, Flag, Worklist);
412 | } else { |
413 | // For physical registers simply mark the defining instruction |
        markInstruction(*MI, Flag, Worklist);
415 | } |
416 | } |
417 | |
418 | if (!NextValue && !PhiStack.empty()) { |
      // Reached the end of a chain; resume processing the most recent phi.
420 | PhiEntry &Entry = PhiStack.back(); |
421 | NextValue = Entry.Phi; |
422 | NextPredIdx = Entry.PredIdx; |
423 | DefinedLanes = Entry.DefinedLanes; |
424 | PhiStack.pop_back(); |
425 | } |
426 | |
427 | Value = NextValue; |
428 | } while (Value); |
429 | } |
430 | |
431 | void SIWholeQuadMode::markOperand(const MachineInstr &MI, |
432 | const MachineOperand &Op, char Flag, |
433 | std::vector<WorkItem> &Worklist) { |
434 | assert(Op.isReg()); |
435 | Register Reg = Op.getReg(); |
436 | |
437 | // Ignore some hardware registers |
438 | switch (Reg) { |
439 | case AMDGPU::EXEC: |
440 | case AMDGPU::EXEC_LO: |
441 | return; |
442 | default: |
443 | break; |
444 | } |
445 | |
446 | LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op |
447 | << " for " << MI); |
448 | if (Reg.isVirtual()) { |
449 | LiveRange &LR = LIS->getInterval(Reg); |
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
451 | } else { |
452 | // Handle physical registers that we need to track; this is mostly relevant |
453 | // for VCC, which can appear as the (implicit) input of a uniform branch, |
454 | // e.g. when a loop counter is stored in a VGPR. |
455 | for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) { |
456 | LiveRange &LR = LIS->getRegUnit(Unit); |
457 | const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); |
458 | if (!Value) |
459 | continue; |
460 | |
461 | markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist); |
462 | } |
463 | } |
464 | } |
465 | |
466 | /// Mark all instructions defining the uses in \p MI with \p Flag. |
467 | void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, |
468 | std::vector<WorkItem> &Worklist) { |
469 | LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": " |
470 | << MI); |
471 | |
472 | for (const MachineOperand &Use : MI.all_uses()) |
    markOperand(MI, Use, Flag, Worklist);
474 | } |
475 | |
476 | // Scan instructions to determine which ones require an Exact execmask and |
477 | // which ones seed WQM requirements. |
478 | char SIWholeQuadMode::scanInstructions(MachineFunction &MF, |
479 | std::vector<WorkItem> &Worklist) { |
480 | char GlobalFlags = 0; |
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
482 | SmallVector<MachineInstr *, 4> SetInactiveInstrs; |
483 | SmallVector<MachineInstr *, 4> SoftWQMInstrs; |
484 | bool HasImplicitDerivatives = |
485 | MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; |
486 | |
487 | // We need to visit the basic blocks in reverse post-order so that we visit |
488 | // defs before uses, in particular so that we don't accidentally mark an |
489 | // instruction as needing e.g. WQM before visiting it and realizing it needs |
490 | // WQM disabled. |
491 | ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); |
492 | for (MachineBasicBlock *MBB : RPOT) { |
493 | BlockInfo &BBI = Blocks[MBB]; |
494 | |
495 | for (MachineInstr &MI : *MBB) { |
496 | InstrInfo &III = Instructions[&MI]; |
497 | unsigned Opcode = MI.getOpcode(); |
498 | char Flags = 0; |
499 | |
500 | if (TII->isWQM(Opcode)) { |
501 | // If LOD is not supported WQM is not needed. |
502 | if (!ST->hasExtendedImageInsts()) |
503 | continue; |
504 | // Only generate implicit WQM if implicit derivatives are required. |
505 | // This avoids inserting unintended WQM if a shader type without |
506 | // implicit derivatives uses an image sampling instruction. |
507 | if (!HasImplicitDerivatives) |
508 | continue; |
509 | // Sampling instructions don't need to produce results for all pixels |
510 | // in a quad, they just require all inputs of a quad to have been |
511 | // computed for derivatives. |
        markInstructionUses(MI, StateWQM, Worklist);
513 | GlobalFlags |= StateWQM; |
514 | continue; |
515 | } else if (Opcode == AMDGPU::WQM) { |
516 | // The WQM intrinsic requires its output to have all the helper lanes |
517 | // correct, so we need it to be in WQM. |
518 | Flags = StateWQM; |
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
523 | continue; |
524 | } else if (Opcode == AMDGPU::STRICT_WWM) { |
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and
        // moreover it needs to be executed in WQM or Exact so that its copy
        // doesn't clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
531 | continue; |
532 | } else if (Opcode == AMDGPU::STRICT_WQM || |
533 | TII->isDualSourceBlendEXP(MI)) { |
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave, it enables all threads in quads that have at
        // least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
538 | GlobalFlags |= StateStrictWQM; |
539 | |
540 | if (Opcode == AMDGPU::STRICT_WQM) { |
          LowerToMovInstrs.push_back(&MI);
542 | } else { |
          // Dual-source blend export acts as implicit strict-WQM: its sources
          // need to be shuffled in strict WQM, but the export itself needs to
          // run in Exact mode.
546 | BBI.Needs |= StateExact; |
547 | if (!(BBI.InNeeds & StateExact)) { |
548 | BBI.InNeeds |= StateExact; |
            Worklist.push_back(MBB);
550 | } |
551 | GlobalFlags |= StateExact; |
552 | III.Disabled = StateWQM | StateStrict; |
553 | } |
554 | continue; |
555 | } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || |
556 | Opcode == AMDGPU::DS_PARAM_LOAD || |
557 | Opcode == AMDGPU::LDS_DIRECT_LOAD || |
558 | Opcode == AMDGPU::DS_DIRECT_LOAD) { |
        // Mark these StrictWQM, but only for the instruction, not its
        // operands. This avoids unnecessarily marking M0 as requiring WQM.
561 | InstrInfo &II = Instructions[&MI]; |
562 | II.Needs |= StateStrictWQM; |
563 | GlobalFlags |= StateStrictWQM; |
564 | continue; |
565 | } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || |
566 | Opcode == AMDGPU::V_SET_INACTIVE_B64) { |
567 | III.Disabled = StateStrict; |
        MachineOperand &Inactive = MI.getOperand(2);
569 | if (Inactive.isReg()) { |
570 | if (Inactive.isUndef()) { |
            LowerToCopyInstrs.push_back(&MI);
          } else {
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
          }
        }
        SetInactiveInstrs.push_back(&MI);
577 | continue; |
578 | } else if (TII->isDisableWQM(MI)) { |
579 | BBI.Needs |= StateExact; |
580 | if (!(BBI.InNeeds & StateExact)) { |
581 | BBI.InNeeds |= StateExact; |
          Worklist.push_back(MBB);
583 | } |
584 | GlobalFlags |= StateExact; |
585 | III.Disabled = StateWQM | StateStrict; |
586 | continue; |
587 | } else { |
588 | if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) { |
        LiveMaskQueries.push_back(&MI);
590 | } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR || |
591 | Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR || |
592 | Opcode == AMDGPU::SI_DEMOTE_I1) { |
        KillInstrs.push_back(&MI);
594 | BBI.NeedsLowering = true; |
595 | } else if (WQMOutputs) { |
596 | // The function is in machine SSA form, which means that physical |
597 | // VGPRs correspond to shader inputs and outputs. Inputs are |
598 | // only used, outputs are only defined. |
599 | // FIXME: is this still valid? |
600 | for (const MachineOperand &MO : MI.defs()) { |
601 | if (!MO.isReg()) |
602 | continue; |
603 | |
604 | Register Reg = MO.getReg(); |
605 | |
606 | if (!Reg.isVirtual() && |
              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
608 | Flags = StateWQM; |
609 | break; |
610 | } |
611 | } |
612 | } |
613 | |
614 | if (!Flags) |
615 | continue; |
616 | } |
617 | |
      markInstruction(MI, Flags, Worklist);
619 | GlobalFlags |= Flags; |
620 | } |
621 | } |
622 | |
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }
633 | |
634 | return GlobalFlags; |
635 | } |
636 | |
637 | void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, |
638 | std::vector<WorkItem>& Worklist) { |
639 | MachineBasicBlock *MBB = MI.getParent(); |
  // Take a copy to prevent dangling references.
  InstrInfo II = Instructions[&MI];
641 | BlockInfo &BI = Blocks[MBB]; |
642 | |
643 | // Control flow-type instructions and stores to temporary memory that are |
644 | // followed by WQM computations must themselves be in WQM. |
645 | if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) && |
646 | (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { |
647 | Instructions[&MI].Needs = StateWQM; |
648 | II.Needs = StateWQM; |
649 | } |
650 | |
651 | // Propagate to block level |
652 | if (II.Needs & StateWQM) { |
653 | BI.Needs |= StateWQM; |
654 | if (!(BI.InNeeds & StateWQM)) { |
655 | BI.InNeeds |= StateWQM; |
      Worklist.push_back(MBB);
657 | } |
658 | } |
659 | |
660 | // Propagate backwards within block |
661 | if (MachineInstr *PrevMI = MI.getPrevNode()) { |
662 | char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds; |
663 | if (!PrevMI->isPHI()) { |
664 | InstrInfo &PrevII = Instructions[PrevMI]; |
665 | if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { |
666 | PrevII.OutNeeds |= InNeeds; |
        Worklist.push_back(PrevMI);
668 | } |
669 | } |
670 | } |
671 | |
672 | // Propagate WQM flag to instruction inputs |
673 | assert(!(II.Needs & StateExact)); |
674 | |
675 | if (II.Needs != 0) |
    markInstructionUses(MI, II.Needs, Worklist);
677 | |
678 | // Ensure we process a block containing StrictWWM/StrictWQM, even if it does |
679 | // not require any WQM transitions. |
680 | if (II.Needs & StateStrictWWM) |
681 | BI.Needs |= StateStrictWWM; |
682 | if (II.Needs & StateStrictWQM) |
683 | BI.Needs |= StateStrictWQM; |
684 | } |
685 | |
686 | void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, |
687 | std::vector<WorkItem>& Worklist) { |
688 | BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references. |
689 | |
690 | // Propagate through instructions |
691 | if (!MBB.empty()) { |
692 | MachineInstr *LastMI = &*MBB.rbegin(); |
693 | InstrInfo &LastII = Instructions[LastMI]; |
694 | if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) { |
695 | LastII.OutNeeds |= BI.OutNeeds; |
      Worklist.push_back(LastMI);
697 | } |
698 | } |
699 | |
700 | // Predecessor blocks must provide for our WQM/Exact needs. |
701 | for (MachineBasicBlock *Pred : MBB.predecessors()) { |
702 | BlockInfo &PredBI = Blocks[Pred]; |
703 | if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds) |
704 | continue; |
705 | |
706 | PredBI.OutNeeds |= BI.InNeeds; |
707 | PredBI.InNeeds |= BI.InNeeds; |
    Worklist.push_back(Pred);
709 | } |
710 | |
711 | // All successors must be prepared to accept the same set of WQM/Exact data. |
712 | for (MachineBasicBlock *Succ : MBB.successors()) { |
713 | BlockInfo &SuccBI = Blocks[Succ]; |
714 | if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds) |
715 | continue; |
716 | |
717 | SuccBI.InNeeds |= BI.OutNeeds; |
    Worklist.push_back(Succ);
719 | } |
720 | } |
721 | |
722 | char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { |
723 | std::vector<WorkItem> Worklist; |
724 | char GlobalFlags = scanInstructions(MF, Worklist); |
725 | |
726 | while (!Worklist.empty()) { |
727 | WorkItem WI = Worklist.back(); |
728 | Worklist.pop_back(); |
729 | |
730 | if (WI.MI) |
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
734 | } |
735 | |
736 | return GlobalFlags; |
737 | } |
738 | |
739 | MachineBasicBlock::iterator |
740 | SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, |
741 | MachineBasicBlock::iterator Before) { |
742 | Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
743 | |
744 | MachineInstr *Save = |
745 | BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg) |
746 | .addReg(AMDGPU::SCC); |
747 | MachineInstr *Restore = |
748 | BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC) |
749 | .addReg(SaveReg); |
750 | |
  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);
754 | |
755 | return Restore; |
756 | } |
757 | |
758 | MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, |
759 | MachineInstr *TermMI) { |
760 | LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ " |
761 | << *TermMI << "\n" ); |
762 | |
763 | MachineBasicBlock *SplitBB = |
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
765 | |
766 | // Convert last instruction in block to a terminator. |
767 | // Note: this only covers the expected patterns |
768 | unsigned NewOpcode = 0; |
769 | switch (TermMI->getOpcode()) { |
770 | case AMDGPU::S_AND_B32: |
771 | NewOpcode = AMDGPU::S_AND_B32_term; |
772 | break; |
773 | case AMDGPU::S_AND_B64: |
774 | NewOpcode = AMDGPU::S_AND_B64_term; |
775 | break; |
776 | case AMDGPU::S_MOV_B32: |
777 | NewOpcode = AMDGPU::S_MOV_B32_term; |
778 | break; |
779 | case AMDGPU::S_MOV_B64: |
780 | NewOpcode = AMDGPU::S_MOV_B64_term; |
781 | break; |
782 | default: |
783 | break; |
784 | } |
785 | if (NewOpcode) |
786 | TermMI->setDesc(TII->get(NewOpcode)); |
787 | |
788 | if (SplitBB != BB) { |
789 | // Update dominator trees |
790 | using DomTreeT = DomTreeBase<MachineBasicBlock>; |
791 | SmallVector<DomTreeT::UpdateType, 16> DTUpdates; |
792 | for (MachineBasicBlock *Succ : SplitBB->successors()) { |
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->getBase().applyUpdates(DTUpdates);
    if (PDT)
      PDT->getBase().applyUpdates(DTUpdates);
801 | |
802 | // Link blocks |
803 | MachineInstr *MI = |
804 | BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH)) |
805 | .addMBB(SplitBB); |
    LIS->InsertMachineInstrInMaps(*MI);
807 | } |
808 | |
809 | return SplitBB; |
810 | } |
811 | |
812 | MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, |
813 | MachineInstr &MI) { |
814 | const DebugLoc &DL = MI.getDebugLoc(); |
815 | unsigned Opcode = 0; |
816 | |
817 | assert(MI.getOperand(0).isReg()); |
818 | |
819 | // Comparison is for live lanes; however here we compute the inverse |
820 | // (killed lanes). This is because VCMP will always generate 0 bits |
821 | // for inactive lanes so a mask of live lanes would not be correct |
822 | // inside control flow. |
823 | // Invert the comparison by swapping the operands and adjusting |
824 | // the comparison codes. |
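  //
  // For example, a SETOLT condition (live when Op0 < Op1, ordered) maps to
  // V_CMP_NGT with the operands swapped: !(Op1 > Op0) holds for Op0 >= Op1
  // and for unordered inputs, i.e. exactly the killed lanes.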
825 | |
  switch (MI.getOperand(2).getImm()) {
827 | case ISD::SETUEQ: |
828 | Opcode = AMDGPU::V_CMP_LG_F32_e64; |
829 | break; |
830 | case ISD::SETUGT: |
831 | Opcode = AMDGPU::V_CMP_GE_F32_e64; |
832 | break; |
833 | case ISD::SETUGE: |
834 | Opcode = AMDGPU::V_CMP_GT_F32_e64; |
835 | break; |
836 | case ISD::SETULT: |
837 | Opcode = AMDGPU::V_CMP_LE_F32_e64; |
838 | break; |
839 | case ISD::SETULE: |
840 | Opcode = AMDGPU::V_CMP_LT_F32_e64; |
841 | break; |
842 | case ISD::SETUNE: |
843 | Opcode = AMDGPU::V_CMP_EQ_F32_e64; |
844 | break; |
845 | case ISD::SETO: |
846 | Opcode = AMDGPU::V_CMP_O_F32_e64; |
847 | break; |
848 | case ISD::SETUO: |
849 | Opcode = AMDGPU::V_CMP_U_F32_e64; |
850 | break; |
851 | case ISD::SETOEQ: |
852 | case ISD::SETEQ: |
853 | Opcode = AMDGPU::V_CMP_NEQ_F32_e64; |
854 | break; |
855 | case ISD::SETOGT: |
856 | case ISD::SETGT: |
857 | Opcode = AMDGPU::V_CMP_NLT_F32_e64; |
858 | break; |
859 | case ISD::SETOGE: |
860 | case ISD::SETGE: |
861 | Opcode = AMDGPU::V_CMP_NLE_F32_e64; |
862 | break; |
863 | case ISD::SETOLT: |
864 | case ISD::SETLT: |
865 | Opcode = AMDGPU::V_CMP_NGT_F32_e64; |
866 | break; |
867 | case ISD::SETOLE: |
868 | case ISD::SETLE: |
869 | Opcode = AMDGPU::V_CMP_NGE_F32_e64; |
870 | break; |
871 | case ISD::SETONE: |
872 | case ISD::SETNE: |
873 | Opcode = AMDGPU::V_CMP_NLG_F32_e64; |
874 | break; |
875 | default: |
876 | llvm_unreachable("invalid ISD:SET cond code" ); |
877 | } |
878 | |
879 | // Pick opcode based on comparison type. |
880 | MachineInstr *VcmpMI; |
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);
883 | |
884 | // VCC represents lanes killed. |
885 | Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; |
886 | |
  if (TRI->isVGPR(*MRI, Op0.getReg())) {
888 | Opcode = AMDGPU::getVOPe32(Opcode); |
889 | VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0); |
890 | } else { |
891 | VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)) |
892 | .addReg(VCC, RegState::Define) |
893 | .addImm(0) // src0 modifiers |
894 | .add(Op1) |
895 | .addImm(0) // src1 modifiers |
896 | .add(Op0) |
897 | .addImm(0); // omod |
898 | } |
899 | |
900 | MachineInstr *MaskUpdateMI = |
901 | BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) |
902 | .addReg(LiveMaskReg) |
903 | .addReg(VCC); |
904 | |
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
907 | MachineInstr *EarlyTermMI = |
908 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); |
909 | |
910 | MachineInstr *ExecMaskMI = |
911 | BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC); |
912 | |
913 | assert(MBB.succ_size() == 1); |
914 | MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) |
915 | .addMBB(*MBB.succ_begin()); |
916 | |
917 | // Update live intervals |
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);
920 | |
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);
925 | |
926 | return NewTerm; |
927 | } |
928 | |
929 | MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, |
930 | MachineInstr &MI, bool IsWQM) { |
931 | const DebugLoc &DL = MI.getDebugLoc(); |
932 | MachineInstr *MaskUpdateMI = nullptr; |
933 | |
934 | const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1); |
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
937 | MachineInstr *ComputeKilledMaskMI = nullptr; |
938 | Register CndReg = !Op.isImm() ? Op.getReg() : Register(); |
939 | Register TmpReg; |
940 | |
941 | // Is this a static or dynamic kill? |
942 | if (Op.isImm()) { |
943 | if (Op.getImm() == KillVal) { |
944 | // Static: all active lanes are killed |
945 | MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) |
946 | .addReg(LiveMaskReg) |
947 | .addReg(Exec); |
948 | } else { |
949 | // Static: kill does nothing |
950 | MachineInstr *NewTerm = nullptr; |
951 | if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { |
952 | LIS->RemoveMachineInstrFromMaps(MI); |
953 | } else { |
954 | assert(MBB.succ_size() == 1); |
955 | NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) |
956 | .addMBB(*MBB.succ_begin()); |
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
958 | } |
      MBB.remove(&MI);
960 | return NewTerm; |
961 | } |
962 | } else { |
963 | if (!KillVal) { |
964 | // Op represents live lanes after kill, |
965 | // so exec mask needs to be factored in. |
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
967 | ComputeKilledMaskMI = |
968 | BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec); |
969 | MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) |
970 | .addReg(LiveMaskReg) |
971 | .addReg(TmpReg); |
972 | } else { |
973 | // Op represents lanes to kill |
974 | MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) |
975 | .addReg(LiveMaskReg) |
976 | .add(Op); |
977 | } |
978 | } |
979 | |
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
982 | MachineInstr *EarlyTermMI = |
983 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); |
984 | |
  // If we got this far, some lanes are still live; update EXEC to deactivate
  // lanes as appropriate.
987 | MachineInstr *NewTerm; |
988 | MachineInstr *WQMMaskMI = nullptr; |
989 | Register LiveMaskWQM; |
990 | if (IsDemote) { |
991 | // Demote - deactivate quads with only helper lanes |
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
993 | WQMMaskMI = |
994 | BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg); |
995 | NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec) |
996 | .addReg(Exec) |
997 | .addReg(LiveMaskWQM); |
998 | } else { |
999 | // Kill - deactivate lanes no longer in live mask |
1000 | if (Op.isImm()) { |
1001 | unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
1002 | NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); |
1003 | } else if (!IsWQM) { |
1004 | NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec) |
1005 | .addReg(Exec) |
1006 | .addReg(LiveMaskReg); |
1007 | } else { |
1008 | unsigned Opcode = KillVal ? AndN2Opc : AndOpc; |
1009 | NewTerm = |
1010 | BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op); |
1011 | } |
1012 | } |
1013 | |
1014 | // Update live intervals |
1015 | LIS->RemoveMachineInstrFromMaps(MI); |
  MBB.remove(&MI);
1017 | assert(EarlyTermMI); |
1018 | assert(MaskUpdateMI); |
1019 | assert(NewTerm); |
1020 | if (ComputeKilledMaskMI) |
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);
1027 | |
1028 | if (CndReg) { |
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1036 | |
1037 | return NewTerm; |
1038 | } |
1039 | |
1040 | // Convert a strict mode transition to a pseudo transition. |
1041 | // This still pre-allocates registers to prevent clobbering, |
1042 | // but avoids any EXEC mask changes. |
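//
// For example, an ENTER_STRICT_WQM/EXIT_STRICT_WQM pair that both begins and
// ends in WQM leaves EXEC unchanged, so it can be rewritten to
// ENTER_PSEUDO_WM/EXIT_PSEUDO_WM, keeping the register allocation constraints
// without a redundant mask save and restore.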
1043 | void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB, |
1044 | MachineInstr *Entry, |
1045 | MachineInstr *Exit) { |
1046 | assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM); |
1047 | assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM); |
1048 | |
  Register SaveOrig = Entry->getOperand(0).getReg();
1050 | |
1051 | MachineInstr *NewEntry = |
1052 | BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM)); |
1053 | MachineInstr *NewExit = |
1054 | BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM)); |
1055 | |
  LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
  Exit->eraseFromParent();

  LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
  Entry->eraseFromParent();

  LIS->removeInterval(SaveOrig);
1063 | } |
1064 | |
// Replace (or supplement) instructions accessing the live mask.
// This can only happen once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1068 | void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { |
  auto BII = Blocks.find(&MBB);
1070 | if (BII == Blocks.end()) |
1071 | return; |
1072 | |
1073 | const BlockInfo &BI = BII->second; |
1074 | if (!BI.NeedsLowering) |
1075 | return; |
1076 | |
  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1078 | |
1079 | SmallVector<MachineInstr *, 4> SplitPoints; |
1080 | char State = BI.InitialState; |
1081 | MachineInstr *StrictEntry = nullptr; |
1082 | |
  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1085 | char PreviousState = State; |
1086 | |
    if (StateTransition.count(&MI))
1088 | State = StateTransition[&MI]; |
1089 | |
1090 | MachineInstr *SplitPoint = nullptr; |
1091 | switch (MI.getOpcode()) { |
1092 | case AMDGPU::SI_DEMOTE_I1: |
1093 | case AMDGPU::SI_KILL_I1_TERMINATOR: |
      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1095 | break; |
1096 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: |
1097 | SplitPoint = lowerKillF32(MBB, MI); |
1098 | break; |
1099 | case AMDGPU::ENTER_STRICT_WQM: |
1100 | StrictEntry = PreviousState == StateWQM ? &MI : nullptr; |
1101 | break; |
1102 | case AMDGPU::EXIT_STRICT_WQM: |
1103 | if (State == StateWQM && StrictEntry) { |
1104 | // Transition WQM -> StrictWQM -> WQM detected. |
        lowerPseudoStrictMode(MBB, StrictEntry, &MI);
1106 | } |
1107 | StrictEntry = nullptr; |
1108 | break; |
1109 | case AMDGPU::ENTER_STRICT_WWM: |
1110 | case AMDGPU::EXIT_STRICT_WWM: |
1111 | StrictEntry = nullptr; |
1112 | break; |
1113 | default: |
1114 | break; |
1115 | } |
1116 | if (SplitPoint) |
      SplitPoints.push_back(SplitPoint);
1118 | } |
1119 | |
1120 | // Perform splitting after instruction scan to simplify iteration. |
1121 | if (!SplitPoints.empty()) { |
1122 | MachineBasicBlock *BB = &MBB; |
1123 | for (MachineInstr *MI : SplitPoints) { |
      BB = splitBlock(BB, MI);
1125 | } |
1126 | } |
1127 | } |
1128 | |
1129 | // Return an iterator in the (inclusive) range [First, Last] at which |
1130 | // instructions can be safely inserted, keeping in mind that some of the |
1131 | // instructions we want to add necessarily clobber SCC. |
1132 | MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( |
1133 | MachineBasicBlock &MBB, MachineBasicBlock::iterator First, |
1134 | MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { |
1135 | if (!SaveSCC) |
1136 | return PreferLast ? Last : First; |
1137 | |
1138 | LiveRange &LR = |
1139 | LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin()); |
1140 | auto MBBE = MBB.end(); |
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1145 | SlotIndex Idx = PreferLast ? LastIdx : FirstIdx; |
1146 | const LiveRange::Segment *S; |
1147 | |
1148 | for (;;) { |
1149 | S = LR.getSegmentContaining(Idx); |
1150 | if (!S) |
1151 | break; |
1152 | |
1153 | if (PreferLast) { |
1154 | SlotIndex Next = S->start.getBaseIndex(); |
1155 | if (Next < FirstIdx) |
1156 | break; |
1157 | Idx = Next; |
1158 | } else { |
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
1165 | if (Next > LastIdx) |
1166 | break; |
1167 | Idx = Next; |
1168 | } |
1169 | } |
1170 | |
1171 | MachineBasicBlock::iterator MBBI; |
1172 | |
  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1174 | MBBI = MI; |
1175 | else { |
1176 | assert(Idx == LIS->getMBBEndIdx(&MBB)); |
1177 | MBBI = MBB.end(); |
1178 | } |
1179 | |
1180 | // Move insertion point past any operations modifying EXEC. |
1181 | // This assumes that the value of SCC defined by any of these operations |
1182 | // does not need to be preserved. |
1183 | while (MBBI != Last) { |
1184 | bool IsExecDef = false; |
1185 | for (const MachineOperand &MO : MBBI->all_defs()) { |
1186 | IsExecDef |= |
1187 | MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC; |
1188 | } |
1189 | if (!IsExecDef) |
1190 | break; |
1191 | MBBI++; |
1192 | S = nullptr; |
1193 | } |
1194 | |
1195 | if (S) |
    MBBI = saveSCC(MBB, MBBI);
1197 | |
1198 | return MBBI; |
1199 | } |
1200 | |
1201 | void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, |
1202 | MachineBasicBlock::iterator Before, |
1203 | Register SaveWQM) { |
1204 | bool IsTerminator = Before == MBB.end(); |
1205 | if (!IsTerminator) { |
1206 | auto FirstTerm = MBB.getFirstTerminator(); |
1207 | if (FirstTerm != MBB.end()) { |
      SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
      SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1210 | IsTerminator = BeforeIdx > FirstTermIdx; |
1211 | } |
1212 | } |
1213 | |
1214 | MachineInstr *MI; |
1215 | |
1216 | if (SaveWQM) { |
1217 | unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc; |
1218 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM) |
1219 | .addReg(LiveMaskReg); |
1220 | } else { |
1221 | unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc; |
1222 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec) |
1223 | .addReg(Exec) |
1224 | .addReg(LiveMaskReg); |
1225 | } |
1226 | |
  LIS->InsertMachineInstrInMaps(*MI);
1228 | StateTransition[MI] = StateExact; |
1229 | } |
1230 | |
1231 | void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, |
1232 | MachineBasicBlock::iterator Before, |
1233 | Register SavedWQM) { |
1234 | MachineInstr *MI; |
1235 | |
1236 | if (SavedWQM) { |
1237 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec) |
1238 | .addReg(SavedWQM); |
1239 | } else { |
1240 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec); |
1241 | } |
1242 | |
  LIS->InsertMachineInstrInMaps(*MI);
1244 | StateTransition[MI] = StateWQM; |
1245 | } |
1246 | |
1247 | void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB, |
1248 | MachineBasicBlock::iterator Before, |
1249 | Register SaveOrig, char StrictStateNeeded) { |
1250 | MachineInstr *MI; |
1251 | assert(SaveOrig); |
1252 | assert(StrictStateNeeded == StateStrictWWM || |
1253 | StrictStateNeeded == StateStrictWQM); |
1254 | |
1255 | if (StrictStateNeeded == StateStrictWWM) { |
1256 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM), |
1257 | SaveOrig) |
1258 | .addImm(-1); |
1259 | } else { |
1260 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM), |
1261 | SaveOrig) |
1262 | .addImm(-1); |
1263 | } |
  LIS->InsertMachineInstrInMaps(*MI);
1265 | StateTransition[MI] = StrictStateNeeded; |
1266 | |
  // Mark block as needing lowering so it will be checked for unnecessary
  // transitions.
  auto BII = Blocks.find(&MBB);
1269 | if (BII != Blocks.end()) |
1270 | BII->second.NeedsLowering = true; |
1271 | } |
1272 | |
1273 | void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, |
1274 | MachineBasicBlock::iterator Before, |
1275 | Register SavedOrig, char NonStrictState, |
1276 | char CurrentStrictState) { |
1277 | MachineInstr *MI; |
1278 | |
1279 | assert(SavedOrig); |
1280 | assert(CurrentStrictState == StateStrictWWM || |
1281 | CurrentStrictState == StateStrictWQM); |
1282 | |
1283 | if (CurrentStrictState == StateStrictWWM) { |
1284 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), |
1285 | Exec) |
1286 | .addReg(SavedOrig); |
1287 | } else { |
1288 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM), |
1289 | Exec) |
1290 | .addReg(SavedOrig); |
1291 | } |
  LIS->InsertMachineInstrInMaps(*MI);
1293 | StateTransition[MI] = NonStrictState; |
1294 | } |
1295 | |
1296 | void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { |
  auto BII = Blocks.find(&MBB);
1298 | if (BII == Blocks.end()) |
1299 | return; |
1300 | |
1301 | BlockInfo &BI = BII->second; |
1302 | |
1303 | // This is a non-entry block that is WQM throughout, so no need to do |
1304 | // anything. |
1305 | if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { |
1306 | BI.InitialState = StateWQM; |
1307 | return; |
1308 | } |
1309 | |
1310 | LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) |
1311 | << ":\n" ); |
1312 | |
1313 | Register SavedWQMReg; |
1314 | Register SavedNonStrictReg; |
1315 | bool WQMFromExec = IsEntry; |
1316 | char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; |
1317 | char NonStrictState = 0; |
1318 | const TargetRegisterClass *BoolRC = TRI->getBoolRC(); |
1319 | |
1320 | auto II = MBB.getFirstNonPHI(), IE = MBB.end(); |
1321 | if (IsEntry) { |
1322 | // Skip the instruction that saves LiveMask |
1323 | if (II != IE && II->getOpcode() == AMDGPU::COPY && |
1324 | II->getOperand(1).getReg() == TRI->getExec()) |
1325 | ++II; |
1326 | } |
1327 | |
1328 | // This stores the first instruction where it's safe to switch from WQM to |
1329 | // Exact or vice versa. |
1330 | MachineBasicBlock::iterator FirstWQM = IE; |
1331 | |
1332 | // This stores the first instruction where it's safe to switch from Strict |
1333 | // mode to Exact/WQM or to switch to Strict mode. It must always be the same |
1334 | // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must |
1335 | // be safe to switch to/from WQM as well. |
1336 | MachineBasicBlock::iterator FirstStrict = IE; |
1337 | |
  // Record initial state in block information.
1339 | BI.InitialState = State; |
1340 | |
1341 | for (;;) { |
1342 | MachineBasicBlock::iterator Next = II; |
1343 | char Needs = StateExact | StateWQM; // Strict mode is disabled by default. |
1344 | char OutNeeds = 0; |
1345 | |
1346 | if (FirstWQM == IE) |
1347 | FirstWQM = II; |
1348 | |
1349 | if (FirstStrict == IE) |
1350 | FirstStrict = II; |
1351 | |
1352 | // First, figure out the allowed states (Needs) based on the propagated |
1353 | // flags. |
1354 | if (II != IE) { |
1355 | MachineInstr &MI = *II; |
1356 | |
      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
1359 | if (III != Instructions.end()) { |
1360 | if (III->second.Needs & StateStrictWWM) |
1361 | Needs = StateStrictWWM; |
1362 | else if (III->second.Needs & StateStrictWQM) |
1363 | Needs = StateStrictWQM; |
1364 | else if (III->second.Needs & StateWQM) |
1365 | Needs = StateWQM; |
1366 | else |
1367 | Needs &= ~III->second.Disabled; |
1368 | OutNeeds = III->second.OutNeeds; |
1369 | } |
1370 | } else { |
1371 | // If the instruction doesn't actually need a correct EXEC, then we can |
1372 | // safely leave Strict mode enabled. |
1373 | Needs = StateExact | StateWQM | StateStrict; |
1374 | } |
1375 | |
1376 | // Exact mode exit can occur in terminators, but must be before branches. |
1377 | if (MI.isBranch() && OutNeeds == StateExact) |
1378 | Needs = StateExact; |
1379 | |
1380 | ++Next; |
1381 | } else { |
1382 | // End of basic block |
1383 | if (BI.OutNeeds & StateWQM) |
1384 | Needs = StateWQM; |
1385 | else if (BI.OutNeeds == StateExact) |
1386 | Needs = StateExact; |
1387 | else |
1388 | Needs = StateWQM | StateExact; |
1389 | } |
1390 | |
1391 | // Now, transition if necessary. |
1392 | if (!(Needs & State)) { |
1393 | MachineBasicBlock::iterator First; |
1394 | if (State == StateStrictWWM || Needs == StateStrictWWM || |
1395 | State == StateStrictWQM || Needs == StateStrictWQM) { |
1396 | // We must switch to or from Strict mode. |
1397 | First = FirstStrict; |
1398 | } else { |
1399 | // We only need to switch to/from WQM, so we can use FirstWQM. |
1400 | First = FirstWQM; |
1401 | } |
1402 | |
1403 | // Whether we need to save SCC depends on start and end states. |
1404 | bool SaveSCC = false; |
1405 | switch (State) { |
1406 | case StateExact: |
1407 | case StateStrictWWM: |
1408 | case StateStrictWQM: |
1409 | // Exact/Strict -> Strict: save SCC |
1410 | // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec |
1411 | // Exact/Strict -> Exact: no save |
1412 | SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec); |
1413 | break; |
1414 | case StateWQM: |
1415 | // WQM -> Exact/Strict: save SCC |
1416 | SaveSCC = !(Needs & StateWQM); |
1417 | break; |
1418 | default: |
1419 | llvm_unreachable("Unknown state" ); |
1420 | break; |
1421 | } |
1422 | MachineBasicBlock::iterator Before = |
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1424 | |
1425 | if (State & StateStrict) { |
1426 | assert(State == StateStrictWWM || State == StateStrictWQM); |
1427 | assert(SavedNonStrictReg); |
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1431 | SavedNonStrictReg = 0; |
1432 | State = NonStrictState; |
1433 | } |
1434 | |
1435 | if (Needs & StateStrict) { |
1436 | NonStrictState = State; |
1437 | assert(Needs == StateStrictWWM || Needs == StateStrictWQM); |
1438 | assert(!SavedNonStrictReg); |
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1442 | State = Needs; |
1443 | |
1444 | } else { |
1445 | if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { |
1446 | if (!WQMFromExec && (OutNeeds & StateWQM)) { |
1447 | assert(!SavedWQMReg); |
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1449 | } |
1450 | |
          toExact(MBB, Before, SavedWQMReg);
1452 | State = StateExact; |
1453 | } else if (State == StateExact && (Needs & StateWQM) && |
1454 | !(Needs & StateExact)) { |
1455 | assert(WQMFromExec == (SavedWQMReg == 0)); |
1456 | |
          toWQM(MBB, Before, SavedWQMReg);
1458 | |
1459 | if (SavedWQMReg) { |
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1461 | SavedWQMReg = 0; |
1462 | } |
1463 | State = StateWQM; |
1464 | } else { |
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, in which
          // case there is nothing to do.
1468 | assert(Needs & State); |
1469 | } |
1470 | } |
1471 | } |
1472 | |
1473 | if (Needs != (StateExact | StateWQM | StateStrict)) { |
1474 | if (Needs != (StateExact | StateWQM)) |
1475 | FirstWQM = IE; |
1476 | FirstStrict = IE; |
1477 | } |
1478 | |
1479 | if (II == IE) |
1480 | break; |
1481 | |
1482 | II = Next; |
1483 | } |
1484 | assert(!SavedWQMReg); |
1485 | assert(!SavedNonStrictReg); |
1486 | } |
1487 | |
1488 | void SIWholeQuadMode::lowerLiveMaskQueries() { |
1489 | for (MachineInstr *MI : LiveMaskQueries) { |
1490 | const DebugLoc &DL = MI->getDebugLoc(); |
    Register Dest = MI->getOperand(0).getReg();
1492 | |
1493 | MachineInstr *Copy = |
1494 | BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) |
1495 | .addReg(LiveMaskReg); |
1496 | |
    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1498 | MI->eraseFromParent(); |
1499 | } |
1500 | } |
1501 | |
1502 | void SIWholeQuadMode::lowerCopyInstrs() { |
1503 | for (MachineInstr *MI : LowerToMovInstrs) { |
1504 | assert(MI->getNumExplicitOperands() == 2); |
1505 | |
    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));
1513 | |
1514 | // Check that it already implicitly depends on exec (like all VALU movs |
1515 | // should do). |
1516 | assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) { |
1517 | return MO.isUse() && MO.getReg() == AMDGPU::EXEC; |
1518 | })); |
1519 | } else { |
1520 | // Remove early-clobber and exec dependency from simple SGPR copies. |
1521 | // This allows some to be eliminated during/post RA. |
1522 | LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI); |
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
1526 | LIS->createAndComputeVirtRegInterval(Reg); |
1527 | } |
1528 | int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr); |
1529 | while (Index >= 0) { |
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1532 | } |
1533 | MI->setDesc(TII->get(AMDGPU::COPY)); |
1534 | LLVM_DEBUG(dbgs() << " -> " << *MI); |
1535 | } |
1536 | } |
1537 | for (MachineInstr *MI : LowerToCopyInstrs) { |
1538 | if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || |
1539 | MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { |
1540 | assert(MI->getNumExplicitOperands() == 3); |
      // The only reason we should be here is that V_SET_INACTIVE has an undef
      // input, so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->removeOperand(2);
      MI->untieRegOperand(1);
1547 | } else { |
1548 | assert(MI->getNumExplicitOperands() == 2); |
1549 | } |
1550 | |
1551 | unsigned CopyOp = MI->getOperand(1).isReg() |
1552 | ? (unsigned)AMDGPU::COPY |
1553 | : TII->getMovOpcode(TRI->getRegClassForOperandReg( |
1554 | *MRI, MI->getOperand(0))); |
1555 | MI->setDesc(TII->get(CopyOp)); |
1556 | } |
1557 | } |
1558 | |
1559 | void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { |
1560 | for (MachineInstr *MI : KillInstrs) { |
1561 | MachineBasicBlock *MBB = MI->getParent(); |
1562 | MachineInstr *SplitPoint = nullptr; |
1563 | switch (MI->getOpcode()) { |
1564 | case AMDGPU::SI_DEMOTE_I1: |
1565 | case AMDGPU::SI_KILL_I1_TERMINATOR: |
      SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MBB, *MI);
      break;
    default:
      continue;
    }
    if (SplitPoint)
      splitBlock(MBB, SplitPoint);
1576 | } |
1577 | } |
1578 | |
1579 | bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { |
1580 | LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName() |
1581 | << " ------------- \n" ); |
1582 | LLVM_DEBUG(MF.dump();); |
1583 | |
1584 | Instructions.clear(); |
1585 | Blocks.clear(); |
1586 | LiveMaskQueries.clear(); |
1587 | LowerToCopyInstrs.clear(); |
1588 | LowerToMovInstrs.clear(); |
1589 | KillInstrs.clear(); |
1590 | StateTransition.clear(); |
1591 | |
1592 | ST = &MF.getSubtarget<GCNSubtarget>(); |
1593 | |
1594 | TII = ST->getInstrInfo(); |
1595 | TRI = &TII->getRegisterInfo(); |
1596 | MRI = &MF.getRegInfo(); |
1597 | LIS = &getAnalysis<LiveIntervals>(); |
1598 | MDT = getAnalysisIfAvailable<MachineDominatorTree>(); |
1599 | PDT = getAnalysisIfAvailable<MachinePostDominatorTree>(); |
1600 | |
1601 | if (ST->isWave32()) { |
1602 | AndOpc = AMDGPU::S_AND_B32; |
1603 | AndTermOpc = AMDGPU::S_AND_B32_term; |
1604 | AndN2Opc = AMDGPU::S_ANDN2_B32; |
1605 | XorOpc = AMDGPU::S_XOR_B32; |
1606 | AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; |
1607 | AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term; |
1608 | WQMOpc = AMDGPU::S_WQM_B32; |
1609 | Exec = AMDGPU::EXEC_LO; |
1610 | } else { |
1611 | AndOpc = AMDGPU::S_AND_B64; |
1612 | AndTermOpc = AMDGPU::S_AND_B64_term; |
1613 | AndN2Opc = AMDGPU::S_ANDN2_B64; |
1614 | XorOpc = AMDGPU::S_XOR_B64; |
1615 | AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; |
1616 | AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term; |
1617 | WQMOpc = AMDGPU::S_WQM_B64; |
1618 | Exec = AMDGPU::EXEC; |
1619 | } |
1620 | |
1621 | const char GlobalFlags = analyzeFunction(MF); |
1622 | const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty()); |
1623 | |
1624 | LiveMaskReg = Exec; |
1625 | |
  // Shader is simple and does not need any state changes or complex lowering.
1627 | if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && |
1628 | LowerToMovInstrs.empty() && KillInstrs.empty()) { |
1629 | lowerLiveMaskQueries(); |
1630 | return !LiveMaskQueries.empty(); |
1631 | } |
1632 | |
1633 | MachineBasicBlock &Entry = MF.front(); |
1634 | MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); |
1635 | |
1636 | // Store a copy of the original live mask when required |
1637 | if (NeedsLiveMask || (GlobalFlags & StateWQM)) { |
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
1643 | } |
1644 | |
1645 | LLVM_DEBUG(printInfo()); |
1646 | |
1647 | lowerLiveMaskQueries(); |
1648 | lowerCopyInstrs(); |
1649 | |
1650 | // Shader only needs WQM |
1651 | if (GlobalFlags == StateWQM) { |
1652 | auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec) |
1653 | .addReg(Exec); |
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
  } else {
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
1662 | } |
1663 | |
1664 | // Compute live range for live mask |
1665 | if (LiveMaskReg != Exec) |
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1667 | |
1668 | // Physical registers like SCC aren't tracked by default anyway, so just |
1669 | // removing the ranges we computed is the simplest option for maintaining |
1670 | // the analysis results. |
1671 | LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC); |
1672 | |
1673 | // If we performed any kills then recompute EXEC |
1674 | if (!KillInstrs.empty()) |
1675 | LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); |
1676 | |
1677 | return true; |
1678 | } |
1679 | |