1 | //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file This pass tries to apply several peephole SDWA patterns. |
10 | /// |
11 | /// E.g. original: |
12 | /// V_LSHRREV_B32_e32 %0, 16, %1 |
13 | /// V_ADD_CO_U32_e32 %2, %0, %3 |
14 | /// V_LSHLREV_B32_e32 %4, 16, %2 |
15 | /// |
16 | /// Replace: |
17 | /// V_ADD_CO_U32_sdwa %4, %1, %3 |
18 | /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD |
19 | /// |
20 | //===----------------------------------------------------------------------===// |
21 | |
22 | #include "AMDGPU.h" |
23 | #include "GCNSubtarget.h" |
24 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
25 | #include "llvm/ADT/MapVector.h" |
26 | #include "llvm/ADT/Statistic.h" |
27 | #include "llvm/CodeGen/MachineFunctionPass.h" |
28 | #include <optional> |
29 | |
30 | using namespace llvm; |
31 | |
32 | #define DEBUG_TYPE "si-peephole-sdwa" |
33 | |
STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");
37 | |
38 | namespace { |
39 | |
40 | class SDWAOperand; |
41 | class SDWADstOperand; |
42 | |
43 | class SIPeepholeSDWA : public MachineFunctionPass { |
44 | public: |
45 | using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; |
46 | |
47 | private: |
48 | MachineRegisterInfo *MRI; |
49 | const SIRegisterInfo *TRI; |
50 | const SIInstrInfo *TII; |
51 | |
52 | MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; |
53 | MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches; |
54 | SmallVector<MachineInstr *, 8> ConvertedInstructions; |
55 | |
56 | std::optional<int64_t> foldToImm(const MachineOperand &Op) const; |
57 | |
58 | public: |
59 | static char ID; |
60 | |
61 | SIPeepholeSDWA() : MachineFunctionPass(ID) { |
62 | initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); |
63 | } |
64 | |
65 | bool runOnMachineFunction(MachineFunction &MF) override; |
66 | void matchSDWAOperands(MachineBasicBlock &MBB); |
67 | std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); |
68 | bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; |
69 | void pseudoOpConvertToVOP2(MachineInstr &MI, |
70 | const GCNSubtarget &ST) const; |
71 | bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); |
72 | void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; |
73 | |
  StringRef getPassName() const override { return "SI Peephole SDWA"; }
75 | |
76 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
77 | AU.setPreservesCFG(); |
78 | MachineFunctionPass::getAnalysisUsage(AU); |
79 | } |
80 | }; |
81 | |
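// Base class for a matched SDWA pattern. It records the operand that the
// converted instruction will use (Target) and the operand of the convertible
// instruction that it replaces (Replaced).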
82 | class SDWAOperand { |
83 | private: |
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target
86 | |
87 | public: |
88 | SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) |
89 | : Target(TargetOp), Replaced(ReplacedOp) { |
90 | assert(Target->isReg()); |
91 | assert(Replaced->isReg()); |
92 | } |
93 | |
94 | virtual ~SDWAOperand() = default; |
95 | |
96 | virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; |
97 | virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; |
98 | |
99 | MachineOperand *getTargetOperand() const { return Target; } |
100 | MachineOperand *getReplacedOperand() const { return Replaced; } |
101 | MachineInstr *getParentInst() const { return Target->getParent(); } |
102 | |
103 | MachineRegisterInfo *getMRI() const { |
104 | return &getParentInst()->getParent()->getParent()->getRegInfo(); |
105 | } |
106 | |
107 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
108 | virtual void print(raw_ostream& OS) const = 0; |
  void dump() const { print(dbgs()); }
110 | #endif |
111 | }; |
112 | |
113 | using namespace AMDGPU::SDWA; |
114 | |
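// Describes a conversion that folds the matched pattern into a source operand
// of another instruction, e.g. a 16/24-bit shift or an AND with 0xff/0xffff
// becomes a src_sel (optionally with abs/neg/sext modifiers).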
115 | class SDWASrcOperand : public SDWAOperand { |
116 | private: |
117 | SdwaSel SrcSel; |
118 | bool Abs; |
119 | bool Neg; |
120 | bool Sext; |
121 | |
122 | public: |
123 | SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, |
124 | SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, |
125 | bool Sext_ = false) |
126 | : SDWAOperand(TargetOp, ReplacedOp), |
127 | SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} |
128 | |
129 | MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; |
130 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; |
131 | |
132 | SdwaSel getSrcSel() const { return SrcSel; } |
133 | bool getAbs() const { return Abs; } |
134 | bool getNeg() const { return Neg; } |
135 | bool getSext() const { return Sext; } |
136 | |
137 | uint64_t getSrcMods(const SIInstrInfo *TII, |
138 | const MachineOperand *SrcOp) const; |
139 | |
140 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
141 | void print(raw_ostream& OS) const override; |
142 | #endif |
143 | }; |
144 | |
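// Describes a conversion that folds the matched pattern into the destination
// of another instruction by setting dst_sel and dst_unused.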
145 | class SDWADstOperand : public SDWAOperand { |
146 | private: |
147 | SdwaSel DstSel; |
148 | DstUnused DstUn; |
149 | |
150 | public: |
151 | |
152 | SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, |
153 | SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) |
154 | : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} |
155 | |
156 | MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; |
157 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; |
158 | |
159 | SdwaSel getDstSel() const { return DstSel; } |
160 | DstUnused getDstUnused() const { return DstUn; } |
161 | |
162 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
163 | void print(raw_ostream& OS) const override; |
164 | #endif |
165 | }; |
166 | |
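// Variant of SDWADstOperand for the v_or_b32 pattern: the unused part of the
// destination is preserved from another register (dst_unused:UNUSED_PRESERVE).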
167 | class SDWADstPreserveOperand : public SDWADstOperand { |
168 | private: |
169 | MachineOperand *Preserve; |
170 | |
171 | public: |
172 | SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, |
173 | MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD) |
174 | : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE), |
175 | Preserve(PreserveOp) {} |
176 | |
177 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; |
178 | |
179 | MachineOperand *getPreservedOperand() const { return Preserve; } |
180 | |
181 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
182 | void print(raw_ostream& OS) const override; |
183 | #endif |
184 | }; |
185 | |
186 | } // end anonymous namespace |
187 | |
INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
189 | |
190 | char SIPeepholeSDWA::ID = 0; |
191 | |
192 | char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID; |
193 | |
194 | FunctionPass *llvm::createSIPeepholeSDWAPass() { |
195 | return new SIPeepholeSDWA(); |
196 | } |
197 | |
198 | |
199 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
200 | static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { |
201 | switch(Sel) { |
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
209 | } |
210 | return OS; |
211 | } |
212 | |
213 | static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { |
214 | switch(Un) { |
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
218 | } |
219 | return OS; |
220 | } |
221 | |
222 | LLVM_DUMP_METHOD |
223 | void SDWASrcOperand::print(raw_ostream& OS) const { |
224 | OS << "SDWA src: " << *getTargetOperand() |
225 | << " src_sel:" << getSrcSel() |
226 | << " abs:" << getAbs() << " neg:" << getNeg() |
227 | << " sext:" << getSext() << '\n'; |
228 | } |
229 | |
230 | LLVM_DUMP_METHOD |
231 | void SDWADstOperand::print(raw_ostream& OS) const { |
232 | OS << "SDWA dst: " << *getTargetOperand() |
233 | << " dst_sel:" << getDstSel() |
234 | << " dst_unused:" << getDstUnused() << '\n'; |
235 | } |
236 | |
237 | LLVM_DUMP_METHOD |
238 | void SDWADstPreserveOperand::print(raw_ostream& OS) const { |
239 | OS << "SDWA preserve dst: " << *getTargetOperand() |
240 | << " dst_sel:" << getDstSel() |
241 | << " preserve:" << *getPreservedOperand() << '\n'; |
242 | } |
243 | |
244 | #endif |
245 | |
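// Copy the register, subregister and relevant flags (undef, kill/dead) from
// one register operand to another.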
246 | static void copyRegOperand(MachineOperand &To, const MachineOperand &From) { |
247 | assert(To.isReg() && From.isReg()); |
248 | To.setReg(From.getReg()); |
249 | To.setSubReg(From.getSubReg()); |
250 | To.setIsUndef(From.isUndef()); |
251 | if (To.isUse()) { |
252 | To.setIsKill(From.isKill()); |
253 | } else { |
254 | To.setIsDead(From.isDead()); |
255 | } |
256 | } |
257 | |
258 | static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { |
259 | return LHS.isReg() && |
260 | RHS.isReg() && |
261 | LHS.getReg() == RHS.getReg() && |
262 | LHS.getSubReg() == RHS.getSubReg(); |
263 | } |
264 | |
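// Return a use operand of the register defined by Reg if that register is
// used by exactly one instruction (ignoring debug uses) and never through a
// subregister; otherwise return nullptr.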
265 | static MachineOperand *findSingleRegUse(const MachineOperand *Reg, |
266 | const MachineRegisterInfo *MRI) { |
267 | if (!Reg->isReg() || !Reg->isDef()) |
268 | return nullptr; |
269 | |
270 | MachineOperand *ResMO = nullptr; |
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there is a use of a subregister of Reg then return nullptr.
    if (!isSameReg(UseMO, *Reg))
274 | return nullptr; |
275 | |
276 | // Check that there is only one instruction that uses Reg |
277 | if (!ResMO) { |
278 | ResMO = &UseMO; |
279 | } else if (ResMO->getParent() != UseMO.getParent()) { |
280 | return nullptr; |
281 | } |
282 | } |
283 | |
284 | return ResMO; |
285 | } |
286 | |
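// Return the explicit operand that defines the register used by Reg, if that
// register has a unique definition; implicit defs are ignored and nullptr is
// returned instead.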
287 | static MachineOperand *findSingleRegDef(const MachineOperand *Reg, |
288 | const MachineRegisterInfo *MRI) { |
289 | if (!Reg->isReg()) |
290 | return nullptr; |
291 | |
  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
293 | if (!DefInstr) |
294 | return nullptr; |
295 | |
296 | for (auto &DefMO : DefInstr->defs()) { |
297 | if (DefMO.isReg() && DefMO.getReg() == Reg->getReg()) |
298 | return &DefMO; |
299 | } |
300 | |
301 | // Ignore implicit defs. |
302 | return nullptr; |
303 | } |
304 | |
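// Combine the src modifiers already present on SrcOp's instruction with the
// abs/neg/sext modifiers recorded for this SDWA source operand.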
305 | uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, |
306 | const MachineOperand *SrcOp) const { |
307 | uint64_t Mods = 0; |
308 | const auto *MI = SrcOp->getParent(); |
309 | if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) { |
310 | if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { |
311 | Mods = Mod->getImm(); |
312 | } |
313 | } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) { |
314 | if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) { |
315 | Mods = Mod->getImm(); |
316 | } |
317 | } |
318 | if (Abs || Neg) { |
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
321 | Mods |= Abs ? SISrcMods::ABS : 0u; |
322 | Mods ^= Neg ? SISrcMods::NEG : 0u; |
323 | } else if (Sext) { |
324 | Mods |= SISrcMods::SEXT; |
325 | } |
326 | |
327 | return Mods; |
328 | } |
329 | |
330 | MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { |
  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO =
      findSingleRegUse(getReplacedOperand(), getMRI());
334 | if (!PotentialMO) |
335 | return nullptr; |
336 | |
337 | return PotentialMO->getParent(); |
338 | } |
339 | |
340 | bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { |
341 | switch (MI.getOpcode()) { |
342 | case AMDGPU::V_CVT_F32_FP8_sdwa: |
343 | case AMDGPU::V_CVT_F32_BF8_sdwa: |
344 | case AMDGPU::V_CVT_PK_F32_FP8_sdwa: |
345 | case AMDGPU::V_CVT_PK_F32_BF8_sdwa: |
346 | // Does not support input modifiers: noabs, noneg, nosext. |
347 | return false; |
348 | } |
349 | |
350 | // Find operand in instruction that matches source operand and replace it with |
351 | // target operand. Set corresponding src_sel |
352 | bool IsPreserveSrc = false; |
353 | MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); |
354 | MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); |
355 | MachineOperand *SrcMods = |
356 | TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); |
357 | assert(Src && (Src->isReg() || Src->isImm())); |
  if (!isSameReg(*Src, *getReplacedOperand())) {
359 | // If this is not src0 then it could be src1 |
360 | Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); |
361 | SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); |
362 | SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); |
363 | |
    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
366 | // It's possible this Src is a tied operand for |
367 | // UNUSED_PRESERVE, in which case we can either |
368 | // abandon the peephole attempt, or if legal we can |
369 | // copy the target operand into the tied slot |
370 | // if the preserve operation will effectively cause the same |
371 | // result by overwriting the rest of the dst. |
372 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); |
373 | MachineOperand *DstUnused = |
374 | TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); |
375 | |
376 | if (Dst && |
377 | DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { |
378 | // This will work if the tied src is accessing WORD_0, and the dst is |
379 | // writing WORD_1. Modifiers don't matter because all the bits that |
380 | // would be impacted are being overwritten by the dst. |
381 | // Any other case will not work. |
382 | SdwaSel DstSel = static_cast<SdwaSel>( |
383 | TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel)); |
384 | if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && |
385 | getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { |
386 | IsPreserveSrc = true; |
387 | auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), |
388 | AMDGPU::OpName::vdst); |
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
391 | SrcSel = nullptr; |
392 | SrcMods = nullptr; |
393 | } else { |
394 | // Not legal to convert this src |
395 | return false; |
396 | } |
397 | } |
398 | } |
399 | assert(Src && Src->isReg()); |
400 | |
401 | if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || |
402 | MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || |
403 | MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || |
404 | MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && |
        !isSameReg(*Src, *getReplacedOperand())) {
406 | // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to |
407 | // src2. This is not allowed. |
408 | return false; |
409 | } |
410 | |
411 | assert(isSameReg(*Src, *getReplacedOperand()) && |
412 | (IsPreserveSrc || (SrcSel && SrcMods))); |
413 | } |
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
418 | } |
419 | getTargetOperand()->setIsKill(false); |
420 | return true; |
421 | } |
422 | |
423 | MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { |
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
426 | MachineRegisterInfo *MRI = getMRI(); |
427 | MachineInstr *ParentMI = getParentInst(); |
428 | |
  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
430 | if (!PotentialMO) |
431 | return nullptr; |
432 | |
  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst :
       MRI->use_nodbg_instructions(PotentialMO->getReg())) {
435 | if (&UseInst != ParentMI) |
436 | return nullptr; |
437 | } |
438 | |
439 | return PotentialMO->getParent(); |
440 | } |
441 | |
442 | bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { |
443 | // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused |
444 | |
445 | if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || |
446 | MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || |
447 | MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || |
448 | MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && |
449 | getDstSel() != AMDGPU::SDWA::DWORD) { |
450 | // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD |
451 | return false; |
452 | } |
453 | |
454 | MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); |
455 | assert(Operand && |
456 | Operand->isReg() && |
457 | isSameReg(*Operand, *getReplacedOperand())); |
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused =
      TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());
465 | |
466 | // Remove original instruction because it would conflict with our new |
467 | // instruction by register definition |
468 | getParentInst()->eraseFromParent(); |
469 | return true; |
470 | } |
471 | |
472 | bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, |
473 | const SIInstrInfo *TII) { |
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter a problem with the use of a killed operand.
477 | for (MachineOperand &MO : MI.uses()) { |
478 | if (!MO.isReg()) |
479 | continue; |
    getMRI()->clearKillFlags(MO.getReg());
481 | } |
482 | |
483 | // Move MI before v_or_b32 |
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);
486 | |
487 | // Add Implicit use of preserved register |
488 | MachineInstrBuilder MIB(*MI.getMF(), MI); |
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());
492 | |
493 | // Tie dst to implicit use |
494 | MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst), |
495 | MI.getNumOperands() - 1); |
496 | |
497 | // Convert MI as any other SDWADstOperand and remove v_or_b32 |
498 | return SDWADstOperand::convertToSDWA(MI, TII); |
499 | } |
500 | |
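// Try to fold Op to an immediate: either Op itself or the value copied into
// it by a foldable copy (e.g. S_MOV_B32).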
501 | std::optional<int64_t> |
502 | SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { |
503 | if (Op.isImm()) { |
504 | return Op.getImm(); |
505 | } |
506 | |
  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.: %1 = S_MOV_B32 255;
509 | if (Op.isReg()) { |
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
512 | continue; |
513 | |
514 | const MachineInstr *DefInst = Def.getParent(); |
      if (!TII->isFoldableCopy(*DefInst))
516 | return std::nullopt; |
517 | |
      const MachineOperand &Copied = DefInst->getOperand(1);
519 | if (!Copied.isImm()) |
520 | return std::nullopt; |
521 | |
522 | return Copied.getImm(); |
523 | } |
524 | } |
525 | |
526 | return std::nullopt; |
527 | } |
528 | |
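// Try to match MI against one of the shift/and/bfe/or patterns that can be
// expressed with SDWA selects and return the corresponding SDWAOperand.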
529 | std::unique_ptr<SDWAOperand> |
530 | SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { |
531 | unsigned Opcode = MI.getOpcode(); |
532 | switch (Opcode) { |
533 | case AMDGPU::V_LSHRREV_B32_e32: |
534 | case AMDGPU::V_ASHRREV_I32_e32: |
535 | case AMDGPU::V_LSHLREV_B32_e32: |
536 | case AMDGPU::V_LSHRREV_B32_e64: |
537 | case AMDGPU::V_ASHRREV_I32_e64: |
538 | case AMDGPU::V_LSHLREV_B32_e64: { |
539 | // from: v_lshrrev_b32_e32 v1, 16/24, v0 |
540 | // to SDWA src:v0 src_sel:WORD_1/BYTE_3 |
541 | |
542 | // from: v_ashrrev_i32_e32 v1, 16/24, v0 |
543 | // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 |
544 | |
545 | // from: v_lshlrev_b32_e32 v1, 16/24, v0 |
546 | // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD |
547 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); |
    auto Imm = foldToImm(*Src0);
549 | if (!Imm) |
550 | break; |
551 | |
552 | if (*Imm != 16 && *Imm != 24) |
553 | break; |
554 | |
555 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); |
556 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); |
557 | if (!Src1->isReg() || Src1->getReg().isPhysical() || |
558 | Dst->getReg().isPhysical()) |
559 | break; |
560 | |
561 | if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || |
562 | Opcode == AMDGPU::V_LSHLREV_B32_e64) { |
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
565 | } else { |
566 | return std::make_unique<SDWASrcOperand>( |
567 | Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, |
568 | Opcode != AMDGPU::V_LSHRREV_B32_e32 && |
569 | Opcode != AMDGPU::V_LSHRREV_B32_e64); |
570 | } |
571 | break; |
572 | } |
573 | |
574 | case AMDGPU::V_LSHRREV_B16_e32: |
575 | case AMDGPU::V_ASHRREV_I16_e32: |
576 | case AMDGPU::V_LSHLREV_B16_e32: |
577 | case AMDGPU::V_LSHRREV_B16_e64: |
578 | case AMDGPU::V_ASHRREV_I16_e64: |
579 | case AMDGPU::V_LSHLREV_B16_e64: { |
580 | // from: v_lshrrev_b16_e32 v1, 8, v0 |
581 | // to SDWA src:v0 src_sel:BYTE_1 |
582 | |
583 | // from: v_ashrrev_i16_e32 v1, 8, v0 |
584 | // to SDWA src:v0 src_sel:BYTE_1 sext:1 |
585 | |
586 | // from: v_lshlrev_b16_e32 v1, 8, v0 |
587 | // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD |
588 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); |
    auto Imm = foldToImm(*Src0);
590 | if (!Imm || *Imm != 8) |
591 | break; |
592 | |
593 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); |
594 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); |
595 | |
596 | if (!Src1->isReg() || Src1->getReg().isPhysical() || |
597 | Dst->getReg().isPhysical()) |
598 | break; |
599 | |
600 | if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || |
601 | Opcode == AMDGPU::V_LSHLREV_B16_e64) { |
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
603 | } else { |
604 | return std::make_unique<SDWASrcOperand>( |
605 | Src1, Dst, BYTE_1, false, false, |
606 | Opcode != AMDGPU::V_LSHRREV_B16_e32 && |
607 | Opcode != AMDGPU::V_LSHRREV_B16_e64); |
608 | } |
609 | break; |
610 | } |
611 | |
612 | case AMDGPU::V_BFE_I32_e64: |
613 | case AMDGPU::V_BFE_U32_e64: { |
614 | // e.g.: |
615 | // from: v_bfe_u32 v1, v0, 8, 8 |
616 | // to SDWA src:v0 src_sel:BYTE_1 |
617 | |
618 | // offset | width | src_sel |
619 | // ------------------------ |
620 | // 0 | 8 | BYTE_0 |
621 | // 0 | 16 | WORD_0 |
622 | // 0 | 32 | DWORD ? |
623 | // 8 | 8 | BYTE_1 |
624 | // 16 | 8 | BYTE_2 |
625 | // 16 | 16 | WORD_1 |
626 | // 24 | 8 | BYTE_3 |
627 | |
628 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); |
    auto Offset = foldToImm(*Src1);
630 | if (!Offset) |
631 | break; |
632 | |
633 | MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); |
    auto Width = foldToImm(*Src2);
635 | if (!Width) |
636 | break; |
637 | |
638 | SdwaSel SrcSel = DWORD; |
639 | |
640 | if (*Offset == 0 && *Width == 8) |
641 | SrcSel = BYTE_0; |
642 | else if (*Offset == 0 && *Width == 16) |
643 | SrcSel = WORD_0; |
644 | else if (*Offset == 0 && *Width == 32) |
645 | SrcSel = DWORD; |
646 | else if (*Offset == 8 && *Width == 8) |
647 | SrcSel = BYTE_1; |
648 | else if (*Offset == 16 && *Width == 8) |
649 | SrcSel = BYTE_2; |
650 | else if (*Offset == 16 && *Width == 16) |
651 | SrcSel = WORD_1; |
652 | else if (*Offset == 24 && *Width == 8) |
653 | SrcSel = BYTE_3; |
654 | else |
655 | break; |
656 | |
657 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); |
658 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); |
659 | |
660 | if (!Src0->isReg() || Src0->getReg().isPhysical() || |
661 | Dst->getReg().isPhysical()) |
662 | break; |
663 | |
664 | return std::make_unique<SDWASrcOperand>( |
665 | Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64); |
666 | } |
667 | |
668 | case AMDGPU::V_AND_B32_e32: |
669 | case AMDGPU::V_AND_B32_e64: { |
670 | // e.g.: |
671 | // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 |
672 | // to SDWA src:v0 src_sel:WORD_0/BYTE_0 |
673 | |
674 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); |
675 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); |
676 | auto ValSrc = Src1; |
    auto Imm = foldToImm(*Src0);
678 | |
679 | if (!Imm) { |
      Imm = foldToImm(*Src1);
681 | ValSrc = Src0; |
682 | } |
683 | |
684 | if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) |
685 | break; |
686 | |
687 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); |
688 | |
689 | if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() || |
690 | Dst->getReg().isPhysical()) |
691 | break; |
692 | |
    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
695 | } |
696 | |
697 | case AMDGPU::V_OR_B32_e32: |
698 | case AMDGPU::V_OR_B32_e64: { |
699 | // Patterns for dst_unused:UNUSED_PRESERVE. |
700 | // e.g., from: |
701 | // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD |
    //     src0_sel:WORD_1 src1_sel:WORD_1
703 | // v_add_f16_e32 v3, v1, v2 |
704 | // v_or_b32_e32 v4, v0, v3 |
705 | // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3 |
706 | |
707 | // Check if one of operands of v_or_b32 is SDWA instruction |
708 | using CheckRetType = |
709 | std::optional<std::pair<MachineOperand *, MachineOperand *>>; |
710 | auto CheckOROperandsForSDWA = |
711 | [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType { |
712 | if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg()) |
713 | return CheckRetType(std::nullopt); |
714 | |
        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
716 | if (!Op1Def) |
717 | return CheckRetType(std::nullopt); |
718 | |
719 | MachineInstr *Op1Inst = Op1Def->getParent(); |
        if (!TII->isSDWA(*Op1Inst))
721 | return CheckRetType(std::nullopt); |
722 | |
        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
724 | if (!Op2Def) |
725 | return CheckRetType(std::nullopt); |
726 | |
727 | return CheckRetType(std::pair(Op1Def, Op2Def)); |
728 | }; |
729 | |
730 | MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0); |
731 | MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1); |
732 | assert(OrSDWA && OrOther); |
733 | auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther); |
734 | if (!Res) { |
735 | OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1); |
736 | OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0); |
737 | assert(OrSDWA && OrOther); |
738 | Res = CheckOROperandsForSDWA(OrSDWA, OrOther); |
739 | if (!Res) |
740 | break; |
741 | } |
742 | |
743 | MachineOperand *OrSDWADef = Res->first; |
744 | MachineOperand *OrOtherDef = Res->second; |
745 | assert(OrSDWADef && OrOtherDef); |
746 | |
747 | MachineInstr *SDWAInst = OrSDWADef->getParent(); |
748 | MachineInstr *OtherInst = OrOtherDef->getParent(); |
749 | |
    // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
    // their destination patterns don't overlap. A compatible instruction can
    // be either a regular instruction with compatible bitness or an SDWA
    // instruction with the correct dst_sel.
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                      / no
    // WORD_0   | no                      / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions   / BYTE_0/1, WORD_0
    // BYTE_0   | no                      / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                   / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit                / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit             / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.
765 | |
766 | // TODO: add support for non-SDWA instructions as OtherInst. |
767 | // For now this only works with SDWA instructions. For regular instructions |
768 | // there is no way to determine if the instruction writes only 8/16/24-bit |
769 | // out of full register size and all registers are at min 32-bit wide. |
    if (!TII->isSDWA(*OtherInst))
771 | break; |
772 | |
773 | SdwaSel DstSel = static_cast<SdwaSel>( |
774 | TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel)); |
775 | SdwaSel OtherDstSel = static_cast<SdwaSel>( |
776 | TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel)); |
777 | |
778 | bool DstSelAgree = false; |
779 | switch (DstSel) { |
780 | case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) || |
781 | (OtherDstSel == BYTE_3) || |
782 | (OtherDstSel == WORD_1)); |
783 | break; |
784 | case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) || |
785 | (OtherDstSel == BYTE_1) || |
786 | (OtherDstSel == WORD_0)); |
787 | break; |
788 | case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) || |
789 | (OtherDstSel == BYTE_2) || |
790 | (OtherDstSel == BYTE_3) || |
791 | (OtherDstSel == WORD_1)); |
792 | break; |
793 | case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) || |
794 | (OtherDstSel == BYTE_2) || |
795 | (OtherDstSel == BYTE_3) || |
796 | (OtherDstSel == WORD_1)); |
797 | break; |
798 | case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) || |
799 | (OtherDstSel == BYTE_1) || |
800 | (OtherDstSel == BYTE_3) || |
801 | (OtherDstSel == WORD_0)); |
802 | break; |
803 | case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) || |
804 | (OtherDstSel == BYTE_1) || |
805 | (OtherDstSel == BYTE_2) || |
806 | (OtherDstSel == WORD_0)); |
807 | break; |
808 | default: DstSelAgree = false; |
809 | } |
810 | |
811 | if (!DstSelAgree) |
812 | break; |
813 | |
814 | // Also OtherInst dst_unused should be UNUSED_PAD |
815 | DstUnused OtherDstUnused = static_cast<DstUnused>( |
816 | TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused)); |
817 | if (OtherDstUnused != DstUnused::UNUSED_PAD) |
818 | break; |
819 | |
820 | // Create DstPreserveOperand |
821 | MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); |
822 | assert(OrDst && OrDst->isReg()); |
823 | |
    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
826 | |
827 | } |
828 | } |
829 | |
830 | return std::unique_ptr<SDWAOperand>(nullptr); |
831 | } |
832 | |
833 | #if !defined(NDEBUG) |
834 | static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) { |
835 | Operand.print(OS); |
836 | return OS; |
837 | } |
838 | #endif |
839 | |
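// Scan MBB and record an SDWAOperand for every instruction that matches one
// of the peephole patterns.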
840 | void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { |
841 | for (MachineInstr &MI : MBB) { |
842 | if (auto Operand = matchSDWAOperand(MI)) { |
843 | LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); |
844 | SDWAOperands[&MI] = std::move(Operand); |
845 | ++NumSDWAPatternsFound; |
846 | } |
847 | } |
848 | } |
849 | |
850 | // Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows |
851 | // isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into |
852 | // V_ADD_CO_U32_sdwa. |
853 | // |
854 | // We are transforming from a VOP3 into a VOP2 form of the instruction. |
855 | // %19:vgpr_32 = V_AND_B32_e32 255, |
856 | // killed %16:vgpr_32, implicit $exec |
857 | // %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64 |
858 | // %26.sub0:vreg_64, %19:vgpr_32, implicit $exec |
859 | // %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 |
860 | // %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec |
861 | // |
862 | // becomes |
863 | // %47:vgpr_32 = V_ADD_CO_U32_sdwa |
864 | // 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, |
865 | // implicit-def $vcc, implicit $exec |
866 | // %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 |
867 | // %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec |
868 | void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, |
869 | const GCNSubtarget &ST) const { |
870 | int Opc = MI.getOpcode(); |
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");
873 | |
874 | // Can the candidate MI be shrunk? |
  if (!TII->canShrink(MI, *MRI))
876 | return; |
  Opc = AMDGPU::getVOPe32(Opc);
878 | // Find the related ADD instruction. |
879 | const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); |
880 | if (!Sdst) |
881 | return; |
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
883 | if (!NextOp) |
884 | return; |
885 | MachineInstr &MISucc = *NextOp->getParent(); |
886 | |
887 | // Make sure the carry in/out are subsequently unused. |
888 | MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2); |
889 | if (!CarryIn) |
890 | return; |
891 | MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst); |
892 | if (!CarryOut) |
893 | return; |
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
895 | return; |
896 | // Make sure VCC or its subregs are dead before MI. |
897 | MachineBasicBlock &MBB = *MI.getParent(); |
898 | auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25); |
899 | if (Liveness != MachineBasicBlock::LQR_Dead) |
900 | return; |
901 | // Check if VCC is referenced in range of (MI,MISucc]. |
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
903 | I != E; ++I) { |
904 | if (I->modifiesRegister(AMDGPU::VCC, TRI)) |
905 | return; |
906 | } |
907 | |
908 | // Replace MI with V_{SUB|ADD}_I32_e32 |
909 | BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc)) |
910 | .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) |
911 | .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)) |
912 | .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)) |
913 | .setMIFlags(MI.getFlags()); |
914 | |
915 | MI.eraseFromParent(); |
916 | |
917 | // Since the carry output of MI is now VCC, update its use in MISucc. |
918 | |
919 | MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI); |
920 | } |
921 | |
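// Check whether MI either already is an SDWA instruction or has an SDWA
// equivalent that the current subtarget can encode with the required operands
// and modifiers.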
922 | bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, |
923 | const GCNSubtarget &ST) const { |
924 | // Check if this is already an SDWA instruction |
925 | unsigned Opc = MI.getOpcode(); |
  if (TII->isSDWA(Opc))
927 | return true; |
928 | |
929 | // Check if this instruction has opcode that supports SDWA |
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
934 | return false; |
935 | |
936 | if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) |
937 | return false; |
938 | |
  if (TII->isVOPC(Opc)) {
940 | if (!ST.hasSDWASdst()) { |
941 | const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); |
942 | if (SDst && (SDst->getReg() != AMDGPU::VCC && |
943 | SDst->getReg() != AMDGPU::VCC_LO)) |
944 | return false; |
945 | } |
946 | |
947 | if (!ST.hasSDWAOutModsVOPC() && |
948 | (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) || |
949 | TII->hasModifiersSet(MI, AMDGPU::OpName::omod))) |
950 | return false; |
951 | |
952 | } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) || |
953 | !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { |
954 | return false; |
955 | } |
956 | |
957 | if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 || |
958 | Opc == AMDGPU::V_FMAC_F32_e32 || |
959 | Opc == AMDGPU::V_MAC_F16_e32 || |
960 | Opc == AMDGPU::V_MAC_F32_e32)) |
961 | return false; |
962 | |
963 | // Check if target supports this SDWA opcode |
  if (TII->pseudoToMCOpcode(Opc) == -1)
965 | return false; |
966 | |
967 | // FIXME: has SDWA but require handling of implicit VCC use |
968 | if (Opc == AMDGPU::V_CNDMASK_B32_e32) |
969 | return false; |
970 | |
971 | if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) { |
972 | if (!Src0->isReg() && !Src0->isImm()) |
973 | return false; |
974 | } |
975 | |
976 | if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) { |
977 | if (!Src1->isReg() && !Src1->isImm()) |
978 | return false; |
979 | } |
980 | |
981 | return true; |
982 | } |
983 | |
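// Rewrite MI into its SDWA form and then apply all matched SDWA operands to
// the new instruction. Returns false and keeps MI if no operand could be
// applied.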
984 | bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, |
985 | const SDWAOperandsVector &SDWAOperands) { |
986 | |
987 | LLVM_DEBUG(dbgs() << "Convert instruction:" << MI); |
988 | |
989 | // Convert to sdwa |
990 | int SDWAOpcode; |
991 | unsigned Opcode = MI.getOpcode(); |
992 | if (TII->isSDWA(Opcode)) { |
993 | SDWAOpcode = Opcode; |
994 | } else { |
995 | SDWAOpcode = AMDGPU::getSDWAOp(Opcode); |
996 | if (SDWAOpcode == -1) |
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
998 | } |
999 | assert(SDWAOpcode != -1); |
1000 | |
1001 | const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); |
1002 | |
1003 | // Create SDWA version of instruction MI and initialize its operands |
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());
1007 | |
1008 | // Copy dst, if it is present in original then should also be present in SDWA |
1009 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); |
1010 | if (Dst) { |
1011 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst)); |
    SDWAInst.add(*Dst);
1013 | } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) { |
1014 | assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst)); |
    SDWAInst.add(*Dst);
1016 | } else { |
1017 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst)); |
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
1019 | } |
1020 | |
  // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here).
1023 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); |
1024 | assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) && |
1025 | AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers)); |
1026 | if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) |
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);
1031 | |
1032 | // Copy src1 if present, initialize src1_modifiers. |
1033 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); |
1034 | if (Src1) { |
1035 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) && |
1036 | AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers)); |
1037 | if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)) |
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
1042 | } |
1043 | |
1044 | if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa || |
1045 | SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa || |
1046 | SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || |
1047 | SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { |
1048 | // v_mac_f16/32 has additional src2 operand tied to vdst |
1049 | MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); |
1050 | assert(Src2); |
    SDWAInst.add(*Src2);
1052 | } |
1053 | |
1054 | // Copy clamp if present, initialize otherwise |
1055 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp)); |
1056 | MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); |
1057 | if (Clamp) { |
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
1061 | } |
1062 | |
1063 | // Copy omod if present, initialize otherwise if needed |
1064 | if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) { |
1065 | MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); |
1066 | if (OMod) { |
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
1070 | } |
1071 | } |
1072 | |
1073 | // Copy dst_sel if present, initialize otherwise if needed |
1074 | if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel)) { |
1075 | MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); |
1076 | if (DstSel) { |
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1080 | } |
1081 | } |
1082 | |
1083 | // Copy dst_unused if present, initialize otherwise if needed |
1084 | if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused)) { |
1085 | MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); |
1086 | if (DstUnused) { |
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
1090 | } |
1091 | } |
1092 | |
1093 | // Copy src0_sel if present, initialize otherwise |
1094 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel)); |
1095 | MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); |
1096 | if (Src0Sel) { |
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1100 | } |
1101 | |
1102 | // Copy src1_sel if present, initialize otherwise if needed |
1103 | if (Src1) { |
1104 | assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel)); |
1105 | MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); |
1106 | if (Src1Sel) { |
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1110 | } |
1111 | } |
1112 | |
1113 | // Check for a preserved register that needs to be copied. |
1114 | auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); |
1115 | if (DstUnused && |
1116 | DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { |
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
1119 | assert(Dst && Dst->isTied()); |
1120 | assert(Opcode == static_cast<unsigned int>(SDWAOpcode)); |
1121 | // We also expect a vdst, since sdst can't preserve. |
1122 | auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst); |
1123 | assert(PreserveDstIdx != -1); |
1124 | |
    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
1130 | } |
1131 | |
1132 | // Apply all sdwa operand patterns. |
1133 | bool Converted = false; |
1134 | for (auto &Operand : SDWAOperands) { |
1135 | LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); |
1136 | // There should be no intersection between SDWA operands and potential MIs |
1137 | // e.g.: |
1138 | // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 |
1139 | // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 |
1140 | // v_add_u32 v3, v4, v2 |
1141 | // |
1142 | // In that example it is possible that we would fold 2nd instruction into |
1143 | // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that |
1144 | // was already destroyed). So if SDWAOperand is also a potential MI then do |
1145 | // not apply it. |
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
1148 | } |
1149 | if (Converted) { |
    ConvertedInstructions.push_back(SDWAInst);
1151 | } else { |
1152 | SDWAInst->eraseFromParent(); |
1153 | return false; |
1154 | } |
1155 | |
1156 | LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n'); |
1157 | ++NumSDWAInstructionsPeepholed; |
1158 | |
1159 | MI.eraseFromParent(); |
1160 | return true; |
1161 | } |
1162 | |
// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR is allowed on GFX9). Copy its scalar operands into
// VGPRs.
1165 | void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, |
1166 | const GCNSubtarget &ST) const { |
1167 | const MCInstrDesc &Desc = TII->get(MI.getOpcode()); |
1168 | unsigned ConstantBusCount = 0; |
1169 | for (MachineOperand &Op : MI.explicit_uses()) { |
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
1171 | continue; |
1172 | |
1173 | unsigned I = Op.getOperandNo(); |
1174 | if (Desc.operands()[I].RegClass == -1 || |
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
1176 | continue; |
1177 | |
1178 | if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && |
        TRI->isSGPRReg(*MRI, Op.getReg())) {
1180 | ++ConstantBusCount; |
1181 | continue; |
1182 | } |
1183 | |
1184 | Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1185 | auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), |
1186 | TII->get(AMDGPU::V_MOV_B32_e32), VGPR); |
1187 | if (Op.isImm()) |
1188 | Copy.addImm(Op.getImm()); |
1189 | else if (Op.isReg()) |
1190 | Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0, |
1191 | Op.getSubReg()); |
    Op.ChangeToRegister(VGPR, false);
1193 | } |
1194 | } |
1195 | |
1196 | bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { |
1197 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1198 | |
  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
1200 | return false; |
1201 | |
1202 | MRI = &MF.getRegInfo(); |
1203 | TRI = ST.getRegisterInfo(); |
1204 | TII = ST.getInstrInfo(); |
1205 | |
1206 | // Find all SDWA operands in MF. |
1207 | bool Ret = false; |
1208 | for (MachineBasicBlock &MBB : MF) { |
1209 | bool Changed = false; |
1210 | do { |
1211 | // Preprocess the ADD/SUB pairs so they could be SDWA'ed. |
1212 | // Look for a possible ADD or SUB that resulted from a previously lowered |
1213 | // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2 |
1214 | // lowers the pair of instructions into e32 form. |
1215 | matchSDWAOperands(MBB); |
1216 | for (const auto &OperandPair : SDWAOperands) { |
1217 | const auto &Operand = OperandPair.second; |
1218 | MachineInstr *PotentialMI = Operand->potentialToConvert(TII); |
1219 | if (PotentialMI && |
1220 | (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || |
1221 | PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64)) |
          pseudoOpConvertToVOP2(*PotentialMI, ST);
1223 | } |
1224 | SDWAOperands.clear(); |
1225 | |
1226 | // Generate potential match list. |
1227 | matchSDWAOperands(MBB); |
1228 | |
1229 | for (const auto &OperandPair : SDWAOperands) { |
1230 | const auto &Operand = OperandPair.second; |
1231 | MachineInstr *PotentialMI = Operand->potentialToConvert(TII); |
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
1234 | } |
1235 | } |
1236 | |
1237 | for (auto &PotentialPair : PotentialMatches) { |
1238 | MachineInstr &PotentialMI = *PotentialPair.first; |
        convertToSDWA(PotentialMI, PotentialPair.second);
1240 | } |
1241 | |
1242 | PotentialMatches.clear(); |
1243 | SDWAOperands.clear(); |
1244 | |
1245 | Changed = !ConvertedInstructions.empty(); |
1246 | |
1247 | if (Changed) |
1248 | Ret = true; |
1249 | while (!ConvertedInstructions.empty()) |
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
1251 | } while (Changed); |
1252 | } |
1253 | |
1254 | return Ret; |
1255 | } |
1256 | |