//===- GCNDPPCombine.cpp - optimization for DPP instructions -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
// operand. If any of the use instructions cannot be combined with the mov the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
// $res = VALU $dpp_value [, src1]
//
// to
//
// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
//                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules:
//
// if $row_mask and $bank_mask are fully enabled (0xF) and
//    $bound_ctrl==DPP_BOUND_ZERO or $old==0
// -> $combined_old = undef,
//    $combined_bound_ctrl = DPP_BOUND_ZERO
//
// if the VALU op is binary and
//    $bound_ctrl==DPP_BOUND_OFF and
//    $old==identity value (immediate) for the VALU op
// -> $combined_old = src1,
//    $combined_bound_ctrl = DPP_BOUND_OFF
//
// Otherwise cancel.
//
// The mov_dpp instruction should reside in the same BB as all its uses.
//===----------------------------------------------------------------------===//
39 | |
40 | #include "AMDGPU.h" |
41 | #include "GCNSubtarget.h" |
42 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
43 | #include "llvm/ADT/Statistic.h" |
44 | #include "llvm/CodeGen/MachineFunctionPass.h" |
45 | |
46 | using namespace llvm; |
47 | |
48 | #define DEBUG_TYPE "gcn-dpp-combine" |
49 | |
50 | STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined." ); |
51 | |
52 | namespace { |
53 | |
54 | class GCNDPPCombine : public MachineFunctionPass { |
55 | MachineRegisterInfo *MRI; |
56 | const SIInstrInfo *TII; |
57 | const GCNSubtarget *ST; |
58 | |
59 | using RegSubRegPair = TargetInstrInfo::RegSubRegPair; |
60 | |
61 | MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const; |
62 | |
63 | MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, |
64 | RegSubRegPair CombOldVGPR, |
65 | MachineOperand *OldOpnd, bool CombBCZ, |
66 | bool IsShrinkable) const; |
67 | |
68 | MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, |
69 | RegSubRegPair CombOldVGPR, bool CombBCZ, |
70 | bool IsShrinkable) const; |
71 | |
72 | bool hasNoImmOrEqual(MachineInstr &MI, |
73 | unsigned OpndName, |
74 | int64_t Value, |
75 | int64_t Mask = -1) const; |
76 | |
77 | bool combineDPPMov(MachineInstr &MI) const; |
78 | |
79 | public: |
80 | static char ID; |
81 | |
82 | GCNDPPCombine() : MachineFunctionPass(ID) { |
83 | initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry()); |
84 | } |
85 | |
86 | bool runOnMachineFunction(MachineFunction &MF) override; |
87 | |
88 | StringRef getPassName() const override { return "GCN DPP Combine" ; } |
89 | |
90 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
91 | AU.setPreservesCFG(); |
92 | MachineFunctionPass::getAnalysisUsage(AU); |
93 | } |
94 | |
95 | MachineFunctionProperties getRequiredProperties() const override { |
96 | return MachineFunctionProperties() |
97 | .set(MachineFunctionProperties::Property::IsSSA); |
98 | } |
99 | |
100 | private: |
101 | int getDPPOp(unsigned Op, bool IsShrinkable) const; |
102 | bool isShrinkable(MachineInstr &MI) const; |
103 | }; |
104 | |
105 | } // end anonymous namespace |
106 | |
107 | INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine" , false, false) |
108 | |
109 | char GCNDPPCombine::ID = 0; |
110 | |
111 | char &llvm::GCNDPPCombineID = GCNDPPCombine::ID; |
112 | |
113 | FunctionPass *llvm::createGCNDPPCombinePass() { |
114 | return new GCNDPPCombine(); |
115 | } |
116 | |
117 | bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const { |
118 | unsigned Op = MI.getOpcode(); |
119 | if (!TII->isVOP3(Opcode: Op)) { |
120 | return false; |
121 | } |
122 | if (!TII->hasVALU32BitEncoding(Opcode: Op)) { |
123 | LLVM_DEBUG(dbgs() << " Inst hasn't e32 equivalent\n" ); |
124 | return false; |
125 | } |
126 | // Do not shrink True16 instructions pre-RA to avoid the restriction in |
127 | // register allocation from only being able to use 128 VGPRs |
128 | if (AMDGPU::isTrue16Inst(Opc: Op)) |
129 | return false; |
130 | if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { |
131 | // Give up if there are any uses of the sdst in carry-out or VOPC. |
132 | // The shrunken form of the instruction would write it to vcc instead of to |
133 | // a virtual register. If we rewrote the uses the shrinking would be |
134 | // possible. |
135 | if (!MRI->use_nodbg_empty(RegNo: SDst->getReg())) |
136 | return false; |
137 | } |
138 | // check if other than abs|neg modifiers are set (opsel for example) |
139 | const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG); |
140 | if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) || |
141 | !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) || |
142 | !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) || |
143 | !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0)) { |
144 | LLVM_DEBUG(dbgs() << " Inst has non-default modifiers\n" ); |
145 | return false; |
146 | } |
147 | return true; |
148 | } |
149 | |
150 | int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const { |
151 | int DPP32 = AMDGPU::getDPPOp32(Opcode: Op); |
152 | if (IsShrinkable) { |
153 | assert(DPP32 == -1); |
154 | int E32 = AMDGPU::getVOPe32(Opcode: Op); |
155 | DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(Opcode: E32); |
156 | } |
157 | if (DPP32 != -1 && TII->pseudoToMCOpcode(Opcode: DPP32) != -1) |
158 | return DPP32; |
159 | int DPP64 = -1; |
160 | if (ST->hasVOP3DPP()) |
161 | DPP64 = AMDGPU::getDPPOp64(Opcode: Op); |
162 | if (DPP64 != -1 && TII->pseudoToMCOpcode(Opcode: DPP64) != -1) |
163 | return DPP64; |
164 | return -1; |
165 | } |
166 | |
167 | // tracks the register operand definition and returns: |
168 | // 1. immediate operand used to initialize the register if found |
169 | // 2. nullptr if the register operand is undef |
170 | // 3. the operand itself otherwise |
171 | MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { |
172 | auto *Def = getVRegSubRegDef(P: getRegSubRegPair(O: OldOpnd), MRI&: *MRI); |
173 | if (!Def) |
174 | return nullptr; |
175 | |
176 | switch(Def->getOpcode()) { |
177 | default: break; |
178 | case AMDGPU::IMPLICIT_DEF: |
179 | return nullptr; |
180 | case AMDGPU::COPY: |
181 | case AMDGPU::V_MOV_B32_e32: |
182 | case AMDGPU::V_MOV_B64_PSEUDO: |
183 | case AMDGPU::V_MOV_B64_e32: |
184 | case AMDGPU::V_MOV_B64_e64: { |
185 | auto &Op1 = Def->getOperand(i: 1); |
186 | if (Op1.isImm()) |
187 | return &Op1; |
188 | break; |
189 | } |
190 | } |
191 | return &OldOpnd; |
192 | } |
193 | |
194 | [[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx, |
195 | MachineRegisterInfo &MRI) { |
196 | int16_t RegClass = MI.getDesc().operands()[Idx].RegClass; |
197 | if (RegClass == -1) |
198 | return 0; |
199 | |
200 | const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); |
201 | return TRI->getRegSizeInBits(RC: *TRI->getRegClass(i: RegClass)); |
202 | } |
203 | |
204 | MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, |
205 | MachineInstr &MovMI, |
206 | RegSubRegPair CombOldVGPR, |
207 | bool CombBCZ, |
208 | bool IsShrinkable) const { |
209 | assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || |
210 | MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || |
211 | MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); |
212 | |
213 | bool HasVOP3DPP = ST->hasVOP3DPP(); |
214 | auto OrigOp = OrigMI.getOpcode(); |
215 | auto DPPOp = getDPPOp(Op: OrigOp, IsShrinkable); |
216 | if (DPPOp == -1) { |
217 | LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n" ); |
218 | return nullptr; |
219 | } |
220 | int OrigOpE32 = AMDGPU::getVOPe32(Opcode: OrigOp); |
221 | // Prior checks cover Mask with VOPC condition, but not on purpose |
222 | auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask); |
223 | assert(RowMaskOpnd && RowMaskOpnd->isImm()); |
224 | auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask); |
225 | assert(BankMaskOpnd && BankMaskOpnd->isImm()); |
226 | const bool MaskAllLanes = |
227 | RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF; |
228 | (void)MaskAllLanes; |
229 | assert((MaskAllLanes || |
230 | !(TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 && |
231 | TII->isVOPC(OrigOpE32)))) && |
232 | "VOPC cannot form DPP unless mask is full" ); |
233 | |
234 | auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI, |
235 | OrigMI.getDebugLoc(), TII->get(DPPOp)) |
236 | .setMIFlags(OrigMI.getFlags()); |
237 | |
238 | bool Fail = false; |
239 | do { |
240 | int NumOperands = 0; |
241 | if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) { |
242 | DPPInst.add(*Dst); |
243 | ++NumOperands; |
244 | } |
245 | if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) { |
246 | if (TII->isOperandLegal(MI: *DPPInst.getInstr(), OpIdx: NumOperands, MO: SDst)) { |
247 | DPPInst.add(*SDst); |
248 | ++NumOperands; |
249 | } |
250 | // If we shrunk a 64bit vop3b to 32bits, just ignore the sdst |
251 | } |
252 | |
253 | const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); |
254 | if (OldIdx != -1) { |
255 | assert(OldIdx == NumOperands); |
256 | assert(isOfRegClass( |
257 | CombOldVGPR, |
258 | *MRI->getRegClass( |
259 | TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()), |
260 | *MRI)); |
261 | auto *Def = getVRegSubRegDef(P: CombOldVGPR, MRI&: *MRI); |
262 | DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef, |
263 | CombOldVGPR.SubReg); |
264 | ++NumOperands; |
265 | } else if (TII->isVOPC(Opcode: DPPOp) || (TII->isVOP3(Opcode: DPPOp) && OrigOpE32 != -1 && |
266 | TII->isVOPC(Opcode: OrigOpE32))) { |
267 | // VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand |
268 | // because they write to SGPRs not VGPRs |
269 | } else { |
270 | // TODO: this discards MAC/FMA instructions for now, let's add it later |
271 | LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction," |
272 | " TBD\n" ); |
273 | Fail = true; |
274 | break; |
275 | } |
276 | |
277 | auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers); |
278 | if (Mod0) { |
279 | assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, |
280 | AMDGPU::OpName::src0_modifiers)); |
281 | assert(HasVOP3DPP || |
282 | (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); |
283 | DPPInst.addImm(Mod0->getImm()); |
284 | ++NumOperands; |
285 | } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src0_modifiers)) { |
286 | DPPInst.addImm(0); |
287 | ++NumOperands; |
288 | } |
289 | auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); |
290 | assert(Src0); |
291 | int Src0Idx = NumOperands; |
292 | if (!TII->isOperandLegal(MI: *DPPInst.getInstr(), OpIdx: NumOperands, MO: Src0)) { |
293 | LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n" ); |
294 | Fail = true; |
295 | break; |
296 | } |
297 | DPPInst.add(*Src0); |
298 | DPPInst->getOperand(NumOperands).setIsKill(false); |
299 | ++NumOperands; |
300 | |
301 | auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers); |
302 | if (Mod1) { |
303 | assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, |
304 | AMDGPU::OpName::src1_modifiers)); |
305 | assert(HasVOP3DPP || |
306 | (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); |
307 | DPPInst.addImm(Mod1->getImm()); |
308 | ++NumOperands; |
309 | } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1_modifiers)) { |
310 | DPPInst.addImm(0); |
311 | ++NumOperands; |
312 | } |
313 | auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); |
314 | if (Src1) { |
315 | int OpNum = NumOperands; |
316 | // If subtarget does not support SGPRs for src1 operand then the |
317 | // requirements are the same as for src0. We check src0 instead because |
318 | // pseudos are shared between subtargets and allow SGPR for src1 on all. |
319 | if (!ST->hasDPPSrc1SGPR()) { |
320 | assert(getOperandSize(*DPPInst, Src0Idx, *MRI) == |
321 | getOperandSize(*DPPInst, NumOperands, *MRI) && |
322 | "Src0 and Src1 operands should have the same size" ); |
323 | OpNum = Src0Idx; |
324 | } |
325 | if (!TII->isOperandLegal(MI: *DPPInst.getInstr(), OpIdx: OpNum, MO: Src1)) { |
326 | LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n" ); |
327 | Fail = true; |
328 | break; |
329 | } |
330 | DPPInst.add(*Src1); |
331 | ++NumOperands; |
332 | } |
333 | |
334 | auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers); |
335 | if (Mod2) { |
336 | assert(NumOperands == |
337 | AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers)); |
338 | assert(HasVOP3DPP || |
339 | (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); |
340 | DPPInst.addImm(Mod2->getImm()); |
341 | ++NumOperands; |
342 | } |
343 | auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2); |
344 | if (Src2) { |
345 | if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) || |
346 | !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) { |
347 | LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n" ); |
348 | Fail = true; |
349 | break; |
350 | } |
351 | DPPInst.add(*Src2); |
352 | ++NumOperands; |
353 | } |
354 | |
355 | if (HasVOP3DPP) { |
356 | auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp); |
357 | if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) { |
358 | DPPInst.addImm(ClampOpr->getImm()); |
359 | } |
360 | auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in); |
361 | if (VdstInOpr && |
362 | AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::vdst_in)) { |
363 | DPPInst.add(*VdstInOpr); |
364 | } |
365 | auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod); |
366 | if (OmodOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::omod)) { |
367 | DPPInst.addImm(OmodOpr->getImm()); |
368 | } |
369 | // Validate OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to |
370 | // all 1. |
371 | if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) { |
372 | int64_t OpSel = 0; |
373 | OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0); |
374 | OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0); |
375 | OpSel |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0); |
376 | if (Mod0 && TII->isVOP3(MI: OrigMI) && !TII->isVOP3P(MI: OrigMI)) |
377 | OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3; |
378 | |
379 | if (OpSel != 0) { |
380 | LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n" ); |
381 | Fail = true; |
382 | break; |
383 | } |
384 | if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel)) |
385 | DPPInst.addImm(OpSel); |
386 | } |
387 | if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) { |
388 | int64_t OpSelHi = 0; |
389 | OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0); |
390 | OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0); |
391 | OpSelHi |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0); |
392 | |
393 | // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check |
394 | // the bitmask for 3 op_sel_hi bits set |
395 | assert(Src2 && "Expected vop3p with 3 operands" ); |
396 | if (OpSelHi != 7) { |
397 | LLVM_DEBUG(dbgs() << " failed: op_sel_hi must be all set to one\n" ); |
398 | Fail = true; |
399 | break; |
400 | } |
401 | if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel_hi)) |
402 | DPPInst.addImm(OpSelHi); |
403 | } |
404 | auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo); |
405 | if (NegOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_lo)) { |
406 | DPPInst.addImm(NegOpr->getImm()); |
407 | } |
408 | auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi); |
409 | if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) { |
410 | DPPInst.addImm(NegHiOpr->getImm()); |
411 | } |
412 | } |
413 | DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); |
414 | DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); |
415 | DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); |
416 | DPPInst.addImm(CombBCZ ? 1 : 0); |
417 | } while (false); |
418 | |
419 | if (Fail) { |
420 | DPPInst.getInstr()->eraseFromParent(); |
421 | return nullptr; |
422 | } |
423 | LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr()); |
424 | return DPPInst.getInstr(); |
425 | } |
426 | |
427 | static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) { |
428 | assert(OldOpnd->isImm()); |
429 | switch (OrigMIOp) { |
430 | default: break; |
431 | case AMDGPU::V_ADD_U32_e32: |
432 | case AMDGPU::V_ADD_U32_e64: |
433 | case AMDGPU::V_ADD_CO_U32_e32: |
434 | case AMDGPU::V_ADD_CO_U32_e64: |
435 | case AMDGPU::V_OR_B32_e32: |
436 | case AMDGPU::V_OR_B32_e64: |
437 | case AMDGPU::V_SUBREV_U32_e32: |
438 | case AMDGPU::V_SUBREV_U32_e64: |
439 | case AMDGPU::V_SUBREV_CO_U32_e32: |
440 | case AMDGPU::V_SUBREV_CO_U32_e64: |
441 | case AMDGPU::V_MAX_U32_e32: |
442 | case AMDGPU::V_MAX_U32_e64: |
443 | case AMDGPU::V_XOR_B32_e32: |
444 | case AMDGPU::V_XOR_B32_e64: |
445 | if (OldOpnd->getImm() == 0) |
446 | return true; |
447 | break; |
448 | case AMDGPU::V_AND_B32_e32: |
449 | case AMDGPU::V_AND_B32_e64: |
450 | case AMDGPU::V_MIN_U32_e32: |
451 | case AMDGPU::V_MIN_U32_e64: |
452 | if (static_cast<uint32_t>(OldOpnd->getImm()) == |
453 | std::numeric_limits<uint32_t>::max()) |
454 | return true; |
455 | break; |
456 | case AMDGPU::V_MIN_I32_e32: |
457 | case AMDGPU::V_MIN_I32_e64: |
458 | if (static_cast<int32_t>(OldOpnd->getImm()) == |
459 | std::numeric_limits<int32_t>::max()) |
460 | return true; |
461 | break; |
462 | case AMDGPU::V_MAX_I32_e32: |
463 | case AMDGPU::V_MAX_I32_e64: |
464 | if (static_cast<int32_t>(OldOpnd->getImm()) == |
465 | std::numeric_limits<int32_t>::min()) |
466 | return true; |
467 | break; |
468 | case AMDGPU::V_MUL_I32_I24_e32: |
469 | case AMDGPU::V_MUL_I32_I24_e64: |
470 | case AMDGPU::V_MUL_U32_U24_e32: |
471 | case AMDGPU::V_MUL_U32_U24_e64: |
472 | if (OldOpnd->getImm() == 1) |
473 | return true; |
474 | break; |
475 | } |
476 | return false; |
477 | } |
478 | |
479 | MachineInstr *GCNDPPCombine::createDPPInst( |
480 | MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR, |
481 | MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const { |
482 | assert(CombOldVGPR.Reg); |
483 | if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) { |
484 | auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); |
485 | if (!Src1 || !Src1->isReg()) { |
486 | LLVM_DEBUG(dbgs() << " failed: no src1 or it isn't a register\n" ); |
487 | return nullptr; |
488 | } |
489 | if (!isIdentityValue(OrigMIOp: OrigMI.getOpcode(), OldOpnd: OldOpndValue)) { |
490 | LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n" ); |
491 | return nullptr; |
492 | } |
493 | CombOldVGPR = getRegSubRegPair(*Src1); |
494 | auto MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); |
495 | const TargetRegisterClass *RC = MRI->getRegClass(Reg: MovDst->getReg()); |
496 | if (!isOfRegClass(P: CombOldVGPR, TRC: *RC, MRI&: *MRI)) { |
497 | LLVM_DEBUG(dbgs() << " failed: src1 has wrong register class\n" ); |
498 | return nullptr; |
499 | } |
500 | } |
501 | return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable); |
502 | } |
503 | |
504 | // returns true if MI doesn't have OpndName immediate operand or the |
505 | // operand has Value |
506 | bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, |
507 | int64_t Value, int64_t Mask) const { |
508 | auto *Imm = TII->getNamedOperand(MI, OperandName: OpndName); |
509 | if (!Imm) |
510 | return true; |
511 | |
512 | assert(Imm->isImm()); |
513 | return (Imm->getImm() & Mask) == Value; |
514 | } |
515 | |
516 | bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { |
517 | assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || |
518 | MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || |
519 | MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); |
520 | LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); |
521 | |
522 | auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); |
523 | assert(DstOpnd && DstOpnd->isReg()); |
524 | auto DPPMovReg = DstOpnd->getReg(); |
525 | if (DPPMovReg.isPhysical()) { |
526 | LLVM_DEBUG(dbgs() << " failed: dpp move writes physreg\n" ); |
527 | return false; |
528 | } |
529 | if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) { |
530 | LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" |
531 | " for all uses\n" ); |
532 | return false; |
533 | } |
534 | |
535 | if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || |
536 | MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { |
537 | auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); |
538 | assert(DppCtrl && DppCtrl->isImm()); |
539 | if (!AMDGPU::isLegalDPALU_DPPControl(DC: DppCtrl->getImm())) { |
540 | LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported" |
541 | " control value\n" ); |
542 | // Let it split, then control may become legal. |
543 | return false; |
544 | } |
545 | } |
546 | |
547 | auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask); |
548 | assert(RowMaskOpnd && RowMaskOpnd->isImm()); |
549 | auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask); |
550 | assert(BankMaskOpnd && BankMaskOpnd->isImm()); |
551 | const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF && |
552 | BankMaskOpnd->getImm() == 0xF; |
553 | |
554 | auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl); |
555 | assert(BCZOpnd && BCZOpnd->isImm()); |
556 | bool BoundCtrlZero = BCZOpnd->getImm(); |
557 | |
558 | auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old); |
559 | auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); |
560 | assert(OldOpnd && OldOpnd->isReg()); |
561 | assert(SrcOpnd && SrcOpnd->isReg()); |
562 | if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) { |
563 | LLVM_DEBUG(dbgs() << " failed: dpp move reads physreg\n" ); |
564 | return false; |
565 | } |
566 | |
567 | auto * const OldOpndValue = getOldOpndValue(OldOpnd&: *OldOpnd); |
568 | // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else |
569 | // We could use: assert(!OldOpndValue || OldOpndValue->isImm()) |
570 | // but the third option is used to distinguish undef from non-immediate |
571 | // to reuse IMPLICIT_DEF instruction later |
572 | assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd); |
573 | |
574 | bool CombBCZ = false; |
575 | |
576 | if (MaskAllLanes && BoundCtrlZero) { // [1] |
577 | CombBCZ = true; |
578 | } else { |
579 | if (!OldOpndValue || !OldOpndValue->isImm()) { |
580 | LLVM_DEBUG(dbgs() << " failed: the DPP mov isn't combinable\n" ); |
581 | return false; |
582 | } |
583 | |
584 | if (OldOpndValue->getImm() == 0) { |
585 | if (MaskAllLanes) { |
586 | assert(!BoundCtrlZero); // by check [1] |
587 | CombBCZ = true; |
588 | } |
589 | } else if (BoundCtrlZero) { |
590 | assert(!MaskAllLanes); // by check [1] |
591 | LLVM_DEBUG(dbgs() << |
592 | " failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n" ); |
593 | return false; |
594 | } |
595 | } |
596 | |
597 | LLVM_DEBUG(dbgs() << " old=" ; |
598 | if (!OldOpndValue) |
599 | dbgs() << "undef" ; |
600 | else |
601 | dbgs() << *OldOpndValue; |
602 | dbgs() << ", bound_ctrl=" << CombBCZ << '\n'); |
603 | |
604 | SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs; |
605 | DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos; |
606 | auto CombOldVGPR = getRegSubRegPair(*OldOpnd); |
607 | // try to reuse previous old reg if its undefined (IMPLICIT_DEF) |
608 | if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef |
609 | const TargetRegisterClass *RC = MRI->getRegClass(Reg: DPPMovReg); |
610 | CombOldVGPR = RegSubRegPair( |
611 | MRI->createVirtualRegister(RegClass: RC)); |
612 | auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(), |
613 | TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg); |
614 | DPPMIs.push_back(Elt: UndefInst.getInstr()); |
615 | } |
616 | |
617 | OrigMIs.push_back(Elt: &MovMI); |
618 | bool Rollback = true; |
619 | SmallVector<MachineOperand*, 16> Uses; |
620 | |
621 | for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) { |
622 | Uses.push_back(&Use); |
623 | } |
624 | |
625 | while (!Uses.empty()) { |
626 | MachineOperand *Use = Uses.pop_back_val(); |
627 | Rollback = true; |
628 | |
629 | auto &OrigMI = *Use->getParent(); |
630 | LLVM_DEBUG(dbgs() << " try: " << OrigMI); |
631 | |
632 | auto OrigOp = OrigMI.getOpcode(); |
633 | assert((TII->get(OrigOp).getSize() != 4 || !AMDGPU::isTrue16Inst(OrigOp)) && |
634 | "There should not be e32 True16 instructions pre-RA" ); |
635 | if (OrigOp == AMDGPU::REG_SEQUENCE) { |
636 | Register FwdReg = OrigMI.getOperand(i: 0).getReg(); |
637 | unsigned FwdSubReg = 0; |
638 | |
639 | if (execMayBeModifiedBeforeAnyUse(MRI: *MRI, VReg: FwdReg, DefMI: OrigMI)) { |
640 | LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" |
641 | " for all uses\n" ); |
642 | break; |
643 | } |
644 | |
645 | unsigned OpNo, E = OrigMI.getNumOperands(); |
646 | for (OpNo = 1; OpNo < E; OpNo += 2) { |
647 | if (OrigMI.getOperand(i: OpNo).getReg() == DPPMovReg) { |
648 | FwdSubReg = OrigMI.getOperand(i: OpNo + 1).getImm(); |
649 | break; |
650 | } |
651 | } |
652 | |
653 | if (!FwdSubReg) |
654 | break; |
655 | |
656 | for (auto &Op : MRI->use_nodbg_operands(Reg: FwdReg)) { |
657 | if (Op.getSubReg() == FwdSubReg) |
658 | Uses.push_back(Elt: &Op); |
659 | } |
660 | RegSeqWithOpNos[&OrigMI].push_back(Elt: OpNo); |
661 | continue; |
662 | } |
663 | |
664 | bool IsShrinkable = isShrinkable(MI&: OrigMI); |
665 | if (!(IsShrinkable || |
666 | ((TII->isVOP3P(Opcode: OrigOp) || TII->isVOPC(Opcode: OrigOp) || |
667 | TII->isVOP3(Opcode: OrigOp)) && |
668 | ST->hasVOP3DPP()) || |
669 | TII->isVOP1(Opcode: OrigOp) || TII->isVOP2(Opcode: OrigOp))) { |
670 | LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3/3P/C\n" ); |
671 | break; |
672 | } |
673 | if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) { |
674 | LLVM_DEBUG(dbgs() << " failed: can't combine v_cmpx\n" ); |
675 | break; |
676 | } |
677 | |
678 | auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0); |
679 | auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); |
680 | if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1] |
681 | LLVM_DEBUG(dbgs() << " failed: no suitable operands\n" ); |
682 | break; |
683 | } |
684 | |
685 | auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2); |
686 | assert(Src0 && "Src1 without Src0?" ); |
687 | if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) || |
688 | (Src2 && Src2->isIdenticalTo(*Src0)))) || |
689 | (Use == Src1 && (Src1->isIdenticalTo(*Src0) || |
690 | (Src2 && Src2->isIdenticalTo(*Src1))))) { |
691 | LLVM_DEBUG( |
692 | dbgs() |
693 | << " " << OrigMI |
694 | << " failed: DPP register is used more than once per instruction\n" ); |
695 | break; |
696 | } |
697 | |
698 | LLVM_DEBUG(dbgs() << " combining: " << OrigMI); |
699 | if (Use == Src0) { |
700 | if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, |
701 | OldOpndValue, CombBCZ, IsShrinkable)) { |
702 | DPPMIs.push_back(Elt: DPPInst); |
703 | Rollback = false; |
704 | } |
705 | } else { |
706 | assert(Use == Src1 && OrigMI.isCommutable()); // by check [1] |
707 | auto *BB = OrigMI.getParent(); |
708 | auto *NewMI = BB->getParent()->CloneMachineInstr(Orig: &OrigMI); |
709 | BB->insert(I: OrigMI, MI: NewMI); |
710 | if (TII->commuteInstruction(*NewMI)) { |
711 | LLVM_DEBUG(dbgs() << " commuted: " << *NewMI); |
712 | if (auto *DPPInst = |
713 | createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ, |
714 | IsShrinkable)) { |
715 | DPPMIs.push_back(Elt: DPPInst); |
716 | Rollback = false; |
717 | } |
718 | } else |
719 | LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n" ); |
720 | NewMI->eraseFromParent(); |
721 | } |
722 | if (Rollback) |
723 | break; |
724 | OrigMIs.push_back(Elt: &OrigMI); |
725 | } |
726 | |
727 | Rollback |= !Uses.empty(); |
728 | |
729 | for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs)) |
730 | MI->eraseFromParent(); |
731 | |
732 | if (!Rollback) { |
733 | for (auto &S : RegSeqWithOpNos) { |
734 | if (MRI->use_nodbg_empty(RegNo: S.first->getOperand(i: 0).getReg())) { |
735 | S.first->eraseFromParent(); |
736 | continue; |
737 | } |
738 | while (!S.second.empty()) |
739 | S.first->getOperand(i: S.second.pop_back_val()).setIsUndef(); |
740 | } |
741 | } |
742 | |
743 | return !Rollback; |
744 | } |
745 | |
746 | bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { |
747 | ST = &MF.getSubtarget<GCNSubtarget>(); |
748 | if (!ST->hasDPP() || skipFunction(F: MF.getFunction())) |
749 | return false; |
750 | |
751 | MRI = &MF.getRegInfo(); |
752 | TII = ST->getInstrInfo(); |
753 | |
754 | bool Changed = false; |
755 | for (auto &MBB : MF) { |
756 | for (MachineInstr &MI : llvm::make_early_inc_range(Range: llvm::reverse(C&: MBB))) { |
757 | if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) { |
758 | Changed = true; |
759 | ++NumDPPMovsCombined; |
760 | } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || |
761 | MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { |
762 | if (ST->hasDPALU_DPP() && combineDPPMov(MovMI&: MI)) { |
763 | Changed = true; |
764 | ++NumDPPMovsCombined; |
765 | } else { |
766 | auto Split = TII->expandMovDPP64(MI); |
767 | for (auto *M : {Split.first, Split.second}) { |
768 | if (M && combineDPPMov(MovMI&: *M)) |
769 | ++NumDPPMovsCombined; |
770 | } |
771 | Changed = true; |
772 | } |
773 | } |
774 | } |
775 | } |
776 | return Changed; |
777 | } |
778 | |