//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
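/// For example, an unmodified "v_add_f32_e64 v0, v1, v2" (8 bytes) can use the
/// VOP2 encoding "v_add_f32_e32 v0, v1, v2" (4 bytes) instead.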
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {}

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
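///
/// For example, assuming the literal is legal for src0:
///   v_mov_b32 v1, 0x1234
///   v_add_f32_e32 v0, v1, v2
/// becomes
///   v_add_f32_e32 v0, 0x1234, v2
/// and the now-dead v_mov_b32 is erased.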
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm()) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, /*TryToCommute=*/false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
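/// For instance, 0x80000000 (a literal that would cost 4 extra bytes) is the
/// bit reverse of 1, which is an inline immediate.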
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

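// Rewrite an S_CMP_* with a literal into the SOPK form, which carries a 16-bit
// immediate in the instruction itself. A sketch of the transform, assuming the
// literal fits in 16 bits:
//   s_cmp_eq_u32 s0, 0x1234   (8 bytes: opcode + literal)
// becomes
//   s_cmpk_eq_u32 s0, 0x1234  (4 bytes)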
void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  if (!ST->hasSCmpK())
    return;

  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
        Src1.setImm(SignExtend32(Src1.getImm(), 32));
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((SIInstrInfo::sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!SIInstrInfo::sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    if (!SIInstrInfo::sopkIsZext(SOPKOpc))
      Src1.setImm(SignExtend64(Src1.getImm(), 32));
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
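// e.g. an NSA image_sample whose address registers [v4, v5, v6] happen to be
// consecutive can use the default encoding and address v[4:6] instead, which
// is typically a shorter instruction.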
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Unimplemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < EndVAddr; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (EndVAddr - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
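// e.g. for a non-inline literal K (illustrative values):
//   v_fma_f32 v0, v1, v2, K  ->  v_fmaak_f32 v0, v1, v2, K  (Dst = v1 * v2 + K)
//   v_fma_f32 v0, v1, K, v2  ->  v_fmamk_f32 v0, v1, K, v2  (Dst = v1 * K + v2)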
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!MF->getProperties().hasProperty(
          MachineFunctionProperties::Property::NoVRegs))
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
                                          : AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
                                          : AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
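///
/// Illustrative examples (when the destination and source register match):
///   s_and_b32 s0, s0, 0xffffffef  ->  s_bitset0_b32 s0, 4
///   s_and_b32 s0, s0, 0xffffffc0  ->  s_andn2_b32 s0, s0, 63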
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = llvm::countr_one(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = llvm::countr_zero(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns a pointer to the next valid instruction if it was able to create a
// v_swap_b32.
//
// This should not be done too early, so as not to defeat folding that could
// remove the matched moves. It is preferably done before RA, to free the saved
// registers, but may also run after RA, which can itself insert copies.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg)
                     .getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
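// e.g. a v_add_co_u32_e64 whose carry-out is never read can write null rather
// than tying up an SGPR (or an SGPR pair in wave64) for the dead result.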
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.
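        // e.g. v_mov_b32_e32 v0, 0x80000000 can become v_bfrev_b32_e32 v0, 1,
        // since 0x80000000 is 1 bit-reversed.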

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
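      // e.g. s_add_i32 s0, s0, 0x1234 can become s_addk_i32 s0, 0x1234 when
      // the destination and first source are the same register.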
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            Src1->setImm(SignExtend64(Src1->getImm(), 32));
            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
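      // e.g. s_mov_b32 s0, 0xffff8000 can become s_movk_i32 s0, 0x8000, since
      // the 16-bit immediate is sign-extended.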
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src)) {
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
            Src.setImm(SignExtend64(Src.getImm(), 32));
          } else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as the sdst to
        // get a 32-bit form, try to replace a dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      if (ST->hasVOP3Literal() &&
          !MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs))
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg, /*TRI=*/nullptr)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}