//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
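  // (only src0 is inspected here; if the fold fails, the instruction is
  // commuted below and this function recurses to try the new src0).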
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm()) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

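  // For example, the literal 0x80000000 (a sign-bit mask) is not an inline
  // constant, but its bit-reverse is 1, which is; callers then materialize the
  // reversed value with V_BFREV_B32 / S_BREV_B32.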
  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  if (!ST->hasSCmpK())
    return;

  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
        Src1.setImm(SignExtend32(Src1.getImm(), 32));
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((SIInstrInfo::sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!SIInstrInfo::sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    if (!SIInstrInfo::sopkIsZext(SOPKOpc))
      Src1.setImm(SignExtend64(Src1.getImm(), 32));
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

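  // Pick the contiguous VGPR tuple class that covers all of the address
  // dwords; counts above 12 dwords fall back to the 16-dword (512-bit) class.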
  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < EndVAddr; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (EndVAddr - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!MF->getProperties().hasProperty(
          MachineFunctionProperties::Property::NoVRegs))
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

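  // FMAAK/MADAK take the literal as the addend (dst = src0 * src1 + K), while
  // FMAMK/MADMK take it as a multiplicand (dst = src0 * K + src1).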
  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
                                          : AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
                                          : AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

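  // For example, s_and_b32 s0, s0, 0xffffffbf (clear bit 6) becomes
  // s_bitset0_b32 s0, 6, and s_or_b32 s0, s0, 0x40 becomes s_bitset1_b32 s0, 6.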
  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = llvm::countr_one(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = llvm::countr_zero(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns the next valid instruction pointer if it was able to create a
// v_swap_b32.
//
// This should not be done too early, so as not to prevent possible folding
// that may remove the matched moves. It should preferably be done before RA to
// release saved registers, and also possibly after RA, which can insert copies
// too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

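    // V_SWAP_B32 exchanges a single dword, so wider values are swapped one
    // 32-bit subregister at a time.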
    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg)
                     .getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

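  // Writes to the null register are discarded, so the dead result no longer
  // needs an SGPR (or SGPR pair in wave64) to be allocated for it.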
  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

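  // Several of the 32-bit forms handled below (VOPC, V_CNDMASK_B32_e32, and
  // the carry-in/carry-out adds) implicitly read or write VCC, so the register
  // allocator is hinted toward VCC (VCC_LO in wave32) where those forms are
  // wanted.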
  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

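        // S_ADDK_I32 / S_MULK_I32 overwrite their first operand, so they are
        // only usable when dst and src0 are already the same register; the
        // operands are tied below to encode that constraint.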
        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            Src1->setImm(SignExtend64(Src1->getImm(), 32));
            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src)) {
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
            Src.setImm(SignExtend64(Src.getImm(), 32));
          } else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32-bit form, try to replace the dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      if (ST->hasVOP3Literal() &&
          !MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs))
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg, /*TRI=*/nullptr)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}
