//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool shouldShrinkTrue16(MachineInstr &MI) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
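  // (only src0 is inspected here; if the fold fails, the instruction is
  // commuted below and this function recurses to try the new src0).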
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm()) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

/// Do not shrink the instruction if its registers are not expressible in the
/// shrunk encoding.
bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (MO.isReg()) {
      Register Reg = MO.getReg();
      assert(!Reg.isVirtual() && "Prior checks should ensure we only shrink "
                                 "True16 Instructions post-RA");
      if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
          !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
        return false;
    }
  }
  return true;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

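  // For example, the literal 0x80000000 (a sign-bit mask) is not an inline
  // constant, but its bit-reverse is 1, which is; callers then materialize the
  // reversed value with V_BFREV_B32 / S_BREV_B32.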
  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  if (!ST->hasSCmpK())
    return;

  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
        Src1.setImm(SignExtend32(Src1.getImm(), 32));
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((SIInstrInfo::sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!SIInstrInfo::sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    if (!SIInstrInfo::sopkIsZext(SOPKOpc))
      Src1.setImm(SignExtend64(Src1.getImm(), 32));
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

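  // Pick the contiguous VGPR tuple class that covers all of the address
  // dwords; counts above 12 dwords fall back to the 16-dword (512-bit) class.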
  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else if (Info->VAddrDwords == 9) {
    RC = &AMDGPU::VReg_288RegClass;
  } else if (Info->VAddrDwords == 10) {
    RC = &AMDGPU::VReg_320RegClass;
  } else if (Info->VAddrDwords == 11) {
    RC = &AMDGPU::VReg_352RegClass;
  } else if (Info->VAddrDwords == 12) {
    RC = &AMDGPU::VReg_384RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  const unsigned NSAMaxSize = ST->getNSAMaxSize();
  const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
  const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
  for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < EndVAddr; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (EndVAddr - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
  // there is no reason to try to shrink them.
  if (!ST->hasVOP3Literal())
    return;

  // There is no advantage to doing this pre-RA.
  if (!MF->getProperties().hasProperty(
          MachineFunctionProperties::Property::NoVRegs))
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

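  // FMAAK/MADAK take the literal as the addend (dst = src0 * src1 + K), while
  // FMAMK/MADMK take it as a multiplicand (dst = src0 * K + src1).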
  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
                                          : AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
    case AMDGPU::V_FMA_F16_gfx9_e64:
      NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
                                          : AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI))
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

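  // For example, s_and_b32 s0, s0, 0xffffffbf (clear bit 6) becomes
  // s_bitset0_b32 s0, 6, and s_or_b32 s0, s0, 0x40 becomes s_bitset1_b32 s0, 6.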
  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = llvm::countr_one(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = llvm::countr_zero(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().implicit_uses().size() +
                    MI.getDesc().implicit_defs().size(),
                e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns the next valid instruction pointer if it was able to create a
// v_swap_b32.
//
// This should not be done too early, so as not to prevent possible folding
// that may remove the matched moves. It should preferably be done before RA to
// release saved registers, and also possibly after RA, which can insert copies
// too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

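    // V_SWAP_B32 exchanges a single dword, so wider values are swapped one
    // 32-bit subregister at a time.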
    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg)
                     .getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

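  // Writes to the null register are discarded, so the dead result no longer
  // needs an SGPR (or SGPR pair in wave64) to be allocated for it.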
  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

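  // Several of the 32-bit forms handled below (VOPC, V_CNDMASK_B32_e32, and
  // the carry-in/carry-out adds) implicitly read or write VCC, so the register
  // allocator is hinted toward VCC (VCC_LO in wave32) where those forms are
  // wanted.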
  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

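        // S_ADDK_I32 / S_MULK_I32 overwrite their first operand, so they are
        // only usable when dst and src0 are already the same register; the
        // operands are tied below to encode that constraint.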
        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            Src1->setImm(SignExtend64(Src1->getImm(), 32));
            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src)) {
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
            Src.setImm(SignExtend64(Src.getImm(), 32));
          } else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32-bit form, try to replace the dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
      // fold an immediate into the shrunk instruction as a literal operand. In
      // GFX10 VOP3 instructions can take a literal operand anyway, so there is
      // no advantage to doing this.
      if (ST->hasVOP3Literal() &&
          !MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs))
        continue;

      if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
          !shouldShrinkTrue16(MI))
        continue;

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg, /*TRI=*/nullptr)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}
