//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
/// %sgpr0 = SI_IF %vcc
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
/// %sgpr0 = SI_ELSE %sgpr0
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
/// SI_END_CF %sgpr0
///
/// becomes:
///
/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc  // Save and update the exec mask
/// %sgpr0 = S_XOR_B64 %sgpr0, %exec  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0 // Restore the exec mask for the ELSE
///                                   // block
/// %exec = S_XOR_B64 %sgpr0, %exec   // Update the exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0 // Do the ELSE block
/// label1:
/// %exec = S_OR_B64 %exec, %sgpr0    // Re-enable saved exec mask bits
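///
/// Loops are lowered with the same machinery. A sketch of the lowering
/// performed by emitIfBreak(), emitLoop() and emitEndCf() below (wave64
/// assumed, register names illustrative):
///
/// loop:
/// ...
/// %sgpr1 = SI_IF_BREAK %vcc, %sgpr1
/// SI_LOOP %sgpr1, loop
/// exit:
/// SI_END_CF %sgpr1
///
/// becomes roughly:
///
/// loop:
/// ...
/// %sgpr2 = S_AND_B64 %exec, %vcc          // Mask break cond by exec
/// %sgpr1 = S_OR_B64 %sgpr2, %sgpr1        // Accumulate lanes that exited
/// %exec = S_ANDN2_B64_term %exec, %sgpr1  // Disable lanes that exited
/// S_CBRANCH_EXECNZ loop                   // Loop while any lane is active
/// exit:
/// %exec = S_OR_B64 %exec, %sgpr1          // Re-enable the exited lanes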
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

static cl::opt<bool>
    RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
                         cl::init(true), cl::ReallyHidden);

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  LiveIntervals *LIS = nullptr;
  LiveVariables *LV = nullptr;
  MachineDominatorTree *MDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  SetVector<MachineInstr*> LoweredEndCf;
  DenseSet<Register> LoweredIf;
  SmallSet<MachineBasicBlock *, 4> KillBlocks;
  SmallSet<Register, 8> RecomputeRegs;

  const TargetRegisterClass *BoolRC = nullptr;
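  // Opcode variants and the exec register for the current wave size, selected
  // in runOnMachineFunction() (wave32 uses the *_B32 opcodes and EXEC_LO).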
  unsigned AndOpc;
  unsigned OrOpc;
  unsigned XorOpc;
  unsigned MovTermOpc;
  unsigned Andn2TermOpc;
  unsigned XorTermrOpc;
  unsigned OrTermrOpc;
  unsigned OrSaveExecOpc;
  unsigned Exec;

  bool EnableOptimizeEndCf = false;

  bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);

  void emitIf(MachineInstr &MI);
  void emitElse(MachineInstr &MI);
  void emitIfBreak(MachineInstr &MI);
  void emitLoop(MachineInstr &MI);

  MachineBasicBlock *emitEndCf(MachineInstr &MI);

  void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);

  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                        SmallVectorImpl<MachineOperand> &Src) const;

  void combineMasks(MachineInstr &MI);

  bool removeMBBifRedundant(MachineBasicBlock &MBB);

  MachineBasicBlock *process(MachineInstr &MI);

  // Skip to the next instruction, ignoring debug instructions, and trivial
  // block boundaries (blocks that have one (typically fallthrough) successor,
  // and the successor has one predecessor).
  MachineBasicBlock::iterator
  skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator It) const;

  /// Find the insertion point for a new conditional branch.
  MachineBasicBlock::iterator
  skipToUncondBrOrEnd(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator I) const {
    assert(I->isTerminator());

    // FIXME: What if we had multiple pre-existing conditional branches?
    MachineBasicBlock::iterator End = MBB.end();
    while (I != End && !I->isUnconditionalBranch())
      ++I;
    return I;
  }

  // Remove redundant SI_END_CF instructions.
  void optimizeEndCf();

public:
  static char ID;

  SILowerControlFlow() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addUsedIfAvailable<LiveIntervals>();
    // Should preserve the same set that TwoAddressInstructions does.
    AU.addPreserved<MachineDominatorTree>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreservedID(LiveVariablesID);
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

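// Mark the implicit SCC def of a newly built mask operation as dead or live.
// The SALU mask instructions created by this pass carry their implicit SCC
// def as operand 3.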
static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
  MachineOperand &ImpDefSCC = MI.getOperand(3);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  ImpDefSCC.setIsDead(IsDead);
}

char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;

bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
                                 const MachineBasicBlock *End) {
  DenseSet<const MachineBasicBlock*> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (MBB == End || !Visited.insert(MBB).second)
      continue;
    if (KillBlocks.contains(MBB))
      return true;

    Worklist.append(MBB->succ_begin(), MBB->succ_end());
  }

  return false;
}

static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
  Register SaveExecReg = MI.getOperand(0).getReg();
  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);

  if (U == MRI->use_instr_nodbg_end() ||
      std::next(U) != MRI->use_instr_nodbg_end() ||
      U->getOpcode() != AMDGPU::SI_END_CF)
    return false;

  return true;
}

void SILowerControlFlow::emitIf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);
  Register SaveExecReg = MI.getOperand(0).getReg();
  MachineOperand &Cond = MI.getOperand(1);
  assert(Cond.getSubReg() == AMDGPU::NoSubRegister);

  MachineOperand &ImpDefSCC = MI.getOperand(4);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  // If there is only one use of the save exec register and that use is
  // SI_END_CF, we can optimize SI_IF by returning the full saved exec mask
  // instead of just the cleared bits.
  bool SimpleIf = isSimpleIf(MI, MRI);

  if (SimpleIf) {
    // Check for SI_KILL_*_TERMINATOR on the path from if to endif.
    // If there is any such terminator, the simplification is not safe.
    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
  }

  // Add an implicit def of exec to discourage scheduling VALU after this which
  // will interfere with trying to form s_and_saveexec_b64 later.
  Register CopyReg = SimpleIf ? SaveExecReg
                              : MRI->createVirtualRegister(BoolRC);
  MachineInstr *CopyExec =
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
          .addReg(Exec)
          .addReg(Exec, RegState::ImplicitDefine);
  LoweredIf.insert(CopyReg);

  Register Tmp = MRI->createVirtualRegister(BoolRC);

  MachineInstr *And =
      BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
          .addReg(CopyReg)
          .add(Cond);
  if (LV)
    LV->replaceKillInstruction(Cond.getReg(), MI, *And);

  setImpSCCDefDead(*And, true);

  MachineInstr *Xor = nullptr;
  if (!SimpleIf) {
    Xor =
        BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg)
            .addReg(Tmp)
            .addReg(CopyReg);
    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
  }

  // Use a copy that is a terminator to get correct spill code placement with
  // fast regalloc.
  MachineInstr *SetExec =
      BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
          .addReg(Tmp, RegState::Kill);
  if (LV)
    LV->getVarInfo(Tmp).Kills.push_back(SetExec);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  I = skipToUncondBrOrEnd(MBB, I);

  // Insert the S_CBRANCH_EXECZ instruction which will be optimized later
  // during SIRemoveShortExecBranches.
  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
                            .add(MI.getOperand(2));

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->InsertMachineInstrInMaps(*CopyExec);

  // Replace MI with the and so we don't need to fix the live interval for the
  // condition register.
  LIS->ReplaceMachineInstrInMaps(MI, *And);

  if (!SimpleIf)
    LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*SetExec);
  LIS->InsertMachineInstrInMaps(*NewBr);

  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
  MI.eraseFromParent();

  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
  // hard to add another def here but I'm not sure how to correctly update the
  // valno.
  RecomputeRegs.insert(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(Tmp);
  if (!SimpleIf)
    LIS->createAndComputeVirtRegInterval(CopyReg);
}

void SILowerControlFlow::emitElse(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();

  MachineBasicBlock::iterator Start = MBB.begin();

  // This must be inserted before phis and any spill code inserted before the
  // else.
  Register SaveReg = MRI->createVirtualRegister(BoolRC);
  MachineInstr *OrSaveExec =
      BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
          .add(MI.getOperand(1)); // Saved EXEC
  if (LV)
    LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec);

  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

  MachineBasicBlock::iterator ElsePt(MI);

  // This accounts for any modification of the EXEC mask within the block and
  // can be optimized out pre-RA when not required.
  MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
                          .addReg(Exec)
                          .addReg(SaveReg);

  MachineInstr *Xor =
      BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
          .addReg(Exec)
          .addReg(DstReg);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  ElsePt = skipToUncondBrOrEnd(MBB, ElsePt);

  MachineInstr *Branch =
      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
          .addMBB(DestBB);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*OrSaveExec);
  LIS->InsertMachineInstrInMaps(*And);

  LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*Branch);

  RecomputeRegs.insert(SrcReg);
  RecomputeRegs.insert(DstReg);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  // Let this be recomputed.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
}

void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  auto Dst = MI.getOperand(0).getReg();

  // Skip ANDing with exec if the break condition is already masked by exec
  // because it is a V_CMP in the same basic block. (We know the break
  // condition operand was an i1 in IR, so if it is a VALU instruction it must
  // be one with a carry-out.)
  bool SkipAnding = false;
  if (MI.getOperand(1).isReg()) {
    if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
      SkipAnding = Def->getParent() == MI.getParent()
          && SIInstrInfo::isVALU(*Def);
    }
  }

  // AND the break condition operand with exec, then OR that into the "loop
  // exit" mask.
  MachineInstr *And = nullptr, *Or = nullptr;
  Register AndReg;
  if (!SkipAnding) {
    AndReg = MRI->createVirtualRegister(BoolRC);
    And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg)
             .addReg(Exec)
             .add(MI.getOperand(1));
    if (LV)
      LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *And);
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .addReg(AndReg)
             .add(MI.getOperand(2));
  } else {
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .add(MI.getOperand(1))
             .add(MI.getOperand(2));
    if (LV)
      LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *Or);
  }
  if (LV)
    LV->replaceKillInstruction(MI.getOperand(2).getReg(), MI, *Or);

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *Or);
    if (And) {
      // The read of original operand 1 is now on And, not Or.
      RecomputeRegs.insert(And->getOperand(2).getReg());
      LIS->InsertMachineInstrInMaps(*And);
      LIS->createAndComputeVirtRegInterval(AndReg);
    }
  }

  MI.eraseFromParent();
}

void SILowerControlFlow::emitLoop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *AndN2 =
      BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
          .addReg(Exec)
          .add(MI.getOperand(0));
  if (LV)
    LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *AndN2);

  auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator());
  MachineInstr *Branch =
      BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .add(MI.getOperand(1));

  if (LIS) {
    RecomputeRegs.insert(MI.getOperand(0).getReg());
    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
    LIS->InsertMachineInstrInMaps(*Branch);
  }

  MI.eraseFromParent();
}

MachineBasicBlock::iterator
SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {

  SmallSet<const MachineBasicBlock *, 4> Visited;
  MachineBasicBlock *B = &MBB;
  do {
    if (!Visited.insert(B).second)
      return MBB.end();

    auto E = B->end();
    for ( ; It != E; ++It) {
      if (TII->mayReadEXEC(*MRI, *It))
        break;
    }

    if (It != E)
      return It;

    if (B->succ_size() != 1)
      return MBB.end();

    // If there is one trivial successor, advance to the next block.
    MachineBasicBlock *Succ = *B->succ_begin();

    It = Succ->begin();
    B = Succ;
  } while (true);
}

MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineBasicBlock::iterator InsPt = MBB.begin();

  // If we have instructions that aren't prolog instructions, split the block
  // and emit a terminator instruction. This ensures correct spill placement.
  // FIXME: We should unconditionally split the block here.
  bool NeedBlockSplit = false;
  Register DataReg = MI.getOperand(0).getReg();
  for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(DataReg, TRI)) {
      NeedBlockSplit = true;
      break;
    }
  }

  unsigned Opcode = OrOpc;
  MachineBasicBlock *SplitBB = &MBB;
  if (NeedBlockSplit) {
    SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true, LIS);
    if (MDT && SplitBB != &MBB) {
      MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
      SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
                                                 MBBNode->end());
      MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
      for (MachineDomTreeNode *Child : Children)
        MDT->changeImmediateDominator(Child, SplitBBNode);
    }
    Opcode = OrTermrOpc;
    InsPt = MI;
  }

  MachineInstr *NewMI =
      BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
          .addReg(Exec)
          .add(MI.getOperand(0));
  if (LV) {
    LV->replaceKillInstruction(DataReg, MI, *NewMI);

    if (SplitBB != &MBB) {
      // Track the set of registers defined in the original block so we don't
      // accidentally add the original block to AliveBlocks. AliveBlocks only
      // includes blocks which are live through, which excludes live outs and
      // local defs.
      DenseSet<Register> DefInOrigBlock;

      for (MachineBasicBlock *BlockPiece : {&MBB, SplitBB}) {
        for (MachineInstr &X : *BlockPiece) {
          for (MachineOperand &Op : X.all_defs()) {
            if (Op.getReg().isVirtual())
              DefInOrigBlock.insert(Op.getReg());
          }
        }
      }

      for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
        Register Reg = Register::index2VirtReg(i);
        LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);

        if (VI.AliveBlocks.test(MBB.getNumber()))
          VI.AliveBlocks.set(SplitBB->getNumber());
        else {
          for (MachineInstr *Kill : VI.Kills) {
            if (Kill->getParent() == SplitBB && !DefInOrigBlock.contains(Reg))
              VI.AliveBlocks.set(MBB.getNumber());
          }
        }
      }
    }
  }

  LoweredEndCf.insert(NewMI);

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);

  MI.eraseFromParent();

  if (LIS)
    LIS->handleMove(*NewMI);
  return SplitBB;
}

// Returns the replacement operands for a logical operation: either a single
// result for exec, or two operands if the source was another equivalent
// operation.
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
       SmallVectorImpl<MachineOperand> &Src) const {
  MachineOperand &Op = MI.getOperand(OpNo);
  if (!Op.isReg() || !Op.getReg().isVirtual()) {
    Src.push_back(Op);
    return;
  }

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getParent() != MI.getParent() ||
      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
    return;

  // Make sure we do not modify exec between def and use.
  // A copy with an implicitly defined exec inserted earlier is an exception;
  // it does not really modify exec.
  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
        !(I->isCopy() && I->getOperand(0).getReg() != Exec))
      return;

  for (const auto &SrcOp : Def->explicit_operands())
    if (SrcOp.isReg() && SrcOp.isUse() &&
        (SrcOp.getReg().isVirtual() || SrcOp.getReg() == Exec))
      Src.push_back(SrcOp);
}

// Search and combine pairs of equivalent instructions, like
// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
// One of the operands is the exec mask.
void SILowerControlFlow::combineMasks(MachineInstr &MI) {
  assert(MI.getNumExplicitOperands() == 3);
  SmallVector<MachineOperand, 4> Ops;
  unsigned OpToReplace = 1;
  findMaskOperands(MI, 1, Ops);
  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
  findMaskOperands(MI, 2, Ops);
  if (Ops.size() != 3) return;

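  // Of the three collected sources, two must be identical; the nested
  // operation is then redundant, and the compound operand can be replaced by
  // the remaining unique source (e.g. S_AND_B64 x, (S_AND_B64 x, y) becomes
  // S_AND_B64 x, y).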
  unsigned UniqueOpndIdx;
  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else return;

  Register Reg = MI.getOperand(OpToReplace).getReg();
  MI.removeOperand(OpToReplace);
  MI.addOperand(Ops[UniqueOpndIdx]);
  if (MRI->use_empty(Reg))
    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}

void SILowerControlFlow::optimizeEndCf() {
  // If the only instruction immediately following this END_CF is another
  // END_CF in the only successor, we can avoid emitting the exec mask restore
  // here.
  if (!EnableOptimizeEndCf)
    return;

  for (MachineInstr *MI : reverse(LoweredEndCf)) {
    MachineBasicBlock &MBB = *MI->getParent();
    auto Next =
      skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
    if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
      continue;
    // Only skip the inner END_CF if the outer END_CF belongs to an SI_IF.
    // If it belongs to an SI_ELSE then the saved mask has an inverted value.
    Register SavedExec
        = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
    assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");

    const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
    if (Def && LoweredIf.count(SavedExec)) {
      LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
      if (LIS)
        LIS->RemoveMachineInstrFromMaps(*MI);
      Register Reg;
      if (LV)
        Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
      MI->eraseFromParent();
      if (LV)
        LV->recomputeForSingleDefVirtReg(Reg);
      removeMBBifRedundant(MBB);
    }
  }
}

MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::iterator I(MI);
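  // Remember the instruction before MI: after MI is lowered, the cleanup loop
  // below rescans from there to combine any newly exposed mask operations.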
  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;

  MachineBasicBlock *SplitBB = &MBB;

  switch (MI.getOpcode()) {
  case AMDGPU::SI_IF:
    emitIf(MI);
    break;

  case AMDGPU::SI_ELSE:
    emitElse(MI);
    break;

  case AMDGPU::SI_IF_BREAK:
    emitIfBreak(MI);
    break;

  case AMDGPU::SI_LOOP:
    emitLoop(MI);
    break;

  case AMDGPU::SI_WATERFALL_LOOP:
    MI.setDesc(TII->get(AMDGPU::S_CBRANCH_EXECNZ));
    break;

  case AMDGPU::SI_END_CF:
    SplitBB = emitEndCf(MI);
    break;

  default:
    assert(false && "Attempt to process unsupported instruction");
    break;
  }

  MachineBasicBlock::iterator Next;
  for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
    Next = std::next(I);
    MachineInstr &MaskMI = *I;
    switch (MaskMI.getOpcode()) {
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32:
      // Clean up bit manipulations on the exec mask.
      combineMasks(MaskMI);
      break;
    default:
      I = MBB.end();
      break;
    }
  }

  return SplitBB;
}

void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
                                       MachineInstr &MI) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  bool IsWave32 = ST.isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
        TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
        .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in the current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If the first instruction is the definition, move the insertion
        // pointer past it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST.getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
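  // The src1 immediate of S_BFE packs the field offset in its low bits and
  // the field width starting at bit 16; 0x70000 requests a 7-bit field,
  // matching the {shift, 7} in the sequence sketched above.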
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  if (LV)
    LV->recomputeForSingleDefVirtReg(InputReg);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  if (LV)
    LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  RecomputeRegs.insert(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
  for (auto &I : MBB.instrs()) {
    if (!I.isDebugInstr() && !I.isUnconditionalBranch())
      return false;
  }

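  // At this point MBB contains only debug instructions and at most an
  // unconditional branch, so it can be folded into its single successor.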
  assert(MBB.succ_size() == 1 && "MBB has more than one successor");

  MachineBasicBlock *Succ = *MBB.succ_begin();
  MachineBasicBlock *FallThrough = nullptr;

  while (!MBB.predecessors().empty()) {
    MachineBasicBlock *P = *MBB.pred_begin();
    if (P->getFallThrough(false) == &MBB)
      FallThrough = P;
    P->ReplaceUsesOfBlockWith(&MBB, Succ);
  }
  MBB.removeSuccessor(Succ);
  if (LIS) {
    for (auto &I : MBB.instrs())
      LIS->RemoveMachineInstrFromMaps(I);
  }
  if (MDT) {
    // If Succ, the single successor of MBB, is dominated by MBB, MDT needs
    // updating by changing Succ's idom to the one of MBB; otherwise, MBB must
    // be a leaf node in MDT and could be erased directly.
    if (MDT->dominates(&MBB, Succ))
      MDT->changeImmediateDominator(MDT->getNode(Succ),
                                    MDT->getNode(&MBB)->getIDom());
    MDT->eraseNode(&MBB);
  }
  MBB.clear();
  MBB.eraseFromParent();
  if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
    // Note: we cannot update block layout and preserve live intervals;
    // hence we must insert a branch.
    MachineInstr *BranchMI = BuildMI(*FallThrough, FallThrough->end(),
        FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH))
        .addMBB(Succ);
    if (LIS)
      LIS->InsertMachineInstrInMaps(*BranchMI);
  }

  return true;
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  EnableOptimizeEndCf = RemoveRedundantEndcf &&
                        MF.getTarget().getOptLevel() > CodeGenOptLevel::None;

  // This doesn't actually need LiveIntervals, but we can preserve them.
  LIS = getAnalysisIfAvailable<LiveIntervals>();
  // This doesn't actually need LiveVariables, but we can preserve them.
  LV = getAnalysisIfAvailable<LiveVariables>();
  MDT = getAnalysisIfAvailable<MachineDominatorTree>();
  MRI = &MF.getRegInfo();
  BoolRC = TRI->getBoolRC();

  if (ST.isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    OrOpc = AMDGPU::S_OR_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    MovTermOpc = AMDGPU::S_MOV_B32_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
    XorTermrOpc = AMDGPU::S_XOR_B32_term;
    OrTermrOpc = AMDGPU::S_OR_B32_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    OrOpc = AMDGPU::S_OR_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    MovTermOpc = AMDGPU::S_MOV_B64_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
    XorTermrOpc = AMDGPU::S_XOR_B64_term;
    OrTermrOpc = AMDGPU::S_OR_B64_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    Exec = AMDGPU::EXEC;
  }

  // Compute the set of blocks with kills.
  const bool CanDemote =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
  for (auto &MBB : MF) {
    bool IsKillBlock = false;
    for (auto &Term : MBB.terminators()) {
      if (TII->isKillTerminator(Term.getOpcode())) {
        KillBlocks.insert(&MBB);
        IsKillBlock = true;
        break;
      }
    }
    if (CanDemote && !IsKillBlock) {
      for (auto &MI : MBB) {
        if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
          KillBlocks.insert(&MBB);
          break;
        }
      }
    }
  }

  bool Changed = false;
  MachineFunction::iterator NextBB;
  for (MachineFunction::iterator BI = MF.begin();
       BI != MF.end(); BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock *MBB = &*BI;

    MachineBasicBlock::iterator I, E, Next;
    E = MBB->end();
    for (I = MBB->begin(); I != E; I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;
      MachineBasicBlock *SplitMBB = MBB;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
      case AMDGPU::SI_ELSE:
      case AMDGPU::SI_IF_BREAK:
      case AMDGPU::SI_WATERFALL_LOOP:
      case AMDGPU::SI_LOOP:
      case AMDGPU::SI_END_CF:
        SplitMBB = process(MI);
        Changed = true;
        break;

      // FIXME: find a better place for this
      case AMDGPU::SI_INIT_EXEC:
      case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
        lowerInitExec(MBB, MI);
        if (LIS)
          LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
        Changed = true;
        break;

      default:
        break;
      }

      if (SplitMBB != MBB) {
        MBB = Next->getParent();
        E = MBB->end();
      }
    }
  }

  optimizeEndCf();

  if (LIS) {
    for (Register Reg : RecomputeRegs) {
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
  }

  RecomputeRegs.clear();
  LoweredEndCf.clear();
  LoweredIf.clear();
  KillBlocks.clear();

  return Changed;
}