1 | //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This pass adds instructions to enable whole quad mode (strict or non-strict) |
11 | /// for pixel shaders, and strict whole wavefront mode for all programs. |
12 | /// |
13 | /// The "strict" prefix indicates that inactive lanes do not take part in |
14 | /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will |
15 | /// always be enabled irrespective of control flow decisions. Conversely in |
16 | /// non-strict WQM inactive lanes may control flow decisions. |
17 | /// |
18 | /// Whole quad mode is required for derivative computations, but it interferes |
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary and disabled around stores and atomics.
21 | /// |
22 | /// When necessary, this pass creates a function prolog |
23 | /// |
24 | /// S_MOV_B64 LiveMask, EXEC |
25 | /// S_WQM_B64 EXEC, EXEC |
26 | /// |
27 | /// to enter WQM at the top of the function and surrounds blocks of Exact |
/// instructions with
29 | /// |
30 | /// S_AND_SAVEEXEC_B64 Tmp, LiveMask |
31 | /// ... |
32 | /// S_MOV_B64 EXEC, Tmp |
33 | /// |
34 | /// We also compute when a sequence of instructions requires strict whole |
35 | /// wavefront mode (StrictWWM) and insert instructions to save and restore it: |
36 | /// |
37 | /// S_OR_SAVEEXEC_B64 Tmp, -1 |
38 | /// ... |
39 | /// S_MOV_B64 EXEC, Tmp |
40 | /// |
41 | /// When a sequence of instructions requires strict whole quad mode (StrictWQM) |
42 | /// we use a similar save and restore mechanism and force whole quad mode for |
43 | /// those instructions: |
44 | /// |
45 | /// S_MOV_B64 Tmp, EXEC |
46 | /// S_WQM_B64 EXEC, EXEC |
47 | /// ... |
48 | /// S_MOV_B64 EXEC, Tmp |
49 | /// |
50 | /// In order to avoid excessive switching during sequences of Exact |
51 | /// instructions, the pass first analyzes which instructions must be run in WQM |
52 | /// (aka which instructions produce values that lead to derivative |
53 | /// computations). |
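///
/// For example (an illustrative sketch, not literal output of this pass):
///
///    IMAGE_SAMPLE ...            ; requires WQM (computes derivatives)
///    S_AND_SAVEEXEC_B64 Tmp, LiveMask
///    BUFFER_STORE_DWORD ...      ; must run Exact (visible side effect)
///    S_MOV_B64 EXEC, Tmp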
54 | /// |
55 | /// Basic blocks are always exited in WQM as long as some successor needs WQM. |
56 | /// |
57 | /// There is room for improvement given better control flow analysis: |
58 | /// |
59 | /// (1) at the top level (outside of control flow statements, and as long as |
60 | /// kill hasn't been used), one SGPR can be saved by recovering WQM from |
61 | /// the LiveMask (this is implemented for the entry block). |
62 | /// |
63 | /// (2) when entire regions (e.g. if-else blocks or entire loops) only |
64 | /// consist of exact and don't-care instructions, the switch only has to |
65 | /// be done at the entry and exit points rather than potentially in each |
66 | /// block of the region. |
67 | /// |
68 | //===----------------------------------------------------------------------===// |
69 | |
70 | #include "AMDGPU.h" |
71 | #include "GCNSubtarget.h" |
72 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
73 | #include "llvm/ADT/MapVector.h" |
74 | #include "llvm/ADT/PostOrderIterator.h" |
75 | #include "llvm/CodeGen/LiveIntervals.h" |
76 | #include "llvm/CodeGen/MachineBasicBlock.h" |
77 | #include "llvm/CodeGen/MachineDominators.h" |
78 | #include "llvm/CodeGen/MachineFunctionPass.h" |
79 | #include "llvm/CodeGen/MachineInstr.h" |
80 | #include "llvm/CodeGen/MachinePostDominators.h" |
81 | #include "llvm/IR/CallingConv.h" |
82 | #include "llvm/InitializePasses.h" |
83 | #include "llvm/Support/raw_ostream.h" |
84 | |
85 | using namespace llvm; |
86 | |
87 | #define DEBUG_TYPE "si-wqm" |
88 | |
89 | namespace { |
90 | |
91 | enum { |
92 | StateWQM = 0x1, |
93 | StateStrictWWM = 0x2, |
94 | StateStrictWQM = 0x4, |
95 | StateExact = 0x8, |
96 | StateStrict = StateStrictWWM | StateStrictWQM, |
97 | }; |
98 | |
99 | struct PrintState { |
100 | public: |
101 | int State; |
102 | |
103 | explicit PrintState(int State) : State(State) {} |
104 | }; |
105 | |
106 | #ifndef NDEBUG |
107 | static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { |
108 | |
109 | static const std::pair<char, const char *> Mapping[] = { |
      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112 | char State = PS.State; |
113 | for (auto M : Mapping) { |
114 | if (State & M.first) { |
115 | OS << M.second; |
116 | State &= ~M.first; |
117 | |
118 | if (State) |
119 | OS << '|'; |
120 | } |
121 | } |
122 | assert(State == 0); |
123 | return OS; |
124 | } |
125 | #endif |
126 | |
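// Analysis results kept per machine instruction:
//   Needs    - execution states this instruction itself requires.
//   Disabled - execution states this instruction must never run in.
//   OutNeeds - states required after this instruction (propagated backwards).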
127 | struct InstrInfo { |
128 | char Needs = 0; |
129 | char Disabled = 0; |
130 | char OutNeeds = 0; |
131 | }; |
132 | |
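// Analysis results kept per basic block (same flag encoding):
//   Needs/InNeeds/OutNeeds - states required within, on entry to, and on
//   exit from the block. InitialState records the state chosen at the block
//   start by processBlock; NeedsLowering marks blocks containing kills or
//   strict-mode pseudos that lowerBlock must rewrite.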
133 | struct BlockInfo { |
134 | char Needs = 0; |
135 | char InNeeds = 0; |
136 | char OutNeeds = 0; |
137 | char InitialState = 0; |
138 | bool NeedsLowering = false; |
139 | }; |
140 | |
141 | struct WorkItem { |
142 | MachineBasicBlock *MBB = nullptr; |
143 | MachineInstr *MI = nullptr; |
144 | |
145 | WorkItem() = default; |
146 | WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {} |
147 | WorkItem(MachineInstr *MI) : MI(MI) {} |
148 | }; |
149 | |
150 | class SIWholeQuadMode : public MachineFunctionPass { |
151 | private: |
152 | const SIInstrInfo *TII; |
153 | const SIRegisterInfo *TRI; |
154 | const GCNSubtarget *ST; |
155 | MachineRegisterInfo *MRI; |
156 | LiveIntervals *LIS; |
157 | MachineDominatorTree *MDT; |
158 | MachinePostDominatorTree *PDT; |
159 | |
160 | unsigned AndOpc; |
161 | unsigned AndTermOpc; |
162 | unsigned AndN2Opc; |
163 | unsigned XorOpc; |
164 | unsigned AndSaveExecOpc; |
165 | unsigned AndSaveExecTermOpc; |
166 | unsigned WQMOpc; |
167 | Register Exec; |
168 | Register LiveMaskReg; |
169 | |
170 | DenseMap<const MachineInstr *, InstrInfo> Instructions; |
171 | MapVector<MachineBasicBlock *, BlockInfo> Blocks; |
172 | |
173 | // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction |
174 | DenseMap<const MachineInstr *, char> StateTransition; |
175 | |
176 | SmallVector<MachineInstr *, 2> LiveMaskQueries; |
177 | SmallVector<MachineInstr *, 4> LowerToMovInstrs; |
178 | SmallVector<MachineInstr *, 4> LowerToCopyInstrs; |
179 | SmallVector<MachineInstr *, 4> KillInstrs; |
180 | |
181 | void printInfo(); |
182 | |
183 | void markInstruction(MachineInstr &MI, char Flag, |
184 | std::vector<WorkItem> &Worklist); |
185 | void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, |
186 | unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist); |
187 | void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag, |
188 | std::vector<WorkItem> &Worklist); |
189 | void markInstructionUses(const MachineInstr &MI, char Flag, |
190 | std::vector<WorkItem> &Worklist); |
191 | char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); |
192 | void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); |
193 | void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); |
194 | char analyzeFunction(MachineFunction &MF); |
195 | |
196 | MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, |
197 | MachineBasicBlock::iterator Before); |
198 | MachineBasicBlock::iterator |
199 | prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First, |
200 | MachineBasicBlock::iterator Last, bool PreferLast, |
201 | bool SaveSCC); |
202 | void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
203 | Register SaveWQM); |
204 | void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
205 | Register SavedWQM); |
206 | void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, |
207 | Register SaveOrig, char StrictStateNeeded); |
208 | void fromStrictMode(MachineBasicBlock &MBB, |
209 | MachineBasicBlock::iterator Before, Register SavedOrig, |
210 | char NonStrictState, char CurrentStrictState); |
211 | |
212 | MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); |
213 | |
214 | MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI, |
215 | bool IsWQM); |
216 | MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI); |
217 | void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry, |
218 | MachineInstr *Exit); |
219 | |
220 | void lowerBlock(MachineBasicBlock &MBB); |
221 | void processBlock(MachineBasicBlock &MBB, bool IsEntry); |
222 | |
223 | void lowerLiveMaskQueries(); |
224 | void lowerCopyInstrs(); |
225 | void lowerKillInstrs(bool IsWQM); |
226 | |
227 | public: |
228 | static char ID; |
229 | |
230 | SIWholeQuadMode() : |
231 | MachineFunctionPass(ID) { } |
232 | |
233 | bool runOnMachineFunction(MachineFunction &MF) override; |
234 | |
  StringRef getPassName() const override { return "SI Whole Quad Mode"; }
236 | |
237 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
238 | AU.addRequired<LiveIntervals>(); |
239 | AU.addPreserved<SlotIndexes>(); |
240 | AU.addPreserved<LiveIntervals>(); |
241 | AU.addPreserved<MachineDominatorTree>(); |
242 | AU.addPreserved<MachinePostDominatorTree>(); |
243 | MachineFunctionPass::getAnalysisUsage(AU); |
244 | } |
245 | |
246 | MachineFunctionProperties getClearedProperties() const override { |
247 | return MachineFunctionProperties().set( |
248 | MachineFunctionProperties::Property::IsSSA); |
249 | } |
250 | }; |
251 | |
252 | } // end anonymous namespace |
253 | |
254 | char SIWholeQuadMode::ID = 0; |
255 | |
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
257 | false) |
258 | INITIALIZE_PASS_DEPENDENCY(LiveIntervals) |
259 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) |
260 | INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) |
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
262 | false) |
263 | |
264 | char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; |
265 | |
266 | FunctionPass *llvm::createSIWholeQuadModePass() { |
267 | return new SIWholeQuadMode; |
268 | } |
269 | |
270 | #ifndef NDEBUG |
271 | LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() { |
272 | for (const auto &BII : Blocks) { |
273 | dbgs() << "\n" |
274 | << printMBBReference(MBB: *BII.first) << ":\n" |
275 | << " InNeeds = " << PrintState(BII.second.InNeeds) |
276 | << ", Needs = " << PrintState(BII.second.Needs) |
277 | << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n" ; |
278 | |
279 | for (const MachineInstr &MI : *BII.first) { |
      auto III = Instructions.find(&MI);
281 | if (III == Instructions.end()) |
282 | continue; |
283 | |
284 | dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs) |
285 | << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n'; |
286 | } |
287 | } |
288 | } |
289 | #endif |
290 | |
291 | void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, |
292 | std::vector<WorkItem> &Worklist) { |
293 | InstrInfo &II = Instructions[&MI]; |
294 | |
295 | assert(!(Flag & StateExact) && Flag != 0); |
296 | |
297 | // Remove any disabled states from the flag. The user that required it gets |
298 | // an undefined value in the helper lanes. For example, this can happen if |
  // the result of an atomic is used by an instruction that requires WQM, where
300 | // ignoring the request for WQM is correct as per the relevant specs. |
301 | Flag &= ~II.Disabled; |
302 | |
303 | // Ignore if the flag is already encompassed by the existing needs, or we |
304 | // just disabled everything. |
305 | if ((II.Needs & Flag) == Flag) |
306 | return; |
307 | |
308 | LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); |
309 | II.Needs |= Flag; |
  Worklist.push_back(&MI);
311 | } |
312 | |
/// Mark all relevant definitions of register \p Reg used by \p UseMI.
314 | void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, |
315 | Register Reg, unsigned SubReg, char Flag, |
316 | std::vector<WorkItem> &Worklist) { |
317 | LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI); |
318 | |
  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
320 | const VNInfo *Value = UseLRQ.valueIn(); |
321 | if (!Value) |
322 | return; |
323 | |
324 | // Note: this code assumes that lane masks on AMDGPU completely |
325 | // cover registers. |
326 | const LaneBitmask UseLanes = |
327 | SubReg ? TRI->getSubRegIndexLaneMask(SubReg) |
328 | : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg) |
329 | : LaneBitmask::getNone()); |
330 | |
331 | // Perform a depth-first iteration of the LiveRange graph marking defs. |
332 | // Stop processing of a given branch when all use lanes have been defined. |
333 | // The first definition stops processing for a physical register. |
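  //
  // For example, if the use reads a 64-bit register whose two 32-bit halves
  // are written by different instructions reached through a phi, both
  // defining instructions get marked; PhiStack remembers where to resume
  // after one predecessor subgraph has been exhausted.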
334 | struct PhiEntry { |
335 | const VNInfo *Phi; |
336 | unsigned PredIdx; |
337 | LaneBitmask DefinedLanes; |
338 | |
339 | PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes) |
340 | : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {} |
341 | }; |
342 | using VisitKey = std::pair<const VNInfo *, LaneBitmask>; |
343 | SmallVector<PhiEntry, 2> PhiStack; |
344 | SmallSet<VisitKey, 4> Visited; |
345 | LaneBitmask DefinedLanes; |
346 | unsigned NextPredIdx = 0; // Only used for processing phi nodes |
347 | do { |
348 | const VNInfo *NextValue = nullptr; |
349 | const VisitKey Key(Value, DefinedLanes); |
350 | |
    if (Visited.insert(Key).second) {
      // First visit of a value: if it is a phi, start processing at the
      // first predecessor.
      NextPredIdx = 0;
354 | } |
355 | |
356 | if (Value->isPHIDef()) { |
357 | // Each predecessor node in the phi must be processed as a subgraph |
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");
360 | |
361 | // Find next predecessor to process |
362 | unsigned Idx = NextPredIdx; |
363 | auto PI = MBB->pred_begin() + Idx; |
364 | auto PE = MBB->pred_end(); |
365 | for (; PI != PE && !NextValue; ++PI, ++Idx) { |
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
368 | NextValue = VN; |
369 | } |
370 | } |
371 | |
      // If there are more predecessors to process, add the phi to the stack.
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
375 | } else { |
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");
378 | |
379 | if (Reg.isVirtual()) { |
380 | // Iterate over all operands to find relevant definitions |
381 | bool HasDef = false; |
382 | for (const MachineOperand &Op : MI->all_defs()) { |
383 | if (Op.getReg() != Reg) |
384 | continue; |
385 | |
386 | // Compute lanes defined and overlap with use |
387 | LaneBitmask OpLanes = |
388 | Op.isUndef() ? LaneBitmask::getAll() |
389 | : TRI->getSubRegIndexLaneMask(Op.getSubReg()); |
390 | LaneBitmask Overlap = (UseLanes & OpLanes); |
391 | |
          // Record whether this instruction defined any lanes of the use.
393 | HasDef |= Overlap.any(); |
394 | |
395 | // Mark any lanes defined |
396 | DefinedLanes |= OpLanes; |
397 | } |
398 | |
399 | // Check if all lanes of use have been defined |
400 | if ((DefinedLanes & UseLanes) != UseLanes) { |
401 | // Definition not complete; need to process input value |
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
405 | NextValue = VN; |
406 | } |
407 | } |
408 | |
409 | // Only mark the instruction if it defines some part of the use |
410 | if (HasDef) |
          markInstruction(*MI, Flag, Worklist);
412 | } else { |
413 | // For physical registers simply mark the defining instruction |
        markInstruction(*MI, Flag, Worklist);
415 | } |
416 | } |
417 | |
418 | if (!NextValue && !PhiStack.empty()) { |
      // Reached the end of a chain; resume processing the most recent phi.
420 | PhiEntry &Entry = PhiStack.back(); |
421 | NextValue = Entry.Phi; |
422 | NextPredIdx = Entry.PredIdx; |
423 | DefinedLanes = Entry.DefinedLanes; |
424 | PhiStack.pop_back(); |
425 | } |
426 | |
427 | Value = NextValue; |
428 | } while (Value); |
429 | } |
430 | |
431 | void SIWholeQuadMode::markOperand(const MachineInstr &MI, |
432 | const MachineOperand &Op, char Flag, |
433 | std::vector<WorkItem> &Worklist) { |
434 | assert(Op.isReg()); |
435 | Register Reg = Op.getReg(); |
436 | |
437 | // Ignore some hardware registers |
438 | switch (Reg) { |
439 | case AMDGPU::EXEC: |
440 | case AMDGPU::EXEC_LO: |
441 | return; |
442 | default: |
443 | break; |
444 | } |
445 | |
446 | LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op |
447 | << " for " << MI); |
448 | if (Reg.isVirtual()) { |
449 | LiveRange &LR = LIS->getInterval(Reg); |
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
451 | } else { |
452 | // Handle physical registers that we need to track; this is mostly relevant |
453 | // for VCC, which can appear as the (implicit) input of a uniform branch, |
454 | // e.g. when a loop counter is stored in a VGPR. |
455 | for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) { |
456 | LiveRange &LR = LIS->getRegUnit(Unit); |
457 | const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); |
458 | if (!Value) |
459 | continue; |
460 | |
461 | markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist); |
462 | } |
463 | } |
464 | } |
465 | |
466 | /// Mark all instructions defining the uses in \p MI with \p Flag. |
467 | void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, |
468 | std::vector<WorkItem> &Worklist) { |
469 | LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": " |
470 | << MI); |
471 | |
472 | for (const MachineOperand &Use : MI.all_uses()) |
    markOperand(MI, Use, Flag, Worklist);
474 | } |
475 | |
476 | // Scan instructions to determine which ones require an Exact execmask and |
477 | // which ones seed WQM requirements. |
478 | char SIWholeQuadMode::scanInstructions(MachineFunction &MF, |
479 | std::vector<WorkItem> &Worklist) { |
480 | char GlobalFlags = 0; |
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
482 | SmallVector<MachineInstr *, 4> SetInactiveInstrs; |
483 | SmallVector<MachineInstr *, 4> SoftWQMInstrs; |
484 | bool HasImplicitDerivatives = |
485 | MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; |
486 | |
487 | // We need to visit the basic blocks in reverse post-order so that we visit |
488 | // defs before uses, in particular so that we don't accidentally mark an |
489 | // instruction as needing e.g. WQM before visiting it and realizing it needs |
490 | // WQM disabled. |
491 | ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); |
492 | for (MachineBasicBlock *MBB : RPOT) { |
493 | BlockInfo &BBI = Blocks[MBB]; |
494 | |
495 | for (MachineInstr &MI : *MBB) { |
496 | InstrInfo &III = Instructions[&MI]; |
497 | unsigned Opcode = MI.getOpcode(); |
498 | char Flags = 0; |
499 | |
500 | if (TII->isWQM(Opcode)) { |
501 | // If LOD is not supported WQM is not needed. |
502 | if (!ST->hasExtendedImageInsts()) |
503 | continue; |
504 | // Only generate implicit WQM if implicit derivatives are required. |
505 | // This avoids inserting unintended WQM if a shader type without |
506 | // implicit derivatives uses an image sampling instruction. |
507 | if (!HasImplicitDerivatives) |
508 | continue; |
509 | // Sampling instructions don't need to produce results for all pixels |
510 | // in a quad, they just require all inputs of a quad to have been |
511 | // computed for derivatives. |
        markInstructionUses(MI, StateWQM, Worklist);
513 | GlobalFlags |= StateWQM; |
514 | continue; |
515 | } else if (Opcode == AMDGPU::WQM) { |
516 | // The WQM intrinsic requires its output to have all the helper lanes |
517 | // correct, so we need it to be in WQM. |
518 | Flags = StateWQM; |
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
523 | continue; |
524 | } else if (Opcode == AMDGPU::STRICT_WWM) { |
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and
        // moreover it needs to be executed in WQM or Exact so that its copy
        // doesn't clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
531 | continue; |
532 | } else if (Opcode == AMDGPU::STRICT_WQM || |
533 | TII->isDualSourceBlendEXP(MI)) { |
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave, it enables all threads in quads that have at
        // least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
538 | GlobalFlags |= StateStrictWQM; |
539 | |
540 | if (Opcode == AMDGPU::STRICT_WQM) { |
          LowerToMovInstrs.push_back(&MI);
542 | } else { |
          // Dual-source blend export acts as implicit strict-WQM: its sources
          // need to be shuffled in strict WQM, but the export itself needs to
          // run in Exact mode.
546 | BBI.Needs |= StateExact; |
547 | if (!(BBI.InNeeds & StateExact)) { |
548 | BBI.InNeeds |= StateExact; |
            Worklist.push_back(MBB);
550 | } |
551 | GlobalFlags |= StateExact; |
552 | III.Disabled = StateWQM | StateStrict; |
553 | } |
554 | continue; |
555 | } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || |
556 | Opcode == AMDGPU::DS_PARAM_LOAD || |
557 | Opcode == AMDGPU::LDS_DIRECT_LOAD || |
558 | Opcode == AMDGPU::DS_DIRECT_LOAD) { |
        // Mark these StrictWQM, but only for the instruction, not its
        // operands. This avoids unnecessarily marking M0 as requiring WQM.
561 | InstrInfo &II = Instructions[&MI]; |
562 | II.Needs |= StateStrictWQM; |
563 | GlobalFlags |= StateStrictWQM; |
564 | continue; |
565 | } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || |
566 | Opcode == AMDGPU::V_SET_INACTIVE_B64) { |
567 | III.Disabled = StateStrict; |
        MachineOperand &Inactive = MI.getOperand(2);
569 | if (Inactive.isReg()) { |
570 | if (Inactive.isUndef()) { |
            LowerToCopyInstrs.push_back(&MI);
          } else {
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
          }
        }
        SetInactiveInstrs.push_back(&MI);
577 | continue; |
578 | } else if (TII->isDisableWQM(MI)) { |
579 | BBI.Needs |= StateExact; |
580 | if (!(BBI.InNeeds & StateExact)) { |
581 | BBI.InNeeds |= StateExact; |
          Worklist.push_back(MBB);
583 | } |
584 | GlobalFlags |= StateExact; |
585 | III.Disabled = StateWQM | StateStrict; |
586 | continue; |
587 | } else { |
588 | if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) { |
        LiveMaskQueries.push_back(&MI);
590 | } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR || |
591 | Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR || |
592 | Opcode == AMDGPU::SI_DEMOTE_I1) { |
        KillInstrs.push_back(&MI);
594 | BBI.NeedsLowering = true; |
595 | } else if (WQMOutputs) { |
596 | // The function is in machine SSA form, which means that physical |
597 | // VGPRs correspond to shader inputs and outputs. Inputs are |
598 | // only used, outputs are only defined. |
599 | // FIXME: is this still valid? |
600 | for (const MachineOperand &MO : MI.defs()) { |
601 | if (!MO.isReg()) |
602 | continue; |
603 | |
604 | Register Reg = MO.getReg(); |
605 | |
606 | if (!Reg.isVirtual() && |
              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
608 | Flags = StateWQM; |
609 | break; |
610 | } |
611 | } |
612 | } |
613 | |
614 | if (!Flags) |
615 | continue; |
616 | } |
617 | |
      markInstruction(MI, Flags, Worklist);
619 | GlobalFlags |= Flags; |
620 | } |
621 | } |
622 | |
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }
633 | |
634 | return GlobalFlags; |
635 | } |
636 | |
637 | void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, |
638 | std::vector<WorkItem>& Worklist) { |
639 | MachineBasicBlock *MBB = MI.getParent(); |
  // Take a copy to prevent dangling references.
  InstrInfo II = Instructions[&MI];
641 | BlockInfo &BI = Blocks[MBB]; |
642 | |
643 | // Control flow-type instructions and stores to temporary memory that are |
644 | // followed by WQM computations must themselves be in WQM. |
645 | if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) && |
646 | (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { |
647 | Instructions[&MI].Needs = StateWQM; |
648 | II.Needs = StateWQM; |
649 | } |
650 | |
651 | // Propagate to block level |
652 | if (II.Needs & StateWQM) { |
653 | BI.Needs |= StateWQM; |
654 | if (!(BI.InNeeds & StateWQM)) { |
655 | BI.InNeeds |= StateWQM; |
      Worklist.push_back(MBB);
657 | } |
658 | } |
659 | |
660 | // Propagate backwards within block |
661 | if (MachineInstr *PrevMI = MI.getPrevNode()) { |
662 | char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds; |
663 | if (!PrevMI->isPHI()) { |
664 | InstrInfo &PrevII = Instructions[PrevMI]; |
665 | if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { |
666 | PrevII.OutNeeds |= InNeeds; |
        Worklist.push_back(PrevMI);
668 | } |
669 | } |
670 | } |
671 | |
672 | // Propagate WQM flag to instruction inputs |
673 | assert(!(II.Needs & StateExact)); |
674 | |
675 | if (II.Needs != 0) |
    markInstructionUses(MI, II.Needs, Worklist);
677 | |
678 | // Ensure we process a block containing StrictWWM/StrictWQM, even if it does |
679 | // not require any WQM transitions. |
680 | if (II.Needs & StateStrictWWM) |
681 | BI.Needs |= StateStrictWWM; |
682 | if (II.Needs & StateStrictWQM) |
683 | BI.Needs |= StateStrictWQM; |
684 | } |
685 | |
686 | void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, |
687 | std::vector<WorkItem>& Worklist) { |
688 | BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references. |
689 | |
690 | // Propagate through instructions |
691 | if (!MBB.empty()) { |
692 | MachineInstr *LastMI = &*MBB.rbegin(); |
693 | InstrInfo &LastII = Instructions[LastMI]; |
694 | if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) { |
695 | LastII.OutNeeds |= BI.OutNeeds; |
      Worklist.push_back(LastMI);
697 | } |
698 | } |
699 | |
700 | // Predecessor blocks must provide for our WQM/Exact needs. |
701 | for (MachineBasicBlock *Pred : MBB.predecessors()) { |
702 | BlockInfo &PredBI = Blocks[Pred]; |
703 | if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds) |
704 | continue; |
705 | |
706 | PredBI.OutNeeds |= BI.InNeeds; |
707 | PredBI.InNeeds |= BI.InNeeds; |
    Worklist.push_back(Pred);
709 | } |
710 | |
711 | // All successors must be prepared to accept the same set of WQM/Exact data. |
712 | for (MachineBasicBlock *Succ : MBB.successors()) { |
713 | BlockInfo &SuccBI = Blocks[Succ]; |
714 | if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds) |
715 | continue; |
716 | |
717 | SuccBI.InNeeds |= BI.OutNeeds; |
    Worklist.push_back(Succ);
719 | } |
720 | } |
721 | |
722 | char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { |
723 | std::vector<WorkItem> Worklist; |
724 | char GlobalFlags = scanInstructions(MF, Worklist); |
725 | |
726 | while (!Worklist.empty()) { |
727 | WorkItem WI = Worklist.back(); |
728 | Worklist.pop_back(); |
729 | |
730 | if (WI.MI) |
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
734 | } |
735 | |
736 | return GlobalFlags; |
737 | } |
738 | |
739 | MachineBasicBlock::iterator |
740 | SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, |
741 | MachineBasicBlock::iterator Before) { |
742 | Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
743 | |
744 | MachineInstr *Save = |
745 | BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg) |
746 | .addReg(AMDGPU::SCC); |
747 | MachineInstr *Restore = |
748 | BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC) |
749 | .addReg(SaveReg); |
750 | |
  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);
754 | |
755 | return Restore; |
756 | } |
757 | |
758 | MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, |
759 | MachineInstr *TermMI) { |
760 | LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ " |
761 | << *TermMI << "\n" ); |
762 | |
763 | MachineBasicBlock *SplitBB = |
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
765 | |
766 | // Convert last instruction in block to a terminator. |
767 | // Note: this only covers the expected patterns |
768 | unsigned NewOpcode = 0; |
769 | switch (TermMI->getOpcode()) { |
770 | case AMDGPU::S_AND_B32: |
771 | NewOpcode = AMDGPU::S_AND_B32_term; |
772 | break; |
773 | case AMDGPU::S_AND_B64: |
774 | NewOpcode = AMDGPU::S_AND_B64_term; |
775 | break; |
776 | case AMDGPU::S_MOV_B32: |
777 | NewOpcode = AMDGPU::S_MOV_B32_term; |
778 | break; |
779 | case AMDGPU::S_MOV_B64: |
780 | NewOpcode = AMDGPU::S_MOV_B64_term; |
781 | break; |
782 | default: |
783 | break; |
784 | } |
785 | if (NewOpcode) |
786 | TermMI->setDesc(TII->get(NewOpcode)); |
787 | |
788 | if (SplitBB != BB) { |
789 | // Update dominator trees |
790 | using DomTreeT = DomTreeBase<MachineBasicBlock>; |
791 | SmallVector<DomTreeT::UpdateType, 16> DTUpdates; |
792 | for (MachineBasicBlock *Succ : SplitBB->successors()) { |
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->getBase().applyUpdates(DTUpdates);
    if (PDT)
      PDT->getBase().applyUpdates(DTUpdates);
801 | |
802 | // Link blocks |
803 | MachineInstr *MI = |
804 | BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH)) |
805 | .addMBB(SplitBB); |
    LIS->InsertMachineInstrInMaps(*MI);
807 | } |
808 | |
809 | return SplitBB; |
810 | } |
811 | |
812 | MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, |
813 | MachineInstr &MI) { |
814 | const DebugLoc &DL = MI.getDebugLoc(); |
815 | unsigned Opcode = 0; |
816 | |
817 | assert(MI.getOperand(0).isReg()); |
818 | |
819 | // Comparison is for live lanes; however here we compute the inverse |
820 | // (killed lanes). This is because VCMP will always generate 0 bits |
821 | // for inactive lanes so a mask of live lanes would not be correct |
822 | // inside control flow. |
823 | // Invert the comparison by swapping the operands and adjusting |
824 | // the comparison codes. |
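  //
  // For example, a SETOLT condition (live when Op0 < Op1, ordered) maps to
  // V_CMP_NGT with the operands swapped: !(Op1 > Op0) holds for Op0 >= Op1
  // and for unordered inputs, i.e. exactly the killed lanes.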
825 | |
  switch (MI.getOperand(2).getImm()) {
827 | case ISD::SETUEQ: |
828 | Opcode = AMDGPU::V_CMP_LG_F32_e64; |
829 | break; |
830 | case ISD::SETUGT: |
831 | Opcode = AMDGPU::V_CMP_GE_F32_e64; |
832 | break; |
833 | case ISD::SETUGE: |
834 | Opcode = AMDGPU::V_CMP_GT_F32_e64; |
835 | break; |
836 | case ISD::SETULT: |
837 | Opcode = AMDGPU::V_CMP_LE_F32_e64; |
838 | break; |
839 | case ISD::SETULE: |
840 | Opcode = AMDGPU::V_CMP_LT_F32_e64; |
841 | break; |
842 | case ISD::SETUNE: |
843 | Opcode = AMDGPU::V_CMP_EQ_F32_e64; |
844 | break; |
845 | case ISD::SETO: |
846 | Opcode = AMDGPU::V_CMP_O_F32_e64; |
847 | break; |
848 | case ISD::SETUO: |
849 | Opcode = AMDGPU::V_CMP_U_F32_e64; |
850 | break; |
851 | case ISD::SETOEQ: |
852 | case ISD::SETEQ: |
853 | Opcode = AMDGPU::V_CMP_NEQ_F32_e64; |
854 | break; |
855 | case ISD::SETOGT: |
856 | case ISD::SETGT: |
857 | Opcode = AMDGPU::V_CMP_NLT_F32_e64; |
858 | break; |
859 | case ISD::SETOGE: |
860 | case ISD::SETGE: |
861 | Opcode = AMDGPU::V_CMP_NLE_F32_e64; |
862 | break; |
863 | case ISD::SETOLT: |
864 | case ISD::SETLT: |
865 | Opcode = AMDGPU::V_CMP_NGT_F32_e64; |
866 | break; |
867 | case ISD::SETOLE: |
868 | case ISD::SETLE: |
869 | Opcode = AMDGPU::V_CMP_NGE_F32_e64; |
870 | break; |
871 | case ISD::SETONE: |
872 | case ISD::SETNE: |
873 | Opcode = AMDGPU::V_CMP_NLG_F32_e64; |
874 | break; |
875 | default: |
876 | llvm_unreachable("invalid ISD:SET cond code" ); |
877 | } |
878 | |
879 | // Pick opcode based on comparison type. |
880 | MachineInstr *VcmpMI; |
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);
883 | |
884 | // VCC represents lanes killed. |
885 | Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; |
886 | |
  if (TRI->isVGPR(*MRI, Op0.getReg())) {
888 | Opcode = AMDGPU::getVOPe32(Opcode); |
889 | VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0); |
890 | } else { |
891 | VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)) |
892 | .addReg(VCC, RegState::Define) |
893 | .addImm(0) // src0 modifiers |
894 | .add(Op1) |
895 | .addImm(0) // src1 modifiers |
896 | .add(Op0) |
897 | .addImm(0); // omod |
898 | } |
899 | |
900 | MachineInstr *MaskUpdateMI = |
901 | BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) |
902 | .addReg(LiveMaskReg) |
903 | .addReg(VCC); |
904 | |
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
907 | MachineInstr *EarlyTermMI = |
908 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); |
909 | |
910 | MachineInstr *ExecMaskMI = |
911 | BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC); |
912 | |
913 | assert(MBB.succ_size() == 1); |
914 | MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) |
915 | .addMBB(*MBB.succ_begin()); |
916 | |
917 | // Update live intervals |
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);
920 | |
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);
925 | |
926 | return NewTerm; |
927 | } |
928 | |
929 | MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, |
930 | MachineInstr &MI, bool IsWQM) { |
931 | const DebugLoc &DL = MI.getDebugLoc(); |
932 | MachineInstr *MaskUpdateMI = nullptr; |
933 | |
934 | const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1); |
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
937 | MachineInstr *ComputeKilledMaskMI = nullptr; |
938 | Register CndReg = !Op.isImm() ? Op.getReg() : Register(); |
939 | Register TmpReg; |
940 | |
941 | // Is this a static or dynamic kill? |
942 | if (Op.isImm()) { |
943 | if (Op.getImm() == KillVal) { |
944 | // Static: all active lanes are killed |
945 | MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) |
946 | .addReg(LiveMaskReg) |
947 | .addReg(Exec); |
948 | } else { |
949 | // Static: kill does nothing |
950 | MachineInstr *NewTerm = nullptr; |
951 | if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { |
952 | LIS->RemoveMachineInstrFromMaps(MI); |
953 | } else { |
954 | assert(MBB.succ_size() == 1); |
955 | NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) |
956 | .addMBB(*MBB.succ_begin()); |
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
958 | } |
      MBB.remove(&MI);
960 | return NewTerm; |
961 | } |
962 | } else { |
963 | if (!KillVal) { |
964 | // Op represents live lanes after kill, |
965 | // so exec mask needs to be factored in. |
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
967 | ComputeKilledMaskMI = |
968 | BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec); |
969 | MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) |
970 | .addReg(LiveMaskReg) |
971 | .addReg(TmpReg); |
972 | } else { |
973 | // Op represents lanes to kill |
974 | MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) |
975 | .addReg(LiveMaskReg) |
976 | .add(Op); |
977 | } |
978 | } |
979 | |
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
982 | MachineInstr *EarlyTermMI = |
983 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); |
984 | |
  // If we got this far, some lanes are still live; update EXEC to deactivate
  // lanes as appropriate.
987 | MachineInstr *NewTerm; |
988 | MachineInstr *WQMMaskMI = nullptr; |
989 | Register LiveMaskWQM; |
990 | if (IsDemote) { |
991 | // Demote - deactivate quads with only helper lanes |
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
993 | WQMMaskMI = |
994 | BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg); |
995 | NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec) |
996 | .addReg(Exec) |
997 | .addReg(LiveMaskWQM); |
998 | } else { |
999 | // Kill - deactivate lanes no longer in live mask |
1000 | if (Op.isImm()) { |
1001 | unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
1002 | NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); |
1003 | } else if (!IsWQM) { |
1004 | NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec) |
1005 | .addReg(Exec) |
1006 | .addReg(LiveMaskReg); |
1007 | } else { |
1008 | unsigned Opcode = KillVal ? AndN2Opc : AndOpc; |
1009 | NewTerm = |
1010 | BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op); |
1011 | } |
1012 | } |
1013 | |
1014 | // Update live intervals |
1015 | LIS->RemoveMachineInstrFromMaps(MI); |
  MBB.remove(&MI);
1017 | assert(EarlyTermMI); |
1018 | assert(MaskUpdateMI); |
1019 | assert(NewTerm); |
1020 | if (ComputeKilledMaskMI) |
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);
1027 | |
1028 | if (CndReg) { |
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1036 | |
1037 | return NewTerm; |
1038 | } |
1039 | |
1040 | // Convert a strict mode transition to a pseudo transition. |
1041 | // This still pre-allocates registers to prevent clobbering, |
1042 | // but avoids any EXEC mask changes. |
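//
// For example, an ENTER_STRICT_WQM/EXIT_STRICT_WQM pair that both begins and
// ends in WQM leaves EXEC unchanged, so it can be rewritten to
// ENTER_PSEUDO_WM/EXIT_PSEUDO_WM, keeping the register allocation constraints
// without a redundant mask save and restore.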
1043 | void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB, |
1044 | MachineInstr *Entry, |
1045 | MachineInstr *Exit) { |
1046 | assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM); |
1047 | assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM); |
1048 | |
  Register SaveOrig = Entry->getOperand(0).getReg();
1050 | |
1051 | MachineInstr *NewEntry = |
1052 | BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM)); |
1053 | MachineInstr *NewExit = |
1054 | BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM)); |
1055 | |
  LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
  Exit->eraseFromParent();

  LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
  Entry->eraseFromParent();

  LIS->removeInterval(SaveOrig);
1063 | } |
1064 | |
// Replace (or supplement) instructions accessing the live mask.
// This can only happen once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1068 | void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { |
  auto BII = Blocks.find(&MBB);
1070 | if (BII == Blocks.end()) |
1071 | return; |
1072 | |
1073 | const BlockInfo &BI = BII->second; |
1074 | if (!BI.NeedsLowering) |
1075 | return; |
1076 | |
  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1078 | |
1079 | SmallVector<MachineInstr *, 4> SplitPoints; |
1080 | char State = BI.InitialState; |
1081 | MachineInstr *StrictEntry = nullptr; |
1082 | |
  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1085 | char PreviousState = State; |
1086 | |
    if (StateTransition.count(&MI))
1088 | State = StateTransition[&MI]; |
1089 | |
1090 | MachineInstr *SplitPoint = nullptr; |
1091 | switch (MI.getOpcode()) { |
1092 | case AMDGPU::SI_DEMOTE_I1: |
1093 | case AMDGPU::SI_KILL_I1_TERMINATOR: |
      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1095 | break; |
1096 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: |
1097 | SplitPoint = lowerKillF32(MBB, MI); |
1098 | break; |
1099 | case AMDGPU::ENTER_STRICT_WQM: |
1100 | StrictEntry = PreviousState == StateWQM ? &MI : nullptr; |
1101 | break; |
1102 | case AMDGPU::EXIT_STRICT_WQM: |
1103 | if (State == StateWQM && StrictEntry) { |
1104 | // Transition WQM -> StrictWQM -> WQM detected. |
        lowerPseudoStrictMode(MBB, StrictEntry, &MI);
1106 | } |
1107 | StrictEntry = nullptr; |
1108 | break; |
1109 | case AMDGPU::ENTER_STRICT_WWM: |
1110 | case AMDGPU::EXIT_STRICT_WWM: |
1111 | StrictEntry = nullptr; |
1112 | break; |
1113 | default: |
1114 | break; |
1115 | } |
1116 | if (SplitPoint) |
      SplitPoints.push_back(SplitPoint);
1118 | } |
1119 | |
1120 | // Perform splitting after instruction scan to simplify iteration. |
1121 | if (!SplitPoints.empty()) { |
1122 | MachineBasicBlock *BB = &MBB; |
1123 | for (MachineInstr *MI : SplitPoints) { |
      BB = splitBlock(BB, MI);
1125 | } |
1126 | } |
1127 | } |
1128 | |
1129 | // Return an iterator in the (inclusive) range [First, Last] at which |
1130 | // instructions can be safely inserted, keeping in mind that some of the |
1131 | // instructions we want to add necessarily clobber SCC. |
1132 | MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( |
1133 | MachineBasicBlock &MBB, MachineBasicBlock::iterator First, |
1134 | MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { |
1135 | if (!SaveSCC) |
1136 | return PreferLast ? Last : First; |
1137 | |
1138 | LiveRange &LR = |
1139 | LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin()); |
1140 | auto MBBE = MBB.end(); |
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1145 | SlotIndex Idx = PreferLast ? LastIdx : FirstIdx; |
1146 | const LiveRange::Segment *S; |
1147 | |
1148 | for (;;) { |
1149 | S = LR.getSegmentContaining(Idx); |
1150 | if (!S) |
1151 | break; |
1152 | |
1153 | if (PreferLast) { |
1154 | SlotIndex Next = S->start.getBaseIndex(); |
1155 | if (Next < FirstIdx) |
1156 | break; |
1157 | Idx = Next; |
1158 | } else { |
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
1165 | if (Next > LastIdx) |
1166 | break; |
1167 | Idx = Next; |
1168 | } |
1169 | } |
1170 | |
1171 | MachineBasicBlock::iterator MBBI; |
1172 | |
  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1174 | MBBI = MI; |
1175 | else { |
1176 | assert(Idx == LIS->getMBBEndIdx(&MBB)); |
1177 | MBBI = MBB.end(); |
1178 | } |
1179 | |
1180 | // Move insertion point past any operations modifying EXEC. |
1181 | // This assumes that the value of SCC defined by any of these operations |
1182 | // does not need to be preserved. |
1183 | while (MBBI != Last) { |
1184 | bool IsExecDef = false; |
1185 | for (const MachineOperand &MO : MBBI->all_defs()) { |
1186 | IsExecDef |= |
1187 | MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC; |
1188 | } |
1189 | if (!IsExecDef) |
1190 | break; |
1191 | MBBI++; |
1192 | S = nullptr; |
1193 | } |
1194 | |
1195 | if (S) |
    MBBI = saveSCC(MBB, MBBI);
1197 | |
1198 | return MBBI; |
1199 | } |
1200 | |
1201 | void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, |
1202 | MachineBasicBlock::iterator Before, |
1203 | Register SaveWQM) { |
1204 | bool IsTerminator = Before == MBB.end(); |
1205 | if (!IsTerminator) { |
1206 | auto FirstTerm = MBB.getFirstTerminator(); |
1207 | if (FirstTerm != MBB.end()) { |
      SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
      SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1210 | IsTerminator = BeforeIdx > FirstTermIdx; |
1211 | } |
1212 | } |
1213 | |
1214 | MachineInstr *MI; |
1215 | |
1216 | if (SaveWQM) { |
1217 | unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc; |
1218 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM) |
1219 | .addReg(LiveMaskReg); |
1220 | } else { |
1221 | unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc; |
1222 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec) |
1223 | .addReg(Exec) |
1224 | .addReg(LiveMaskReg); |
1225 | } |
1226 | |
  LIS->InsertMachineInstrInMaps(*MI);
1228 | StateTransition[MI] = StateExact; |
1229 | } |
1230 | |
1231 | void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, |
1232 | MachineBasicBlock::iterator Before, |
1233 | Register SavedWQM) { |
1234 | MachineInstr *MI; |
1235 | |
1236 | if (SavedWQM) { |
1237 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec) |
1238 | .addReg(SavedWQM); |
1239 | } else { |
1240 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec); |
1241 | } |
1242 | |
  LIS->InsertMachineInstrInMaps(*MI);
1244 | StateTransition[MI] = StateWQM; |
1245 | } |
1246 | |
1247 | void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB, |
1248 | MachineBasicBlock::iterator Before, |
1249 | Register SaveOrig, char StrictStateNeeded) { |
1250 | MachineInstr *MI; |
1251 | assert(SaveOrig); |
1252 | assert(StrictStateNeeded == StateStrictWWM || |
1253 | StrictStateNeeded == StateStrictWQM); |
1254 | |
1255 | if (StrictStateNeeded == StateStrictWWM) { |
1256 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM), |
1257 | SaveOrig) |
1258 | .addImm(-1); |
1259 | } else { |
1260 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM), |
1261 | SaveOrig) |
1262 | .addImm(-1); |
1263 | } |
  LIS->InsertMachineInstrInMaps(*MI);
1265 | StateTransition[MI] = StrictStateNeeded; |
1266 | |
  // Mark block as needing lowering so it will be checked for unnecessary
  // transitions.
  auto BII = Blocks.find(&MBB);
1269 | if (BII != Blocks.end()) |
1270 | BII->second.NeedsLowering = true; |
1271 | } |
1272 | |
1273 | void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, |
1274 | MachineBasicBlock::iterator Before, |
1275 | Register SavedOrig, char NonStrictState, |
1276 | char CurrentStrictState) { |
1277 | MachineInstr *MI; |
1278 | |
1279 | assert(SavedOrig); |
1280 | assert(CurrentStrictState == StateStrictWWM || |
1281 | CurrentStrictState == StateStrictWQM); |
1282 | |
1283 | if (CurrentStrictState == StateStrictWWM) { |
1284 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), |
1285 | Exec) |
1286 | .addReg(SavedOrig); |
1287 | } else { |
1288 | MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM), |
1289 | Exec) |
1290 | .addReg(SavedOrig); |
1291 | } |
  LIS->InsertMachineInstrInMaps(*MI);
1293 | StateTransition[MI] = NonStrictState; |
1294 | } |
1295 | |
1296 | void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { |
  auto BII = Blocks.find(&MBB);
1298 | if (BII == Blocks.end()) |
1299 | return; |
1300 | |
1301 | BlockInfo &BI = BII->second; |
1302 | |
1303 | // This is a non-entry block that is WQM throughout, so no need to do |
1304 | // anything. |
1305 | if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { |
1306 | BI.InitialState = StateWQM; |
1307 | return; |
1308 | } |
1309 | |
1310 | LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) |
1311 | << ":\n" ); |
1312 | |
1313 | Register SavedWQMReg; |
1314 | Register SavedNonStrictReg; |
1315 | bool WQMFromExec = IsEntry; |
1316 | char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; |
1317 | char NonStrictState = 0; |
1318 | const TargetRegisterClass *BoolRC = TRI->getBoolRC(); |
1319 | |
1320 | auto II = MBB.getFirstNonPHI(), IE = MBB.end(); |
1321 | if (IsEntry) { |
1322 | // Skip the instruction that saves LiveMask |
1323 | if (II != IE && II->getOpcode() == AMDGPU::COPY && |
1324 | II->getOperand(1).getReg() == TRI->getExec()) |
1325 | ++II; |
1326 | } |
1327 | |
1328 | // This stores the first instruction where it's safe to switch from WQM to |
1329 | // Exact or vice versa. |
1330 | MachineBasicBlock::iterator FirstWQM = IE; |
1331 | |
1332 | // This stores the first instruction where it's safe to switch from Strict |
1333 | // mode to Exact/WQM or to switch to Strict mode. It must always be the same |
1334 | // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must |
1335 | // be safe to switch to/from WQM as well. |
1336 | MachineBasicBlock::iterator FirstStrict = IE; |
1337 | |
  // Record initial state in block information.
1339 | BI.InitialState = State; |
1340 | |
1341 | for (;;) { |
1342 | MachineBasicBlock::iterator Next = II; |
1343 | char Needs = StateExact | StateWQM; // Strict mode is disabled by default. |
1344 | char OutNeeds = 0; |
1345 | |
1346 | if (FirstWQM == IE) |
1347 | FirstWQM = II; |
1348 | |
1349 | if (FirstStrict == IE) |
1350 | FirstStrict = II; |
1351 | |
1352 | // First, figure out the allowed states (Needs) based on the propagated |
1353 | // flags. |
1354 | if (II != IE) { |
1355 | MachineInstr &MI = *II; |
1356 | |
      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
1359 | if (III != Instructions.end()) { |
1360 | if (III->second.Needs & StateStrictWWM) |
1361 | Needs = StateStrictWWM; |
1362 | else if (III->second.Needs & StateStrictWQM) |
1363 | Needs = StateStrictWQM; |
1364 | else if (III->second.Needs & StateWQM) |
1365 | Needs = StateWQM; |
1366 | else |
1367 | Needs &= ~III->second.Disabled; |
1368 | OutNeeds = III->second.OutNeeds; |
1369 | } |
1370 | } else { |
1371 | // If the instruction doesn't actually need a correct EXEC, then we can |
1372 | // safely leave Strict mode enabled. |
1373 | Needs = StateExact | StateWQM | StateStrict; |
1374 | } |
1375 | |
1376 | // Exact mode exit can occur in terminators, but must be before branches. |
1377 | if (MI.isBranch() && OutNeeds == StateExact) |
1378 | Needs = StateExact; |
1379 | |
1380 | ++Next; |
1381 | } else { |
1382 | // End of basic block |
1383 | if (BI.OutNeeds & StateWQM) |
1384 | Needs = StateWQM; |
1385 | else if (BI.OutNeeds == StateExact) |
1386 | Needs = StateExact; |
1387 | else |
1388 | Needs = StateWQM | StateExact; |
1389 | } |
1390 | |
1391 | // Now, transition if necessary. |
1392 | if (!(Needs & State)) { |
1393 | MachineBasicBlock::iterator First; |
1394 | if (State == StateStrictWWM || Needs == StateStrictWWM || |
1395 | State == StateStrictWQM || Needs == StateStrictWQM) { |
1396 | // We must switch to or from Strict mode. |
1397 | First = FirstStrict; |
1398 | } else { |
1399 | // We only need to switch to/from WQM, so we can use FirstWQM. |
1400 | First = FirstWQM; |
1401 | } |
1402 | |
1403 | // Whether we need to save SCC depends on start and end states. |
1404 | bool SaveSCC = false; |
1405 | switch (State) { |
1406 | case StateExact: |
1407 | case StateStrictWWM: |
1408 | case StateStrictWQM: |
1409 | // Exact/Strict -> Strict: save SCC |
1410 | // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec |
1411 | // Exact/Strict -> Exact: no save |
1412 | SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec); |
1413 | break; |
1414 | case StateWQM: |
1415 | // WQM -> Exact/Strict: save SCC |
1416 | SaveSCC = !(Needs & StateWQM); |
1417 | break; |
1418 | default: |
1419 | llvm_unreachable("Unknown state" ); |
1420 | break; |
1421 | } |
1422 | MachineBasicBlock::iterator Before = |
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1424 | |
1425 | if (State & StateStrict) { |
1426 | assert(State == StateStrictWWM || State == StateStrictWQM); |
1427 | assert(SavedNonStrictReg); |
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1431 | SavedNonStrictReg = 0; |
1432 | State = NonStrictState; |
1433 | } |
1434 | |
1435 | if (Needs & StateStrict) { |
1436 | NonStrictState = State; |
1437 | assert(Needs == StateStrictWWM || Needs == StateStrictWQM); |
1438 | assert(!SavedNonStrictReg); |
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1442 | State = Needs; |
1443 | |
1444 | } else { |
1445 | if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { |
1446 | if (!WQMFromExec && (OutNeeds & StateWQM)) { |
1447 | assert(!SavedWQMReg); |
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1449 | } |
1450 | |
          toExact(MBB, Before, SavedWQMReg);
1452 | State = StateExact; |
1453 | } else if (State == StateExact && (Needs & StateWQM) && |
1454 | !(Needs & StateExact)) { |
1455 | assert(WQMFromExec == (SavedWQMReg == 0)); |
1456 | |
          toWQM(MBB, Before, SavedWQMReg);
1458 | |
1459 | if (SavedWQMReg) { |
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1461 | SavedWQMReg = 0; |
1462 | } |
1463 | State = StateWQM; |
1464 | } else { |
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, in which
          // case there is nothing to do.
1468 | assert(Needs & State); |
1469 | } |
1470 | } |
1471 | } |
1472 | |
1473 | if (Needs != (StateExact | StateWQM | StateStrict)) { |
1474 | if (Needs != (StateExact | StateWQM)) |
1475 | FirstWQM = IE; |
1476 | FirstStrict = IE; |
1477 | } |
1478 | |
1479 | if (II == IE) |
1480 | break; |
1481 | |
1482 | II = Next; |
1483 | } |
1484 | assert(!SavedWQMReg); |
1485 | assert(!SavedNonStrictReg); |
1486 | } |
1487 | |
1488 | void SIWholeQuadMode::lowerLiveMaskQueries() { |
1489 | for (MachineInstr *MI : LiveMaskQueries) { |
1490 | const DebugLoc &DL = MI->getDebugLoc(); |
    Register Dest = MI->getOperand(0).getReg();
1492 | |
1493 | MachineInstr *Copy = |
1494 | BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) |
1495 | .addReg(LiveMaskReg); |
1496 | |
    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1498 | MI->eraseFromParent(); |
1499 | } |
1500 | } |
1501 | |
1502 | void SIWholeQuadMode::lowerCopyInstrs() { |
1503 | for (MachineInstr *MI : LowerToMovInstrs) { |
1504 | assert(MI->getNumExplicitOperands() == 2); |
1505 | |
    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));
1513 | |
1514 | // Check that it already implicitly depends on exec (like all VALU movs |
1515 | // should do). |
1516 | assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) { |
1517 | return MO.isUse() && MO.getReg() == AMDGPU::EXEC; |
1518 | })); |
1519 | } else { |
1520 | // Remove early-clobber and exec dependency from simple SGPR copies. |
1521 | // This allows some to be eliminated during/post RA. |
1522 | LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI); |
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
1526 | LIS->createAndComputeVirtRegInterval(Reg); |
1527 | } |
1528 | int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr); |
1529 | while (Index >= 0) { |
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
1532 | } |
1533 | MI->setDesc(TII->get(AMDGPU::COPY)); |
1534 | LLVM_DEBUG(dbgs() << " -> " << *MI); |
1535 | } |
1536 | } |
1537 | for (MachineInstr *MI : LowerToCopyInstrs) { |
1538 | if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || |
1539 | MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { |
1540 | assert(MI->getNumExplicitOperands() == 3); |
      // The only reason we should be here is that V_SET_INACTIVE has an undef
      // input, so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->removeOperand(2);
      MI->untieRegOperand(1);
1547 | } else { |
1548 | assert(MI->getNumExplicitOperands() == 2); |
1549 | } |
1550 | |
1551 | unsigned CopyOp = MI->getOperand(1).isReg() |
1552 | ? (unsigned)AMDGPU::COPY |
1553 | : TII->getMovOpcode(TRI->getRegClassForOperandReg( |
1554 | *MRI, MI->getOperand(0))); |
1555 | MI->setDesc(TII->get(CopyOp)); |
1556 | } |
1557 | } |
1558 | |
1559 | void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { |
1560 | for (MachineInstr *MI : KillInstrs) { |
1561 | MachineBasicBlock *MBB = MI->getParent(); |
1562 | MachineInstr *SplitPoint = nullptr; |
1563 | switch (MI->getOpcode()) { |
1564 | case AMDGPU::SI_DEMOTE_I1: |
1565 | case AMDGPU::SI_KILL_I1_TERMINATOR: |
      SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MBB, *MI);
      break;
    default:
      continue;
    }
    if (SplitPoint)
      splitBlock(MBB, SplitPoint);
1576 | } |
1577 | } |
1578 | |
1579 | bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { |
1580 | LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName() |
1581 | << " ------------- \n" ); |
1582 | LLVM_DEBUG(MF.dump();); |
1583 | |
1584 | Instructions.clear(); |
1585 | Blocks.clear(); |
1586 | LiveMaskQueries.clear(); |
1587 | LowerToCopyInstrs.clear(); |
1588 | LowerToMovInstrs.clear(); |
1589 | KillInstrs.clear(); |
1590 | StateTransition.clear(); |
1591 | |
1592 | ST = &MF.getSubtarget<GCNSubtarget>(); |
1593 | |
1594 | TII = ST->getInstrInfo(); |
1595 | TRI = &TII->getRegisterInfo(); |
1596 | MRI = &MF.getRegInfo(); |
1597 | LIS = &getAnalysis<LiveIntervals>(); |
1598 | MDT = getAnalysisIfAvailable<MachineDominatorTree>(); |
1599 | PDT = getAnalysisIfAvailable<MachinePostDominatorTree>(); |
1600 | |
1601 | if (ST->isWave32()) { |
1602 | AndOpc = AMDGPU::S_AND_B32; |
1603 | AndTermOpc = AMDGPU::S_AND_B32_term; |
1604 | AndN2Opc = AMDGPU::S_ANDN2_B32; |
1605 | XorOpc = AMDGPU::S_XOR_B32; |
1606 | AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; |
1607 | AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term; |
1608 | WQMOpc = AMDGPU::S_WQM_B32; |
1609 | Exec = AMDGPU::EXEC_LO; |
1610 | } else { |
1611 | AndOpc = AMDGPU::S_AND_B64; |
1612 | AndTermOpc = AMDGPU::S_AND_B64_term; |
1613 | AndN2Opc = AMDGPU::S_ANDN2_B64; |
1614 | XorOpc = AMDGPU::S_XOR_B64; |
1615 | AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; |
1616 | AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term; |
1617 | WQMOpc = AMDGPU::S_WQM_B64; |
1618 | Exec = AMDGPU::EXEC; |
1619 | } |
1620 | |
1621 | const char GlobalFlags = analyzeFunction(MF); |
1622 | const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty()); |
1623 | |
1624 | LiveMaskReg = Exec; |
1625 | |
  // Shader is simple and does not need any state changes or complex lowering.
1627 | if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && |
1628 | LowerToMovInstrs.empty() && KillInstrs.empty()) { |
1629 | lowerLiveMaskQueries(); |
1630 | return !LiveMaskQueries.empty(); |
1631 | } |
1632 | |
1633 | MachineBasicBlock &Entry = MF.front(); |
1634 | MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); |
1635 | |
1636 | // Store a copy of the original live mask when required |
1637 | if (NeedsLiveMask || (GlobalFlags & StateWQM)) { |
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
1643 | } |
1644 | |
1645 | LLVM_DEBUG(printInfo()); |
1646 | |
1647 | lowerLiveMaskQueries(); |
1648 | lowerCopyInstrs(); |
1649 | |
1650 | // Shader only needs WQM |
1651 | if (GlobalFlags == StateWQM) { |
1652 | auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec) |
1653 | .addReg(Exec); |
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
  } else {
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
1662 | } |
1663 | |
1664 | // Compute live range for live mask |
1665 | if (LiveMaskReg != Exec) |
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1667 | |
1668 | // Physical registers like SCC aren't tracked by default anyway, so just |
1669 | // removing the ranges we computed is the simplest option for maintaining |
1670 | // the analysis results. |
1671 | LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC); |
1672 | |
1673 | // If we performed any kills then recompute EXEC |
1674 | if (!KillInstrs.empty()) |
1675 | LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); |
1676 | |
1677 | return true; |
1678 | } |
1679 | |