1 | //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// This file implements the targeting of the RegisterBankInfo class for |
10 | /// AMDGPU. |
11 | /// |
12 | /// \par |
13 | /// |
14 | /// AMDGPU has unique register bank constraints that require special high level |
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
17 | /// sort of pseudo-register bank needed to represent SGPRs used in a vector |
18 | /// boolean context. There is also the AGPR bank, which is a special purpose |
19 | /// physical register bank present on some subtargets. |
20 | /// |
21 | /// Copying from VGPR to SGPR is generally illegal, unless the value is known to |
22 | /// be uniform. It is generally not valid to legalize operands by inserting |
23 | /// copies as on other targets. Operations which require uniform, SGPR operands |
24 | /// generally require scalarization by repeatedly executing the instruction, |
25 | /// activating each set of lanes using a unique set of input values. This is |
26 | /// referred to as a waterfall loop. |
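///
/// As a rough illustration (not the exact generated sequence), a wave64
/// waterfall loop around a single use of an SGPR operand held in v0 looks
/// like:
///
///   s_mov_b64 s[2:3], exec            ; save exec
/// loop:
///   v_readfirstlane_b32 s0, v0        ; value from the first active lane
///   v_cmp_eq_u32 vcc, s0, v0          ; lanes holding the same value
///   s_and_saveexec_b64 s[4:5], vcc    ; limit exec to those lanes
///   <use s0 as the uniform operand>
///   s_xor_b64 exec, exec, s[4:5]      ; clear the lanes just executed
///   s_cbranch_execnz loop             ; repeat until all lanes are done
///   s_mov_b64 exec, s[2:3]            ; restore exec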
27 | /// |
28 | /// \par Booleans |
29 | /// |
/// Booleans (s1 values) require special consideration. A vector compare result
31 | /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit |
32 | /// register. These are represented with the VCC bank. During selection, we need |
33 | /// to be able to unambiguously go back from a register class to a register |
34 | /// bank. To distinguish whether an SGPR should use the SGPR or VCC register |
35 | /// bank, we need to know the use context type. An SGPR s1 value always means a |
36 | /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets |
37 | /// SCC, which is a 1-bit unaddressable register. This will need to be copied to |
38 | /// a 32-bit virtual register. Taken together, this means we need to adjust the |
39 | /// type of boolean operations to be regbank legal. All SALU booleans need to be |
40 | /// widened to 32-bits, and all VALU booleans need to be s1 values. |
41 | /// |
42 | /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact |
43 | /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc |
44 | /// bank. A non-boolean source (such as a truncate from a 1-bit load from |
45 | /// memory) will require a copy to the VCC bank which will require clearing the |
46 | /// high bits and inserting a compare. |
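///
/// Illustrative MIR after regbank legalization (register numbers and banks
/// are made up for the example): a divergent boolean lives in the VCC bank
/// as an s1,
///
///   %cond:vcc(s1) = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32)
///   %sel:vgpr(s32) = G_SELECT %cond:vcc(s1), %x:vgpr(s32), %y:vgpr(s32)
///
/// while a uniform boolean is widened to a 32-bit SGPR value:
///
///   %cond:sgpr(s32) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32)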
47 | /// |
48 | /// \par Constant bus restriction |
49 | /// |
50 | /// VALU instructions have a limitation known as the constant bus |
51 | /// restriction. Most VALU instructions can use SGPR operands, but may read at |
/// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
55 | /// operands should be legal as an SGPR, but this is contextually dependent on |
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
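///
/// For example (illustrative), "v_add_f32_e64 v0, s0, s0" reads only one
/// unique SGPR and is valid everywhere, while "v_add_f32_e64 v0, s0, s1"
/// reads two and violates the constant bus restriction before gfx10.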
58 | /// |
59 | /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_* |
60 | /// operation should have its source operands all mapped to VGPRs (except for |
/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 | /// mapping. Anything beyond the simplest 1:1 instruction selection would be too |
63 | /// complicated to solve here. Every optimization pattern or instruction |
64 | /// selected to multiple outputs would have to enforce this rule, and there |
65 | /// would be additional complexity in tracking this rule for every G_* |
66 | /// operation. By forcing all inputs to VGPRs, it also simplifies the task of |
67 | /// picking the optimal operand combination from a post-isel optimization pass. |
68 | /// |
69 | //===----------------------------------------------------------------------===// |
70 | |
71 | #include "AMDGPURegisterBankInfo.h" |
72 | |
73 | #include "AMDGPU.h" |
74 | #include "AMDGPUGlobalISelUtils.h" |
75 | #include "AMDGPUInstrInfo.h" |
76 | #include "GCNSubtarget.h" |
77 | #include "SIMachineFunctionInfo.h" |
78 | #include "SIRegisterInfo.h" |
79 | #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" |
80 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" |
81 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" |
82 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" |
83 | #include "llvm/CodeGen/RegisterBank.h" |
84 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
85 | |
86 | #define GET_TARGET_REGBANK_IMPL |
87 | #include "AMDGPUGenRegisterBank.inc" |
88 | |
89 | // This file will be TableGen'ed at some point. |
90 | #include "AMDGPUGenRegisterBankInfo.def" |
91 | |
92 | using namespace llvm; |
93 | using namespace MIPatternMatch; |
94 | |
95 | namespace { |
96 | |
// Observer to apply a register bank to new registers created by the
// LegalizerHelper.
98 | class ApplyRegBankMapping final : public GISelChangeObserver { |
99 | private: |
100 | MachineIRBuilder &B; |
101 | const AMDGPURegisterBankInfo &RBI; |
102 | MachineRegisterInfo &MRI; |
103 | const RegisterBank *NewBank; |
104 | SmallVector<MachineInstr *, 4> NewInsts; |
105 | |
106 | public: |
107 | ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_, |
108 | MachineRegisterInfo &MRI_, const RegisterBank *RB) |
109 | : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) { |
110 | assert(!B.isObservingChanges()); |
111 | B.setChangeObserver(*this); |
112 | } |
113 | |
114 | ~ApplyRegBankMapping() { |
115 | for (MachineInstr *MI : NewInsts) |
      applyBank(*MI);
117 | |
118 | B.stopObservingChanges(); |
119 | } |
120 | |
121 | /// Set any registers that don't have a set register class or bank to SALU. |
122 | void applyBank(MachineInstr &MI) { |
123 | const unsigned Opc = MI.getOpcode(); |
124 | if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT || |
125 | Opc == AMDGPU::G_SEXT) { |
126 | // LegalizerHelper wants to use the basic legalization artifacts when |
127 | // widening etc. We don't handle selection with vcc in artifact sources, |
128 | // so we need to use a select instead to handle these properly. |
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
152 | return; |
153 | } |
154 | |
155 | #ifndef NDEBUG |
156 | if (Opc == AMDGPU::G_TRUNC) { |
      Register DstReg = MI.getOperand(0).getReg();
158 | const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI); |
159 | assert(DstBank != &AMDGPU::VCCRegBank); |
160 | } |
161 | #endif |
162 | |
163 | for (MachineOperand &Op : MI.operands()) { |
164 | if (!Op.isReg()) |
165 | continue; |
166 | |
167 | // We may see physical registers if building a real MI |
168 | Register Reg = Op.getReg(); |
169 | if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg)) |
170 | continue; |
171 | |
172 | const RegisterBank *RB = NewBank; |
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
183 | } |
184 | } |
185 | |
186 | void erasingInstr(MachineInstr &MI) override {} |
187 | |
188 | void createdInstr(MachineInstr &MI) override { |
189 | // At this point, the instruction was just inserted and has no operands. |
    NewInsts.push_back(&MI);
191 | } |
192 | |
193 | void changingInstr(MachineInstr &MI) override {} |
194 | void changedInstr(MachineInstr &MI) override { |
195 | // FIXME: In principle we should probably add the instruction to NewInsts, |
196 | // but the way the LegalizerHelper uses the observer, we will always see the |
197 | // registers we need to set the regbank on also referenced in a new |
198 | // instruction. |
199 | } |
200 | }; |
201 | |
202 | } |
203 | |
204 | AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) |
205 | : Subtarget(ST), TRI(Subtarget.getRegisterInfo()), |
206 | TII(Subtarget.getInstrInfo()) { |
207 | |
208 | // HACK: Until this is fully tablegen'd. |
209 | static llvm::once_flag InitializeRegisterBankFlag; |
210 | |
211 | static auto InitializeRegisterBankOnce = [this]() { |
212 | assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && |
213 | &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && |
214 | &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank); |
215 | (void)this; |
216 | }; |
217 | |
  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
219 | } |
220 | |
221 | static bool isVectorRegisterBank(const RegisterBank &Bank) { |
222 | unsigned BankID = Bank.getID(); |
223 | return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; |
224 | } |
225 | |
226 | bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const { |
227 | return RB != &AMDGPU::SGPRRegBank; |
228 | } |
229 | |
230 | unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, |
231 | const RegisterBank &Src, |
232 | TypeSize Size) const { |
233 | // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? |
234 | if (Dst.getID() == AMDGPU::SGPRRegBankID && |
235 | (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) { |
236 | return std::numeric_limits<unsigned>::max(); |
237 | } |
238 | |
239 | // Bool values are tricky, because the meaning is based on context. The SCC |
240 | // and VCC banks are for the natural scalar and vector conditions produced by |
241 | // a compare. |
242 | // |
243 | // Legalization doesn't know about the necessary context, so an s1 use may |
244 | // have been a truncate from an arbitrary value, in which case a copy (lowered |
245 | // as a compare with 0) needs to be inserted. |
246 | if (Size == 1 && |
247 | (Dst.getID() == AMDGPU::SGPRRegBankID) && |
248 | (isVectorRegisterBank(Src) || |
249 | Src.getID() == AMDGPU::SGPRRegBankID || |
250 | Src.getID() == AMDGPU::VCCRegBankID)) |
251 | return std::numeric_limits<unsigned>::max(); |
252 | |
253 | // There is no direct copy between AGPRs. |
254 | if (Dst.getID() == AMDGPU::AGPRRegBankID && |
255 | Src.getID() == AMDGPU::AGPRRegBankID) |
256 | return 4; |
257 | |
  return RegisterBankInfo::copyCost(Dst, Src, Size);
259 | } |
260 | |
261 | unsigned AMDGPURegisterBankInfo::getBreakDownCost( |
262 | const ValueMapping &ValMapping, |
263 | const RegisterBank *CurBank) const { |
264 | // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to |
265 | // VGPR. |
266 | // FIXME: Is there a better way to do this? |
267 | if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) |
268 | return 10; // This is expensive. |
269 | |
270 | assert(ValMapping.NumBreakDowns == 2 && |
271 | ValMapping.BreakDown[0].Length == 32 && |
272 | ValMapping.BreakDown[0].StartIdx == 0 && |
273 | ValMapping.BreakDown[1].Length == 32 && |
274 | ValMapping.BreakDown[1].StartIdx == 32 && |
275 | ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank); |
276 | |
277 | // 32-bit extract of a 64-bit value is just access of a subregister, so free. |
278 | // TODO: Cost of 0 hits assert, though it's not clear it's what we really |
279 | // want. |
280 | |
281 | // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR |
282 | // alignment restrictions, but this probably isn't important. |
283 | return 1; |
284 | } |
285 | |
286 | const RegisterBank & |
287 | AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, |
288 | LLT Ty) const { |
289 | if (&RC == &AMDGPU::SReg_1RegClass) |
290 | return AMDGPU::VCCRegBank; |
291 | |
292 | // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a |
293 | // VCC-like use. |
  if (TRI->isSGPRClass(&RC)) {
295 | // FIXME: This probably came from a copy from a physical register, which |
296 | // should be inferable from the copied to-type. We don't have many boolean |
297 | // physical register constraints so just assume a normal SGPR for now. |
298 | if (!Ty.isValid()) |
299 | return AMDGPU::SGPRRegBank; |
300 | |
301 | return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; |
302 | } |
303 | |
304 | return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank; |
305 | } |
306 | |
307 | template <unsigned NumOps> |
308 | RegisterBankInfo::InstructionMappings |
309 | AMDGPURegisterBankInfo::addMappingFromTable( |
310 | const MachineInstr &MI, const MachineRegisterInfo &MRI, |
311 | const std::array<unsigned, NumOps> RegSrcOpIdx, |
312 | ArrayRef<OpRegBankEntry<NumOps>> Table) const { |
313 | |
314 | InstructionMappings AltMappings; |
315 | |
316 | SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); |
317 | |
318 | unsigned Sizes[NumOps]; |
319 | for (unsigned I = 0; I < NumOps; ++I) { |
320 | Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); |
321 | Sizes[I] = getSizeInBits(Reg, MRI, *TRI); |
322 | } |
323 | |
324 | for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { |
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
326 | Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); |
327 | } |
328 | |
329 | // getInstrMapping's default mapping uses ID 1, so start at 2. |
330 | unsigned MappingID = 2; |
331 | for (const auto &Entry : Table) { |
332 | for (unsigned I = 0; I < NumOps; ++I) { |
333 | int OpIdx = RegSrcOpIdx[I]; |
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
335 | } |
336 | |
    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
340 | } |
341 | |
342 | return AltMappings; |
343 | } |
344 | |
345 | RegisterBankInfo::InstructionMappings |
346 | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( |
347 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { |
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
349 | case Intrinsic::amdgcn_readlane: { |
350 | static const OpRegBankEntry<3> Table[2] = { |
351 | // Perfectly legal. |
352 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, |
353 | |
354 | // Need a readfirstlane for the index. |
355 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } |
356 | }; |
357 | |
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
359 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table); |
360 | } |
361 | case Intrinsic::amdgcn_writelane: { |
362 | static const OpRegBankEntry<4> Table[4] = { |
363 | // Perfectly legal. |
364 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, |
365 | |
366 | // Need readfirstlane of first op |
367 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, |
368 | |
369 | // Need readfirstlane of second op |
370 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, |
371 | |
372 | // Need readfirstlane of both ops |
373 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } |
374 | }; |
375 | |
    // dst, value, lane select, previous value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
378 | return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table); |
379 | } |
380 | default: |
381 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
382 | } |
383 | } |
384 | |
385 | RegisterBankInfo::InstructionMappings |
386 | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( |
387 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { |
388 | |
389 | switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) { |
390 | case Intrinsic::amdgcn_s_buffer_load: { |
391 | static const OpRegBankEntry<2> Table[4] = { |
392 | // Perfectly legal. |
393 | { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, |
394 | |
395 | // Only need 1 register in loop |
396 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }, |
397 | |
398 | // Have to waterfall the resource. |
399 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, |
400 | |
401 | // Have to waterfall the resource, and the offset. |
402 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 } |
403 | }; |
404 | |
405 | // rsrc, offset |
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
407 | return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table); |
408 | } |
409 | case Intrinsic::amdgcn_ds_ordered_add: |
410 | case Intrinsic::amdgcn_ds_ordered_swap: { |
411 | // VGPR = M0, VGPR |
412 | static const OpRegBankEntry<3> Table[2] = { |
413 | // Perfectly legal. |
414 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, |
415 | |
416 | // Need a readfirstlane for m0 |
417 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } |
418 | }; |
419 | |
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
421 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table); |
422 | } |
423 | case Intrinsic::amdgcn_s_sendmsg: |
424 | case Intrinsic::amdgcn_s_sendmsghalt: { |
425 | // FIXME: Should have no register for immediate |
426 | static const OpRegBankEntry<1> Table[2] = { |
427 | // Perfectly legal. |
428 | { { AMDGPU::SGPRRegBankID }, 1 }, |
429 | |
430 | // Need readlane |
431 | { { AMDGPU::VGPRRegBankID }, 3 } |
432 | }; |
433 | |
    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
435 | return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table); |
436 | } |
437 | default: |
438 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
439 | } |
440 | } |
441 | |
442 | // FIXME: Returns uniform if there's no source value information. This is |
443 | // probably wrong. |
444 | bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const { |
445 | if (!MI.hasOneMemOperand()) |
446 | return false; |
447 | |
448 | const MachineMemOperand *MMO = *MI.memoperands_begin(); |
449 | const unsigned AS = MMO->getAddrSpace(); |
450 | const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || |
451 | AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; |
452 | const unsigned MemSize = 8 * MMO->getSize().getValue(); |
453 | |
454 | // Require 4-byte alignment. |
455 | return (MMO->getAlign() >= Align(4) || |
456 | (Subtarget.hasScalarSubwordLoads() && |
457 | ((MemSize == 16 && MMO->getAlign() >= Align(2)) || |
458 | (MemSize == 8 && MMO->getAlign() >= Align(1))))) && |
459 | // Can't do a scalar atomic load. |
460 | !MMO->isAtomic() && |
461 | // Don't use scalar loads for volatile accesses to non-constant address |
462 | // spaces. |
463 | (IsConst || !MMO->isVolatile()) && |
464 | // Memory must be known constant, or not written before this load. |
465 | (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) && |
466 | AMDGPUInstrInfo::isUniformMMO(MMO); |
467 | } |
468 | |
469 | RegisterBankInfo::InstructionMappings |
470 | AMDGPURegisterBankInfo::getInstrAlternativeMappings( |
471 | const MachineInstr &MI) const { |
472 | |
473 | const MachineFunction &MF = *MI.getParent()->getParent(); |
474 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
475 | |
476 | |
477 | InstructionMappings AltMappings; |
478 | switch (MI.getOpcode()) { |
479 | case TargetOpcode::G_CONSTANT: |
480 | case TargetOpcode::G_IMPLICIT_DEF: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
482 | if (Size == 1) { |
483 | static const OpRegBankEntry<1> Table[3] = { |
484 | { { AMDGPU::VGPRRegBankID }, 1 }, |
485 | { { AMDGPU::SGPRRegBankID }, 1 }, |
486 | { { AMDGPU::VCCRegBankID }, 1 } |
487 | }; |
488 | |
      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
490 | } |
491 | |
492 | [[fallthrough]]; |
493 | } |
494 | case TargetOpcode::G_FCONSTANT: |
495 | case TargetOpcode::G_FRAME_INDEX: |
496 | case TargetOpcode::G_GLOBAL_VALUE: { |
497 | static const OpRegBankEntry<1> Table[2] = { |
498 | { { AMDGPU::VGPRRegBankID }, 1 }, |
499 | { { AMDGPU::SGPRRegBankID }, 1 } |
500 | }; |
501 | |
    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
503 | } |
504 | case TargetOpcode::G_AND: |
505 | case TargetOpcode::G_OR: |
506 | case TargetOpcode::G_XOR: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
508 | |
509 | if (Size == 1) { |
510 | // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. |
511 | const InstructionMapping &SCCMapping = getInstructionMapping( |
512 | 1, 1, getOperandsMapping( |
513 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), |
514 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), |
515 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}), |
516 | 3); // Num Operands |
      AltMappings.push_back(&SCCMapping);
518 | |
519 | const InstructionMapping &VCCMapping0 = getInstructionMapping( |
520 | 2, 1, getOperandsMapping( |
521 | {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), |
522 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), |
523 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}), |
524 | 3); // Num Operands |
      AltMappings.push_back(&VCCMapping0);
526 | return AltMappings; |
527 | } |
528 | |
529 | if (Size != 64) |
530 | break; |
531 | |
532 | const InstructionMapping &SSMapping = getInstructionMapping( |
533 | 1, 1, getOperandsMapping( |
534 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
535 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
536 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
537 | 3); // Num Operands |
    AltMappings.push_back(&SSMapping);
539 | |
540 | const InstructionMapping &VVMapping = getInstructionMapping( |
541 | 2, 2, getOperandsMapping( |
542 | {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
543 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
544 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), |
545 | 3); // Num Operands |
    AltMappings.push_back(&VVMapping);
547 | break; |
548 | } |
549 | case TargetOpcode::G_LOAD: |
550 | case TargetOpcode::G_ZEXTLOAD: |
551 | case TargetOpcode::G_SEXTLOAD: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
554 | unsigned PtrSize = PtrTy.getSizeInBits(); |
555 | unsigned AS = PtrTy.getAddressSpace(); |
556 | |
557 | if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && |
558 | AS != AMDGPUAS::PRIVATE_ADDRESS) && |
559 | isScalarLoadLegal(MI)) { |
560 | const InstructionMapping &SSMapping = getInstructionMapping( |
561 | 1, 1, getOperandsMapping( |
562 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
563 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}), |
564 | 2); // Num Operands |
      AltMappings.push_back(&SSMapping);
566 | } |
567 | |
568 | const InstructionMapping &VVMapping = getInstructionMapping( |
569 | 2, 1, |
570 | getOperandsMapping( |
571 | {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
572 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), |
573 | 2); // Num Operands |
    AltMappings.push_back(&VVMapping);
575 | |
576 | // It may be possible to have a vgpr = load sgpr mapping here, because |
577 | // the mubuf instructions support this kind of load, but probably for only |
578 | // gfx7 and older. However, the addressing mode matching in the instruction |
579 | // selector should be able to do a better job of detecting and selecting |
580 | // these kinds of loads from the vgpr = load vgpr mapping. |
581 | |
    return AltMappings;
  }
585 | case TargetOpcode::G_SELECT: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
587 | const InstructionMapping &SSMapping = getInstructionMapping(1, 1, |
588 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
589 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), |
590 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
591 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
592 | 4); // Num Operands |
    AltMappings.push_back(&SSMapping);
594 | |
595 | const InstructionMapping &VVMapping = getInstructionMapping(2, 1, |
596 | getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
597 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), |
598 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
599 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), |
600 | 4); // Num Operands |
    AltMappings.push_back(&VVMapping);
602 | |
603 | return AltMappings; |
604 | } |
605 | case TargetOpcode::G_UADDE: |
606 | case TargetOpcode::G_USUBE: |
607 | case TargetOpcode::G_SADDE: |
608 | case TargetOpcode::G_SSUBE: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
610 | const InstructionMapping &SSMapping = getInstructionMapping(1, 1, |
611 | getOperandsMapping( |
612 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
613 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), |
614 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
615 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
616 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}), |
617 | 5); // Num Operands |
    AltMappings.push_back(&SSMapping);
619 | |
620 | const InstructionMapping &VVMapping = getInstructionMapping(2, 1, |
621 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
622 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), |
623 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
624 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
625 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}), |
626 | 5); // Num Operands |
    AltMappings.push_back(&VVMapping);
628 | return AltMappings; |
629 | } |
630 | case AMDGPU::G_BRCOND: { |
631 | assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); |
632 | |
633 | // TODO: Change type to 32 for scalar |
634 | const InstructionMapping &SMapping = getInstructionMapping( |
635 | 1, 1, getOperandsMapping( |
636 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}), |
637 | 2); // Num Operands |
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
                  {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
        2); // Num Operands
    AltMappings.push_back(&VMapping);
645 | return AltMappings; |
646 | } |
647 | case AMDGPU::G_INTRINSIC: |
648 | case AMDGPU::G_INTRINSIC_CONVERGENT: |
649 | return getInstrAlternativeMappingsIntrinsic(MI, MRI); |
650 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
651 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: |
652 | return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); |
653 | default: |
654 | break; |
655 | } |
656 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
657 | } |
658 | |
659 | void AMDGPURegisterBankInfo::split64BitValueForMapping( |
660 | MachineIRBuilder &B, |
661 | SmallVector<Register, 2> &Regs, |
662 | LLT HalfTy, |
663 | Register Reg) const { |
664 | assert(HalfTy.getSizeInBits() == 32); |
665 | MachineRegisterInfo *MRI = B.getMRI(); |
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
679 | } |
680 | |
681 | /// Replace the current type each register in \p Regs has with \p NewTy |
682 | static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, |
683 | LLT NewTy) { |
684 | for (Register Reg : Regs) { |
685 | assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()); |
    MRI.setType(Reg, NewTy);
687 | } |
688 | } |
689 | |
690 | static LLT getHalfSizedType(LLT Ty) { |
691 | if (Ty.isVector()) { |
692 | assert(Ty.getElementCount().isKnownMultipleOf(2)); |
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
695 | } |
696 | |
697 | assert(Ty.getScalarSizeInBits() % 2 == 0); |
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
699 | } |
700 | |
701 | // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector |
702 | // source value into a scalar register. |
703 | Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B, |
704 | MachineRegisterInfo &MRI, |
705 | Register Src) const { |
  LLT Ty = MRI.getType(Src);
707 | const RegisterBank *Bank = getRegBank(Src, MRI, *TRI); |
708 | |
709 | if (Bank == &AMDGPU::SGPRRegBank) |
710 | return Src; |
711 | |
712 | unsigned Bits = Ty.getSizeInBits(); |
713 | assert(Bits % 32 == 0); |
714 | |
715 | if (Bank != &AMDGPU::VGPRRegBank) { |
716 | // We need to copy from AGPR to VGPR |
    Src = B.buildCopy(Ty, Src).getReg(0);
718 | MRI.setRegBank(Src, AMDGPU::VGPRRegBank); |
719 | } |
720 | |
  LLT S32 = LLT::scalar(32);
722 | unsigned NumParts = Bits / 32; |
723 | SmallVector<Register, 8> SrcParts; |
724 | SmallVector<Register, 8> DstParts; |
725 | |
726 | if (Bits == 32) { |
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
732 | } |
733 | |
734 | for (unsigned i = 0; i < NumParts; ++i) { |
735 | Register SrcPart = SrcParts[i]; |
736 | Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
738 | |
739 | const TargetRegisterClass *Constrained = |
740 | constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI); |
741 | (void)Constrained; |
    assert(Constrained && "Failed to constrain readfirstlane src reg");
743 | |
744 | B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart}); |
745 | |
    DstParts.push_back(DstPart);
747 | } |
748 | |
749 | if (Bits == 32) |
750 | return DstParts[0]; |
751 | |
  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
753 | MRI.setRegBank(Dst, AMDGPU::SGPRRegBank); |
754 | return Dst; |
755 | } |
756 | |
757 | /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If |
758 | /// any of the required SGPR operands are VGPRs, perform a waterfall loop to |
759 | /// execute the instruction for each unique combination of values in all lanes |
760 | /// in the wave. The block will be split such that rest of the instructions are |
761 | /// moved to a new block. |
762 | /// |
763 | /// Essentially performs this loop: |
764 | // |
765 | /// Save Execution Mask |
766 | /// For (Lane : Wavefront) { |
767 | /// Enable Lane, Disable all other lanes |
768 | /// SGPR = read SGPR value for current lane from VGPR |
769 | /// VGPRResult[Lane] = use_op SGPR |
770 | /// } |
771 | /// Restore Execution Mask |
772 | /// |
/// There is additional complexity in comparing the operand values to identify
/// the unique values used.
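///
/// The resulting control flow, matching the block construction below, is
/// roughly:
///
///   MBB (save exec)
///     -> LoopBB (readfirstlane, compare, s_and_saveexec)
///     -> BodyBB (the rewritten instruction(s); s_xor exec)
///          -> LoopBB (if any lanes remain) or RestoreExecBB
///   RestoreExecBB (restore exec) -> RemainderBB (rest of the original block)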
775 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( |
776 | MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range, |
777 | SmallSet<Register, 4> &SGPROperandRegs) const { |
778 | // Track use registers which have already been expanded with a readfirstlane |
779 | // sequence. This may have multiple uses if moving a sequence. |
780 | DenseMap<Register, Register> WaterfalledRegMap; |
781 | |
782 | MachineBasicBlock &MBB = B.getMBB(); |
783 | MachineFunction *MF = &B.getMF(); |
784 | |
785 | const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); |
786 | const unsigned MovExecOpc = |
787 | Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
788 | const unsigned MovExecTermOpc = |
789 | Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; |
790 | |
791 | const unsigned XorTermOpc = Subtarget.isWave32() ? |
792 | AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; |
793 | const unsigned AndSaveExecOpc = Subtarget.isWave32() ? |
794 | AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; |
795 | const unsigned ExecReg = Subtarget.isWave32() ? |
796 | AMDGPU::EXEC_LO : AMDGPU::EXEC; |
797 | |
798 | #ifndef NDEBUG |
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
800 | #endif |
801 | |
802 | MachineRegisterInfo &MRI = *B.getMRI(); |
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);
812 | |
813 | // To insert the loop we need to split the block. Move everything before this |
814 | // point to a new block, and insert a new empty block before this instruction. |
815 | MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); |
816 | MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock(); |
817 | MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); |
818 | MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); |
819 | MachineFunction::iterator MBBI(MBB); |
820 | ++MBBI; |
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);
829 | |
830 | // Move the rest of the block into a new block. |
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());
838 | |
  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(BodyBB);
845 | |
846 | const DebugLoc &DL = B.getDL(); |
847 | |
848 | MachineInstr &FirstInst = *Range.begin(); |
849 | |
850 | // Move the instruction into the loop body. Note we moved everything after |
851 | // Range.end() already into a new block, so Range.end() is no longer valid. |
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
853 | |
854 | // Figure out the iterator range after splicing the instructions. |
855 | MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); |
856 | auto NewEnd = BodyBB->end(); |
857 | |
858 | B.setMBB(*LoopBB); |
859 | |
  LLT S1 = LLT::scalar(1);
861 | Register CondReg; |
862 | |
863 | assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); |
864 | |
  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
869 | continue; |
870 | |
871 | // See if we already processed this register in another instruction in the |
872 | // sequence. |
      auto OldVal = WaterfalledRegMap.find(OldReg);
874 | if (OldVal != WaterfalledRegMap.end()) { |
875 | Op.setReg(OldVal->second); |
876 | continue; |
877 | } |
878 | |
879 | Register OpReg = Op.getReg(); |
      LLT OpTy = MRI.getType(OpReg);
881 | |
882 | const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI); |
883 | if (OpBank != &AMDGPU::VGPRRegBank) { |
884 | // Insert copy from AGPR to VGPR before the loop. |
885 | B.setMBB(MBB); |
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
887 | MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank); |
888 | B.setMBB(*LoopBB); |
889 | } |
890 | |
      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
892 | |
893 | // Build the comparison(s). |
894 | unsigned OpSize = OpTy.getSizeInBits(); |
895 | bool Is64 = OpSize % 64 == 0; |
896 | unsigned PartSize = Is64 ? 64 : 32; |
      LLT PartTy = LLT::scalar(PartSize);
898 | unsigned NumParts = OpSize / PartSize; |
899 | SmallVector<Register, 8> OpParts; |
900 | SmallVector<Register, 8> CurrentLaneParts; |
901 | |
902 | if (NumParts == 1) { |
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
911 | MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank); |
912 | MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank); |
913 | } |
914 | } |
915 | |
916 | for (unsigned i = 0; i < NumParts; ++i) { |
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
919 | MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank); |
920 | |
921 | if (!CondReg) { |
922 | CondReg = CmpReg; |
923 | } else { |
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
925 | MRI.setRegBank(CondReg, AMDGPU::VCCRegBank); |
926 | } |
927 | } |
928 | |
929 | Op.setReg(CurrentLaneReg); |
930 | |
931 | // Make sure we don't re-process this register again. |
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
933 | } |
934 | } |
935 | |
936 | // The ballot becomes a no-op during instruction selection. |
937 | CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot, |
938 | {LLT::scalar(Subtarget.isWave32() ? 32 : 64)}) |
939 | .addReg(CondReg) |
940 | .getReg(0); |
  MRI.setRegClass(CondReg, WaveRC);
942 | |
943 | // Update EXEC, save the original EXEC value to VCC. |
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);
949 | |
  B.setInsertPt(*BodyBB, BodyBB->end());
951 | |
952 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. |
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);
957 | |
958 | // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use |
959 | // s_cbranch_scc0? |
960 | |
961 | // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. |
962 | B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); |
963 | |
964 | // Save the EXEC mask before the loop. |
965 | BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) |
966 | .addReg(ExecReg); |
967 | |
968 | // Restore the EXEC mask after the loop. |
969 | B.setMBB(*RestoreExecBB); |
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);
973 | |
974 | // Set the insert point after the original instruction, so any new |
975 | // instructions will be in the remainder. |
  B.setInsertPt(*RemainderBB, RemainderBB->begin());
977 | |
978 | return true; |
979 | } |
980 | |
981 | // Return any unique registers used by \p MI at \p OpIndices that need to be |
982 | // handled in a waterfall loop. Returns these registers in \p |
983 | // SGPROperandRegs. Returns true if there are any operands to handle and a |
984 | // waterfall loop is necessary. |
985 | bool AMDGPURegisterBankInfo::collectWaterfallOperands( |
986 | SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, |
987 | MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { |
988 | for (unsigned Op : OpIndices) { |
989 | assert(MI.getOperand(Op).isUse()); |
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
994 | } |
995 | |
996 | // No operands need to be replaced, so no need to loop. |
997 | return !SGPROperandRegs.empty(); |
998 | } |
999 | |
1000 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( |
1001 | MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const { |
1002 | // Use a set to avoid extra readfirstlanes in the case where multiple operands |
1003 | // are the same register. |
1004 | SmallSet<Register, 4> SGPROperandRegs; |
1005 | |
1006 | if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices)) |
1007 | return false; |
1008 | |
1009 | MachineBasicBlock::iterator I = MI.getIterator(); |
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs);
1012 | } |
1013 | |
1014 | // Legalize an operand that must be an SGPR by inserting a readfirstlane. |
1015 | void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( |
1016 | MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const { |
  Register Reg = MI.getOperand(OpIdx).getReg();
1018 | MachineRegisterInfo &MRI = *B.getMRI(); |
1019 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); |
1020 | if (Bank == &AMDGPU::SGPRRegBank) |
1021 | return; |
1022 | |
  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
1025 | } |
1026 | |
1027 | /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the |
1028 | /// rest will be in the remainder. |
1029 | static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { |
1030 | unsigned TotalSize = Ty.getSizeInBits(); |
1031 | if (!Ty.isVector()) |
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1033 | |
1034 | LLT EltTy = Ty.getElementType(); |
1035 | unsigned EltSize = EltTy.getSizeInBits(); |
1036 | assert(FirstSize % EltSize == 0); |
1037 | |
1038 | unsigned FirstPartNumElts = FirstSize / EltSize; |
1039 | unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; |
1040 | |
  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1043 | } |
1044 | |
1045 | static LLT widen96To128(LLT Ty) { |
1046 | if (!Ty.isVector()) |
    return LLT::scalar(128);
1048 | |
1049 | LLT EltTy = Ty.getElementType(); |
1050 | assert(128 % EltTy.getSizeInBits() == 0); |
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1052 | } |
1053 | |
1054 | bool AMDGPURegisterBankInfo::applyMappingLoad( |
1055 | MachineIRBuilder &B, |
1056 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
1057 | MachineInstr &MI) const { |
1058 | MachineRegisterInfo &MRI = *B.getMRI(); |
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
1061 | unsigned LoadSize = LoadTy.getSizeInBits(); |
1062 | const unsigned MaxNonSmrdLoadSize = 128; |
1063 | |
1064 | const RegisterBank *DstBank = |
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1066 | if (DstBank == &AMDGPU::SGPRRegBank) { |
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads; otherwise we have nothing to do.
1069 | if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads())) |
1070 | return false; |
1071 | |
1072 | MachineMemOperand *MMO = *MI.memoperands_begin(); |
1073 | const unsigned MemSize = 8 * MMO->getSize().getValue(); |
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access; 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
1078 | if (LoadSize == 32 && |
1079 | (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI))) |
1080 | return false; |
1081 | |
1082 | if (LoadSize == 32 && |
1083 | ((MemSize == 8 && MMO->getAlign() >= Align(1)) || |
1084 | (MemSize == 16 && MMO->getAlign() >= Align(2))) && |
1085 | isScalarLoadLegal(MI) && |
1086 | Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12) |
1087 | return false; |
1088 | |
    Register PtrReg = MI.getOperand(1).getReg();
1090 | |
1091 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
1092 | |
1093 | if (LoadSize == 32) { |
1094 | // This is an extending load from a sub-dword size. Widen the memory |
1095 | // access size to 4 bytes and clear the extra high bits appropriately |
      const LLT S32 = LLT::scalar(32);
1097 | if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) { |
1098 | // Must extend the sign bit into higher bits for a G_SEXTLOAD |
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1108 | } else { |
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen to a
      // 128-bit load).
1111 | if (MMO->getAlign() < Align(16)) { |
1112 | LegalizerHelper Helper(B.getMF(), ApplyBank, B); |
1113 | LLT Part64, Part32; |
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1116 | LegalizerHelper::Legalized) |
1117 | return false; |
1118 | return true; |
1119 | } else { |
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        if (WiderTy.isScalar())
          B.buildTrunc(MI.getOperand(0), WideLoad);
        else {
          B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                              WideLoad);
1127 | } |
1128 | } |
1129 | } |
1130 | |
1131 | MI.eraseFromParent(); |
1132 | return true; |
1133 | } |
1134 | |
1135 | // 128-bit loads are supported for all instruction types. |
1136 | if (LoadSize <= MaxNonSmrdLoadSize) |
1137 | return false; |
1138 | |
  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());
1144 | |
1145 | assert(LoadSize % MaxNonSmrdLoadSize == 0); |
1146 | |
1147 | // RegBankSelect only emits scalar types, so we need to reset the pointer |
1148 | // operand to a pointer type. |
1149 | Register BasePtrReg = SrcRegs[0]; |
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);
1152 | |
1153 | unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; |
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1155 | ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); |
1156 | LegalizerHelper Helper(B.getMF(), O, B); |
1157 | |
1158 | if (LoadTy.isVector()) { |
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
1160 | return false; |
1161 | } else { |
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1163 | return false; |
1164 | } |
1165 | |
1166 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
1167 | return true; |
1168 | } |
1169 | |
1170 | bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( |
1171 | MachineIRBuilder &B, |
1172 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
1173 | MachineInstr &MI) const { |
1174 | MachineRegisterInfo &MRI = *B.getMRI(); |
1175 | const MachineFunction &MF = B.getMF(); |
1176 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1177 | const auto &TFI = *ST.getFrameLowering(); |
1178 | |
1179 | // Guard in case the stack growth direction ever changes with scratch |
1180 | // instructions. |
1181 | if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown) |
1182 | return false; |
1183 | |
  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1187 | |
1188 | const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI); |
1189 | |
1190 | // TODO: Need to emit a wave reduction to get the maximum size. |
1191 | if (SizeBank != &AMDGPU::SGPRRegBank) |
1192 | return false; |
1193 | |
  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1196 | |
1197 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
1198 | Register SPReg = Info->getStackPtrOffsetReg(); |
1199 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); |
1200 | |
  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1211 | } |
1212 | |
1213 | MI.eraseFromParent(); |
1214 | return true; |
1215 | } |
1216 | |
1217 | bool AMDGPURegisterBankInfo::applyMappingImage( |
1218 | MachineIRBuilder &B, MachineInstr &MI, |
1219 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
1220 | int RsrcIdx) const { |
1221 | const int NumDefs = MI.getNumExplicitDefs(); |
1222 | |
1223 | // The reported argument index is relative to the IR intrinsic call arguments, |
1224 | // so we need to shift by the number of defs and the intrinsic ID. |
1225 | RsrcIdx += NumDefs + 1; |
1226 | |
1227 | // Insert copies to VGPR arguments. |
1228 | applyDefaultMapping(OpdMapper); |
1229 | |
1230 | // Fixup any SGPR arguments. |
1231 | SmallVector<unsigned, 4> SGPRIndexes; |
1232 | for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { |
    if (!MI.getOperand(I).isReg())
1234 | continue; |
1235 | |
1236 | // If this intrinsic has a sampler, it immediately follows rsrc. |
1237 | if (I == RsrcIdx || I == RsrcIdx + 1) |
      SGPRIndexes.push_back(I);
1239 | } |
1240 | |
  executeInWaterfallLoop(B, MI, SGPRIndexes);
1242 | return true; |
1243 | } |
1244 | |
// Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and
// store the three offsets (voffset, soffset and instoffset).
1247 | unsigned AMDGPURegisterBankInfo::setBufferOffsets( |
1248 | MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg, |
1249 | Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const { |
  const LLT S32 = LLT::scalar(32);
1251 | MachineRegisterInfo *MRI = B.getMRI(); |
1252 | |
  if (std::optional<int64_t> Imm =
          getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1259 | InstOffsetVal = ImmOffset; |
1260 | |
1261 | B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); |
1262 | B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); |
1263 | return SOffset + ImmOffset; |
1264 | } |
1265 | } |
1266 | |
1267 | Register Base; |
1268 | unsigned Offset; |
1269 | |
  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1272 | |
1273 | uint32_t SOffset, ImmOffset; |
1274 | if ((int)Offset > 0 && |
      TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
1276 | if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) { |
1277 | VOffsetReg = Base; |
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1279 | B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); |
1280 | InstOffsetVal = ImmOffset; |
1281 | return 0; // XXX - Why is this 0? |
1282 | } |
1283 | |
1284 | // If we have SGPR base, we can use it for soffset. |
1285 | if (SOffset == 0) { |
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1287 | B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); |
1288 | SOffsetReg = Base; |
1289 | InstOffsetVal = ImmOffset; |
1290 | return 0; // XXX - Why is this 0? |
1291 | } |
1292 | } |
1293 | |
1294 | // Handle the variable sgpr + vgpr case. |
1295 | MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI); |
1296 | if (Add && (int)Offset >= 0) { |
    Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
    Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
1299 | |
1300 | const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI); |
1301 | const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI); |
1302 | |
1303 | if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { |
1304 | VOffsetReg = Src0; |
1305 | SOffsetReg = Src1; |
1306 | return 0; |
1307 | } |
1308 | |
1309 | if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) { |
1310 | VOffsetReg = Src1; |
1311 | SOffsetReg = Src0; |
1312 | return 0; |
1313 | } |
1314 | } |
1315 | |
1316 | // Ensure we have a VGPR for the combined offset. This could be an issue if we |
1317 | // have an SGPR offset and a VGPR resource. |
1318 | if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) { |
1319 | VOffsetReg = CombinedOffset; |
1320 | } else { |
1321 | VOffsetReg = B.buildCopy(Res: S32, Op: CombinedOffset).getReg(Idx: 0); |
1322 | B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); |
1323 | } |
1324 | |
1325 | SOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
1326 | B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); |
1327 | return 0; |
1328 | } |
1329 | |
1330 | bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( |
1331 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
1332 | MachineInstr &MI = OpdMapper.getMI(); |
1333 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
1334 | |
1335 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1336 | Register Dst = MI.getOperand(i: 0).getReg(); |
1337 | LLT Ty = MRI.getType(Reg: Dst); |
1338 | |
1339 | const RegisterBank *RSrcBank = |
1340 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
1341 | const RegisterBank *OffsetBank = |
1342 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
1343 | if (RSrcBank == &AMDGPU::SGPRRegBank && |
1344 | OffsetBank == &AMDGPU::SGPRRegBank) |
1345 | return true; // Legal mapping |
1346 | |
1347 | // FIXME: 96-bit case was widened during legalize. We need to narrow it back |
1348 | // here but don't have an MMO. |
1349 | |
1350 | unsigned LoadSize = Ty.getSizeInBits(); |
1351 | int NumLoads = 1; |
1352 | if (LoadSize == 256 || LoadSize == 512) { |
1353 | NumLoads = LoadSize / 128; |
1354 | Ty = Ty.divide(Factor: NumLoads); |
1355 | } |
1356 | |
1357 | // Use the alignment to ensure that the required offsets will fit into the |
1358 | // immediate offsets. |
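  //
  // For example (a sketch): a 512-bit result is split into four 128-bit loads
  // at byte offsets +0, +16, +32 and +48, so the combined offset is aligned to
  // Align(64) to keep every split offset encodable.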
1359 | const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1); |
1360 | |
1361 | MachineFunction &MF = B.getMF(); |
1362 | |
1363 | Register SOffset; |
1364 | Register VOffset; |
1365 | int64_t ImmOffset = 0; |
1366 | |
1367 | unsigned MMOOffset = setBufferOffsets(B, CombinedOffset: MI.getOperand(i: 2).getReg(), VOffsetReg&: VOffset, |
1368 | SOffsetReg&: SOffset, InstOffsetVal&: ImmOffset, Alignment); |
1369 | |
1370 | // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we |
1371 | // can, but we need to track an MMO for that. |
1372 | const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; |
1373 | const Align MemAlign(4); // FIXME: ABI type alignment? |
1374 | MachineMemOperand *BaseMMO = MF.getMachineMemOperand( |
1375 | PtrInfo: MachinePointerInfo(), |
1376 | F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | |
1377 | MachineMemOperand::MOInvariant, |
1378 | Size: MemSize, BaseAlignment: MemAlign); |
1379 | if (MMOOffset != 0) |
1380 | BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset, Size: MemSize); |
1381 | |
1382 | // If only the offset is divergent, emit a MUBUF buffer load instead. We can |
1383 | // assume that the buffer is unswizzled. |
1384 | |
1385 | Register RSrc = MI.getOperand(i: 1).getReg(); |
1386 | Register VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
1387 | B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank); |
1388 | |
1389 | SmallVector<Register, 4> LoadParts(NumLoads); |
1390 | |
1391 | MachineBasicBlock::iterator MII = MI.getIterator(); |
1392 | MachineInstrSpan Span(MII, &B.getMBB()); |
1393 | |
1394 | for (int i = 0; i < NumLoads; ++i) { |
1395 | if (NumLoads == 1) { |
1396 | LoadParts[i] = Dst; |
1397 | } else { |
1398 | LoadParts[i] = MRI.createGenericVirtualRegister(Ty); |
1399 | MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank); |
1400 | } |
1401 | |
1402 | MachineMemOperand *MMO = BaseMMO; |
1403 | if (i != 0) |
      MMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset + 16 * i, Size: MemSize);
1405 | |
1406 | B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD) |
1407 | .addDef(LoadParts[i]) // vdata |
1408 | .addUse(RSrc) // rsrc |
1409 | .addUse(VIndex) // vindex |
1410 | .addUse(VOffset) // voffset |
1411 | .addUse(SOffset) // soffset |
1412 | .addImm(ImmOffset + 16 * i) // offset(imm) |
1413 | .addImm(0) // cachepolicy, swizzled buffer(imm) |
1414 | .addImm(0) // idxen(imm) |
1415 | .addMemOperand(MMO); |
1416 | } |
1417 | |
1418 | // TODO: If only the resource is a VGPR, it may be better to execute the |
1419 | // scalar load in the waterfall loop if the resource is expected to frequently |
1420 | // be dynamically uniform. |
1421 | if (RSrcBank != &AMDGPU::SGPRRegBank) { |
1422 | // Remove the original instruction to avoid potentially confusing the |
1423 | // waterfall loop logic. |
1424 | B.setInstr(*Span.begin()); |
1425 | MI.eraseFromParent(); |
1426 | |
1427 | SmallSet<Register, 4> OpsToWaterfall; |
1428 | |
1429 | OpsToWaterfall.insert(V: RSrc); |
1430 | executeInWaterfallLoop(B, make_range(x: Span.begin(), y: Span.end()), |
1431 | OpsToWaterfall); |
1432 | } |
1433 | |
1434 | if (NumLoads != 1) { |
1435 | if (Ty.isVector()) |
1436 | B.buildConcatVectors(Res: Dst, Ops: LoadParts); |
1437 | else |
1438 | B.buildMergeLikeInstr(Res: Dst, Ops: LoadParts); |
1439 | } |
1440 | |
1441 | // We removed the instruction earlier with a waterfall loop. |
1442 | if (RSrcBank == &AMDGPU::SGPRRegBank) |
1443 | MI.eraseFromParent(); |
1444 | |
1445 | return true; |
1446 | } |
1447 | |
1448 | bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B, |
1449 | const OperandsMapper &OpdMapper, |
1450 | bool Signed) const { |
1451 | MachineInstr &MI = OpdMapper.getMI(); |
1452 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
1453 | |
1454 | // Insert basic copies |
1455 | applyDefaultMapping(OpdMapper); |
1456 | |
1457 | Register DstReg = MI.getOperand(i: 0).getReg(); |
1458 | LLT Ty = MRI.getType(Reg: DstReg); |
1459 | |
1460 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1461 | |
1462 | unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1; |
1463 | Register SrcReg = MI.getOperand(i: FirstOpnd).getReg(); |
1464 | Register OffsetReg = MI.getOperand(i: FirstOpnd + 1).getReg(); |
1465 | Register WidthReg = MI.getOperand(i: FirstOpnd + 2).getReg(); |
1466 | |
1467 | const RegisterBank *DstBank = |
1468 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
1469 | if (DstBank == &AMDGPU::VGPRRegBank) { |
1470 | if (Ty == S32) |
1471 | return true; |
1472 | |
    // There is no 64-bit vgpr bitfield extract instruction, so the operation
    // is expanded to a sequence of instructions that implement it.
1475 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); |
1476 | |
1477 | const LLT S64 = LLT::scalar(SizeInBits: 64); |
1478 | // Shift the source operand so that extracted bits start at bit 0. |
1479 | auto ShiftOffset = Signed ? B.buildAShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg) |
1480 | : B.buildLShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg); |
1481 | auto UnmergeSOffset = B.buildUnmerge(Res: {S32, S32}, Op: ShiftOffset); |
1482 | |
1483 | // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions |
1484 | // if the width is a constant. |
1485 | if (auto ConstWidth = getIConstantVRegValWithLookThrough(VReg: WidthReg, MRI)) { |
1486 | // Use the 32-bit bitfield extract instruction if the width is a constant. |
1487 | // Depending on the width size, use either the low or high 32-bits. |
1488 | auto Zero = B.buildConstant(Res: S32, Val: 0); |
1489 | auto WidthImm = ConstWidth->Value.getZExtValue(); |
1490 | if (WidthImm <= 32) { |
1491 | // Use bitfield extract on the lower 32-bit source, and then sign-extend |
1492 | // or clear the upper 32-bits. |
        auto Extract =
            Signed ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg)
                   : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg);
1496 | auto Extend = |
1497 | Signed ? B.buildAShr(Dst: S32, Src0: Extract, Src1: B.buildConstant(Res: S32, Val: 31)) : Zero; |
1498 | B.buildMergeLikeInstr(Res: DstReg, Ops: {Extract, Extend}); |
1499 | } else { |
1500 | // Use bitfield extract on upper 32-bit source, and combine with lower |
1501 | // 32-bit source. |
1502 | auto UpperWidth = B.buildConstant(Res: S32, Val: WidthImm - 32); |
        auto Extract =
            Signed
                ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth)
                : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth);
1507 | B.buildMergeLikeInstr(Res: DstReg, Ops: {UnmergeSOffset.getReg(Idx: 0), Extract}); |
1508 | } |
1509 | MI.eraseFromParent(); |
1510 | return true; |
1511 | } |
1512 | |
1513 | // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit |
1514 | // operations. |
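    //
    // For example (a sketch): a signed extract with Offset = 4 and Width = 8
    // becomes ((Src >> 4) << 56) a>> 56, leaving the 8-bit field sign-extended
    // across the 64-bit result.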
1515 | auto ExtShift = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 64), Src1: WidthReg); |
1516 | auto SignBit = B.buildShl(Dst: S64, Src0: ShiftOffset, Src1: ExtShift); |
1517 | if (Signed) |
1518 | B.buildAShr(Dst: S64, Src0: SignBit, Src1: ExtShift); |
1519 | else |
1520 | B.buildLShr(Dst: S64, Src0: SignBit, Src1: ExtShift); |
1521 | MI.eraseFromParent(); |
1522 | return true; |
1523 | } |
1524 | |
1525 | // The scalar form packs the offset and width in a single operand. |
1526 | |
1527 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); |
1528 | |
1529 | // Ensure the high bits are clear to insert the offset. |
1530 | auto OffsetMask = B.buildConstant(Res: S32, Val: maskTrailingOnes<unsigned>(N: 6)); |
1531 | auto ClampOffset = B.buildAnd(Dst: S32, Src0: OffsetReg, Src1: OffsetMask); |
1532 | |
1533 | // Zeros out the low bits, so don't bother clamping the input value. |
1534 | auto ShiftWidth = B.buildShl(Dst: S32, Src0: WidthReg, Src1: B.buildConstant(Res: S32, Val: 16)); |
1535 | |
1536 | // Transformation function, pack the offset and width of a BFE into |
1537 | // the format expected by the S_BFE_I32 / S_BFE_U32. In the second |
1538 | // source, bits [5:0] contain the offset and bits [22:16] the width. |
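  //
  // For example (a sketch): Offset = 5 and Width = 10 pack to
  // (10 << 16) | 5 == 0x000A0005.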
1539 | auto MergedInputs = B.buildOr(Dst: S32, Src0: ClampOffset, Src1: ShiftWidth); |
1540 | |
1541 | // TODO: It might be worth using a pseudo here to avoid scc clobber and |
1542 | // register class constraints. |
1543 | unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : |
1544 | (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); |
1545 | |
1546 | auto MIB = B.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {SrcReg, MergedInputs}); |
1547 | if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) |
1548 | llvm_unreachable("failed to constrain BFE" ); |
1549 | |
1550 | MI.eraseFromParent(); |
1551 | return true; |
1552 | } |
1553 | |
1554 | bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( |
1555 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
1556 | MachineInstr &MI = OpdMapper.getMI(); |
1557 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
1558 | |
1559 | // Insert basic copies. |
1560 | applyDefaultMapping(OpdMapper); |
1561 | |
1562 | Register Dst0 = MI.getOperand(i: 0).getReg(); |
1563 | Register Dst1 = MI.getOperand(i: 1).getReg(); |
1564 | Register Src0 = MI.getOperand(i: 2).getReg(); |
1565 | Register Src1 = MI.getOperand(i: 3).getReg(); |
1566 | Register Src2 = MI.getOperand(i: 4).getReg(); |
1567 | |
1568 | if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank) |
1569 | return true; |
1570 | |
1571 | bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; |
1572 | LLT S1 = LLT::scalar(SizeInBits: 1); |
1573 | LLT S32 = LLT::scalar(SizeInBits: 32); |
1574 | |
1575 | bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank; |
1576 | bool Accumulate = true; |
1577 | |
1578 | if (!DstOnValu) { |
1579 | if (mi_match(R: Src2, MRI, P: m_ZeroInt())) |
1580 | Accumulate = false; |
1581 | } |
1582 | |
1583 | // Keep the multiplication on the SALU. |
1584 | Register DstHi; |
1585 | Register DstLo = B.buildMul(Dst: S32, Src0, Src1).getReg(Idx: 0); |
1586 | bool MulHiInVgpr = false; |
1587 | |
1588 | MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank); |
1589 | |
1590 | if (Subtarget.hasSMulHi()) { |
1591 | DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0, Src1).getReg(Idx: 0) |
1592 | : B.buildSMulH(Dst: S32, Src0, Src1).getReg(Idx: 0); |
1593 | MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank); |
1594 | } else { |
1595 | Register VSrc0 = B.buildCopy(Res: S32, Op: Src0).getReg(Idx: 0); |
1596 | Register VSrc1 = B.buildCopy(Res: S32, Op: Src1).getReg(Idx: 0); |
1597 | |
1598 | MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank); |
1599 | MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank); |
1600 | |
1601 | DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0) |
1602 | : B.buildSMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0); |
1603 | MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); |
1604 | |
1605 | if (!DstOnValu) { |
1606 | DstHi = buildReadFirstLane(B, MRI, Src: DstHi); |
1607 | } else { |
1608 | MulHiInVgpr = true; |
1609 | } |
1610 | } |
1611 | |
1612 | // Accumulate and produce the "carry-out" bit. |
1613 | // |
1614 | // The "carry-out" is defined as bit 64 of the result when computed as a |
1615 | // big integer. For unsigned multiply-add, this matches the usual definition |
1616 | // of carry-out. For signed multiply-add, bit 64 is the sign bit of the |
1617 | // result, which is determined as: |
1618 | // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add |
1619 | LLT CarryType = DstOnValu ? S1 : S32; |
1620 | const RegisterBank &CarryBank = |
1621 | DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; |
1622 | const RegisterBank &DstBank = |
1623 | DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; |
1624 | Register Carry; |
1625 | Register Zero; |
1626 | |
1627 | if (!IsUnsigned) { |
1628 | Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
1629 | MRI.setRegBank(Zero, |
1630 | MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); |
1631 | |
1632 | Carry = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: MulHiInVgpr ? S1 : S32, Op0: DstHi, Op1: Zero) |
1633 | .getReg(Idx: 0); |
1634 | MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank |
1635 | : AMDGPU::SGPRRegBank); |
1636 | |
1637 | if (DstOnValu && !MulHiInVgpr) { |
1638 | Carry = B.buildTrunc(Res: S1, Op: Carry).getReg(Idx: 0); |
1639 | MRI.setRegBank(Carry, AMDGPU::VCCRegBank); |
1640 | } |
1641 | } |
1642 | |
1643 | if (Accumulate) { |
1644 | if (DstOnValu) { |
1645 | DstLo = B.buildCopy(Res: S32, Op: DstLo).getReg(Idx: 0); |
1646 | DstHi = B.buildCopy(Res: S32, Op: DstHi).getReg(Idx: 0); |
1647 | MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank); |
1648 | MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); |
1649 | } |
1650 | |
1651 | auto Unmerge = B.buildUnmerge(Res: S32, Op: Src2); |
1652 | Register Src2Lo = Unmerge.getReg(Idx: 0); |
1653 | Register Src2Hi = Unmerge.getReg(Idx: 1); |
1654 | MRI.setRegBank(Reg: Src2Lo, RegBank: DstBank); |
1655 | MRI.setRegBank(Reg: Src2Hi, RegBank: DstBank); |
1656 | |
1657 | if (!IsUnsigned) { |
1658 | auto Src2Sign = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: CarryType, Op0: Src2Hi, Op1: Zero); |
1659 | MRI.setRegBank(Reg: Src2Sign.getReg(Idx: 0), RegBank: CarryBank); |
1660 | |
1661 | Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: Src2Sign).getReg(Idx: 0); |
1662 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
1663 | } |
1664 | |
1665 | auto AddLo = B.buildUAddo(Res: S32, CarryOut: CarryType, Op0: DstLo, Op1: Src2Lo); |
1666 | DstLo = AddLo.getReg(Idx: 0); |
1667 | Register CarryLo = AddLo.getReg(Idx: 1); |
1668 | MRI.setRegBank(Reg: DstLo, RegBank: DstBank); |
1669 | MRI.setRegBank(Reg: CarryLo, RegBank: CarryBank); |
1670 | |
1671 | auto AddHi = B.buildUAdde(Res: S32, CarryOut: CarryType, Op0: DstHi, Op1: Src2Hi, CarryIn: CarryLo); |
1672 | DstHi = AddHi.getReg(Idx: 0); |
1673 | MRI.setRegBank(Reg: DstHi, RegBank: DstBank); |
1674 | |
1675 | Register CarryHi = AddHi.getReg(Idx: 1); |
1676 | MRI.setRegBank(Reg: CarryHi, RegBank: CarryBank); |
1677 | |
1678 | if (IsUnsigned) { |
1679 | Carry = CarryHi; |
1680 | } else { |
1681 | Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: CarryHi).getReg(Idx: 0); |
1682 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
1683 | } |
1684 | } else { |
1685 | if (IsUnsigned) { |
1686 | Carry = B.buildConstant(Res: CarryType, Val: 0).getReg(Idx: 0); |
1687 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
1688 | } |
1689 | } |
1690 | |
1691 | B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi}); |
1692 | |
1693 | if (DstOnValu) { |
1694 | B.buildCopy(Res: Dst1, Op: Carry); |
1695 | } else { |
1696 | B.buildTrunc(Res: Dst1, Op: Carry); |
1697 | } |
1698 | |
1699 | MI.eraseFromParent(); |
1700 | return true; |
1701 | } |
1702 | |
1703 | // Return a suitable opcode for extending the operands of Opc when widening. |
1704 | static unsigned getExtendOp(unsigned Opc) { |
1705 | switch (Opc) { |
1706 | case TargetOpcode::G_ASHR: |
1707 | case TargetOpcode::G_SMIN: |
1708 | case TargetOpcode::G_SMAX: |
1709 | return TargetOpcode::G_SEXT; |
1710 | case TargetOpcode::G_LSHR: |
1711 | case TargetOpcode::G_UMIN: |
1712 | case TargetOpcode::G_UMAX: |
1713 | return TargetOpcode::G_ZEXT; |
1714 | default: |
1715 | return TargetOpcode::G_ANYEXT; |
1716 | } |
1717 | } |
1718 | |
1719 | // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding |
1720 | // any illegal vector extend or unmerge operations. |
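//
// For example (a sketch of the zero-extending case), unpacking
// %v:vgpr(<2 x s16>) emits:
//   %b:_(s32) = G_BITCAST %v
//   %lo:_(s32) = G_AND %b, 0xffff
//   %hi:_(s32) = G_LSHR %b, 16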
1721 | static std::pair<Register, Register> |
1722 | unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { |
1723 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1724 | auto Bitcast = B.buildBitcast(Dst: S32, Src); |
1725 | |
1726 | if (ExtOpcode == TargetOpcode::G_SEXT) { |
1727 | auto ExtLo = B.buildSExtInReg(Res: S32, Op: Bitcast, ImmOp: 16); |
1728 | auto ShiftHi = B.buildAShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16)); |
1729 | return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
1730 | } |
1731 | |
1732 | auto ShiftHi = B.buildLShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16)); |
1733 | if (ExtOpcode == TargetOpcode::G_ZEXT) { |
1734 | auto ExtLo = B.buildAnd(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 0xffff)); |
1735 | return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
1736 | } |
1737 | |
1738 | assert(ExtOpcode == TargetOpcode::G_ANYEXT); |
1739 | return std::pair(Bitcast.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
1740 | } |
1741 | |
// For cases where only a single copy is inserted for matching register banks,
// replace the register in the instruction operand.
1744 | static bool substituteSimpleCopyRegs( |
1745 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { |
1746 | SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); |
1747 | if (!SrcReg.empty()) { |
1748 | assert(SrcReg.size() == 1); |
1749 | OpdMapper.getMI().getOperand(i: OpIdx).setReg(SrcReg[0]); |
1750 | return true; |
1751 | } |
1752 | |
1753 | return false; |
1754 | } |
1755 | |
1756 | /// Handle register layout difference for f16 images for some subtargets. |
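///
/// On subtargets with unpacked d16 VMEM, every 16-bit component occupies a
/// full 32-bit register. For example (a sketch), a <4 x s16> store value is
/// unmerged into four s16 pieces and remerged as a <4 x s32>.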
1757 | Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, |
1758 | MachineRegisterInfo &MRI, |
1759 | Register Reg) const { |
1760 | if (!Subtarget.hasUnpackedD16VMem()) |
1761 | return Reg; |
1762 | |
1763 | const LLT S16 = LLT::scalar(SizeInBits: 16); |
1764 | LLT StoreVT = MRI.getType(Reg); |
1765 | if (!StoreVT.isVector() || StoreVT.getElementType() != S16) |
1766 | return Reg; |
1767 | |
  auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);

1771 | SmallVector<Register, 4> WideRegs; |
1772 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) |
1773 | WideRegs.push_back(Elt: Unmerge.getReg(Idx: I)); |
1774 | |
1775 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1776 | int NumElts = StoreVT.getNumElements(); |
1777 | |
1778 | return B.buildMergeLikeInstr(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs) |
1779 | .getReg(Idx: 0); |
1780 | } |
1781 | |
1782 | static std::pair<Register, unsigned> |
1783 | getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { |
1784 | int64_t Const; |
1785 | if (mi_match(R: Reg, MRI, P: m_ICst(Cst&: Const))) |
1786 | return std::pair(Register(), Const); |
1787 | |
1788 | Register Base; |
1789 | if (mi_match(R: Reg, MRI, P: m_GAdd(L: m_Reg(R&: Base), R: m_ICst(Cst&: Const)))) |
1790 | return std::pair(Base, Const); |
1791 | |
1792 | // TODO: Handle G_OR used for add case |
1793 | return std::pair(Reg, 0); |
1794 | } |
1795 | |
1796 | std::pair<Register, unsigned> |
1797 | AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, |
1798 | Register OrigOffset) const { |
1799 | const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: Subtarget); |
1800 | Register BaseReg; |
1801 | unsigned ImmOffset; |
1802 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1803 | |
1804 | // TODO: Use AMDGPU::getBaseWithConstantOffset() instead. |
1805 | std::tie(args&: BaseReg, args&: ImmOffset) = getBaseWithConstantOffset(MRI&: *B.getMRI(), |
1806 | Reg: OrigOffset); |
1807 | |
1808 | unsigned C1 = 0; |
1809 | if (ImmOffset != 0) { |
1810 | // If the immediate value is too big for the immoffset field, put only bits |
1811 | // that would normally fit in the immoffset field. The remaining value that |
1812 | // is copied/added for the voffset field is a large power of 2, and it |
1813 | // stands more chance of being CSEd with the copy/add for another similar |
1814 | // load/store. |
1815 | // However, do not do that rounding down if that is a negative |
1816 | // number, as it appears to be illegal to have a negative offset in the |
1817 | // vgpr, even if adding the immediate offset makes it positive. |
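    //
    // For example (a sketch, assuming MaxImm == 4095): ImmOffset == 4160
    // gives Overflow == 4096 and a final ImmOffset == 64, so only the large
    // power of 2 moves into the base register.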
1818 | unsigned Overflow = ImmOffset & ~MaxImm; |
1819 | ImmOffset -= Overflow; |
1820 | if ((int32_t)Overflow < 0) { |
1821 | Overflow += ImmOffset; |
1822 | ImmOffset = 0; |
1823 | } |
1824 | |
1825 | C1 = ImmOffset; |
1826 | if (Overflow != 0) { |
1827 | if (!BaseReg) |
1828 | BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0); |
1829 | else { |
1830 | auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow); |
1831 | BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0); |
1832 | } |
1833 | } |
1834 | } |
1835 | |
1836 | if (!BaseReg) |
1837 | BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
1838 | |
1839 | return {BaseReg, C1}; |
1840 | } |
1841 | |
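// Copy an SGPR value into a VGPR destination using explicit V_MOV_B32s so the
// exec-mask dependency stays visible. For example (a sketch), a 64-bit copy
// becomes two V_MOV_B32_e32 of sub0/sub1 followed by a REG_SEQUENCE.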
1842 | bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, |
1843 | Register SrcReg) const { |
1844 | MachineRegisterInfo &MRI = *B.getMRI(); |
1845 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
1846 | if (SrcTy.getSizeInBits() == 32) { |
1847 | // Use a v_mov_b32 here to make the exec dependency explicit. |
1848 | B.buildInstr(AMDGPU::V_MOV_B32_e32) |
1849 | .addDef(DstReg) |
1850 | .addUse(SrcReg); |
1851 | return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) && |
1852 | constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI); |
1853 | } |
1854 | |
1855 | Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1856 | Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1857 | |
1858 | B.buildInstr(AMDGPU::V_MOV_B32_e32) |
1859 | .addDef(TmpReg0) |
1860 | .addUse(SrcReg, 0, AMDGPU::sub0); |
1861 | B.buildInstr(AMDGPU::V_MOV_B32_e32) |
1862 | .addDef(TmpReg1) |
1863 | .addUse(SrcReg, 0, AMDGPU::sub1); |
1864 | B.buildInstr(AMDGPU::REG_SEQUENCE) |
1865 | .addDef(DstReg) |
1866 | .addUse(TmpReg0) |
1867 | .addImm(AMDGPU::sub0) |
1868 | .addUse(TmpReg1) |
1869 | .addImm(AMDGPU::sub1); |
1870 | |
1871 | return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) && |
1872 | constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); |
1873 | } |
1874 | |
1875 | /// Utility function for pushing dynamic vector indexes with a constant offset |
1876 | /// into waterfall loops. |
1877 | static void reinsertVectorIndexAdd(MachineIRBuilder &B, |
1878 | MachineInstr &IdxUseInstr, |
1879 | unsigned OpIdx, |
1880 | unsigned ConstOffset) { |
1881 | MachineRegisterInfo &MRI = *B.getMRI(); |
1882 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1883 | Register WaterfallIdx = IdxUseInstr.getOperand(i: OpIdx).getReg(); |
1884 | B.setInsertPt(MBB&: *IdxUseInstr.getParent(), II: IdxUseInstr.getIterator()); |
1885 | |
1886 | auto MaterializedOffset = B.buildConstant(Res: S32, Val: ConstOffset); |
1887 | |
1888 | auto Add = B.buildAdd(Dst: S32, Src0: WaterfallIdx, Src1: MaterializedOffset); |
1889 | MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); |
1890 | MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); |
1891 | IdxUseInstr.getOperand(i: OpIdx).setReg(Add.getReg(Idx: 0)); |
1892 | } |
1893 | |
1894 | /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the |
1895 | /// original 32-bit source value (to be inserted in the low part of the combined |
1896 | /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit |
1897 | /// value. |
1898 | static void extendLow32IntoHigh32(MachineIRBuilder &B, |
1899 | Register Hi32Reg, Register Lo32Reg, |
1900 | unsigned ExtOpc, |
1901 | const RegisterBank &RegBank, |
1902 | bool IsBooleanSrc = false) { |
1903 | if (ExtOpc == AMDGPU::G_ZEXT) { |
1904 | B.buildConstant(Res: Hi32Reg, Val: 0); |
1905 | } else if (ExtOpc == AMDGPU::G_SEXT) { |
1906 | if (IsBooleanSrc) { |
1907 | // If we know the original source was an s1, the high half is the same as |
1908 | // the low. |
1909 | B.buildCopy(Res: Hi32Reg, Op: Lo32Reg); |
1910 | } else { |
1911 | // Replicate sign bit from 32-bit extended part. |
1912 | auto ShiftAmt = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 31); |
1913 | B.getMRI()->setRegBank(Reg: ShiftAmt.getReg(Idx: 0), RegBank); |
1914 | B.buildAShr(Dst: Hi32Reg, Src0: Lo32Reg, Src1: ShiftAmt); |
1915 | } |
1916 | } else { |
    assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1918 | B.buildUndef(Res: Hi32Reg); |
1919 | } |
1920 | } |
1921 | |
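// Fold a dynamic G_EXTRACT_VECTOR_ELT into a chain of compares and selects
// when SITargetLowering::shouldExpandVectorDynExt deems it profitable. For
// example (a sketch), extracting %idx from a <2 x s32> %vec becomes:
//   %c = G_ICMP eq, %idx, 1
//   %res = G_SELECT %c, %vec.elt1, %vec.elt0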
1922 | bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( |
1923 | MachineIRBuilder &B, MachineInstr &MI, |
1924 | const OperandsMapper &OpdMapper) const { |
1925 | MachineRegisterInfo &MRI = *B.getMRI(); |
1926 | |
1927 | Register VecReg = MI.getOperand(i: 1).getReg(); |
1928 | Register Idx = MI.getOperand(i: 2).getReg(); |
1929 | |
1930 | const RegisterBank &IdxBank = |
1931 | *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
1932 | |
1933 | bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; |
1934 | |
1935 | LLT VecTy = MRI.getType(Reg: VecReg); |
1936 | unsigned EltSize = VecTy.getScalarSizeInBits(); |
1937 | unsigned NumElem = VecTy.getNumElements(); |
1938 | |
1939 | if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, |
1940 | IsDivergentIdx, Subtarget: &Subtarget)) |
1941 | return false; |
1942 | |
1943 | LLT S32 = LLT::scalar(SizeInBits: 32); |
1944 | |
1945 | const RegisterBank &DstBank = |
1946 | *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
1947 | const RegisterBank &SrcBank = |
1948 | *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
1949 | |
1950 | const RegisterBank &CCBank = |
1951 | (DstBank == AMDGPU::SGPRRegBank && |
1952 | SrcBank == AMDGPU::SGPRRegBank && |
1953 | IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank |
1954 | : AMDGPU::VCCRegBank; |
1955 | LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); |
1956 | |
1957 | if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { |
1958 | Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg(); |
1959 | MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); |
1960 | } |
1961 | |
1962 | LLT EltTy = VecTy.getScalarType(); |
1963 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
1964 | unsigned NumLanes = DstRegs.size(); |
1965 | if (!NumLanes) |
1966 | NumLanes = 1; |
1967 | else |
1968 | EltTy = MRI.getType(Reg: DstRegs[0]); |
1969 | |
1970 | auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg); |
1971 | SmallVector<Register, 2> Res(NumLanes); |
1972 | for (unsigned L = 0; L < NumLanes; ++L) |
1973 | Res[L] = UnmergeToEltTy.getReg(Idx: L); |
1974 | |
1975 | for (unsigned I = 1; I < NumElem; ++I) { |
1976 | auto IC = B.buildConstant(Res: S32, Val: I); |
1977 | MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); |
1978 | auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC); |
1979 | MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank); |
1980 | |
1981 | for (unsigned L = 0; L < NumLanes; ++L) { |
1982 | auto S = B.buildSelect(Res: EltTy, Tst: Cmp, |
1983 | Op0: UnmergeToEltTy.getReg(Idx: I * NumLanes + L), Op1: Res[L]); |
1984 | |
1985 | for (unsigned N : { 0, 2, 3 }) |
1986 | MRI.setRegBank(Reg: S->getOperand(i: N).getReg(), RegBank: DstBank); |
1987 | |
1988 | Res[L] = S->getOperand(i: 0).getReg(); |
1989 | } |
1990 | } |
1991 | |
1992 | for (unsigned L = 0; L < NumLanes; ++L) { |
1993 | Register DstReg = (NumLanes == 1) ? MI.getOperand(i: 0).getReg() : DstRegs[L]; |
1994 | B.buildCopy(Res: DstReg, Op: Res[L]); |
1995 | MRI.setRegBank(Reg: DstReg, RegBank: DstBank); |
1996 | } |
1997 | |
1998 | MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank); |
1999 | MI.eraseFromParent(); |
2000 | |
2001 | return true; |
2002 | } |
2003 | |
2004 | // Insert a cross regbank copy for a register if it already has a bank that |
2005 | // differs from the one we want to set. |
2006 | static Register constrainRegToBank(MachineRegisterInfo &MRI, |
2007 | MachineIRBuilder &B, Register &Reg, |
2008 | const RegisterBank &Bank) { |
2009 | const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); |
2010 | if (CurrBank && *CurrBank != Bank) { |
2011 | Register Copy = B.buildCopy(Res: MRI.getType(Reg), Op: Reg).getReg(Idx: 0); |
2012 | MRI.setRegBank(Reg: Copy, RegBank: Bank); |
2013 | return Copy; |
2014 | } |
2015 | |
2016 | MRI.setRegBank(Reg, RegBank: Bank); |
2017 | return Reg; |
2018 | } |
2019 | |
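// Fold a dynamic G_INSERT_VECTOR_ELT the same way: each result element is a
// select between the inserted value and the original element, keyed on
// whether the index equals that element's position (analogous to the extract
// sketch above).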
2020 | bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( |
2021 | MachineIRBuilder &B, MachineInstr &MI, |
2022 | const OperandsMapper &OpdMapper) const { |
2023 | |
2024 | MachineRegisterInfo &MRI = *B.getMRI(); |
2025 | Register VecReg = MI.getOperand(i: 1).getReg(); |
2026 | Register Idx = MI.getOperand(i: 3).getReg(); |
2027 | |
2028 | const RegisterBank &IdxBank = |
2029 | *OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank; |
2030 | |
2031 | bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; |
2032 | |
2033 | LLT VecTy = MRI.getType(Reg: VecReg); |
2034 | unsigned EltSize = VecTy.getScalarSizeInBits(); |
2035 | unsigned NumElem = VecTy.getNumElements(); |
2036 | |
2037 | if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, |
2038 | IsDivergentIdx, Subtarget: &Subtarget)) |
2039 | return false; |
2040 | |
2041 | LLT S32 = LLT::scalar(SizeInBits: 32); |
2042 | |
2043 | const RegisterBank &DstBank = |
2044 | *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2045 | const RegisterBank &SrcBank = |
2046 | *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
2047 | const RegisterBank &InsBank = |
2048 | *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
2049 | |
2050 | const RegisterBank &CCBank = |
2051 | (DstBank == AMDGPU::SGPRRegBank && |
2052 | SrcBank == AMDGPU::SGPRRegBank && |
2053 | InsBank == AMDGPU::SGPRRegBank && |
2054 | IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank |
2055 | : AMDGPU::VCCRegBank; |
2056 | LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); |
2057 | |
2058 | if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { |
2059 | Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg(); |
2060 | MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); |
2061 | } |
2062 | |
2063 | LLT EltTy = VecTy.getScalarType(); |
2064 | SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2)); |
2065 | unsigned NumLanes = InsRegs.size(); |
2066 | if (!NumLanes) { |
2067 | NumLanes = 1; |
2068 | InsRegs.push_back(Elt: MI.getOperand(i: 2).getReg()); |
2069 | } else { |
2070 | EltTy = MRI.getType(Reg: InsRegs[0]); |
2071 | } |
2072 | |
2073 | auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg); |
2074 | SmallVector<Register, 16> Ops(NumElem * NumLanes); |
2075 | |
2076 | for (unsigned I = 0; I < NumElem; ++I) { |
2077 | auto IC = B.buildConstant(Res: S32, Val: I); |
2078 | MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); |
2079 | auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC); |
2080 | MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank); |
2081 | |
2082 | for (unsigned L = 0; L < NumLanes; ++L) { |
2083 | Register Op0 = constrainRegToBank(MRI, B, Reg&: InsRegs[L], Bank: DstBank); |
2084 | Register Op1 = UnmergeToEltTy.getReg(Idx: I * NumLanes + L); |
2085 | Op1 = constrainRegToBank(MRI, B, Reg&: Op1, Bank: DstBank); |
2086 | |
2087 | Register Select = B.buildSelect(Res: EltTy, Tst: Cmp, Op0, Op1).getReg(Idx: 0); |
2088 | MRI.setRegBank(Reg: Select, RegBank: DstBank); |
2089 | |
2090 | Ops[I * NumLanes + L] = Select; |
2091 | } |
2092 | } |
2093 | |
2094 | LLT MergeTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarTy: EltTy); |
2095 | if (MergeTy == MRI.getType(Reg: MI.getOperand(i: 0).getReg())) { |
2096 | B.buildBuildVector(Res: MI.getOperand(i: 0), Ops); |
2097 | } else { |
2098 | auto Vec = B.buildBuildVector(Res: MergeTy, Ops); |
2099 | MRI.setRegBank(Reg: Vec->getOperand(i: 0).getReg(), RegBank: DstBank); |
2100 | B.buildBitcast(Dst: MI.getOperand(i: 0).getReg(), Src: Vec); |
2101 | } |
2102 | |
2103 | MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank); |
2104 | MI.eraseFromParent(); |
2105 | |
2106 | return true; |
2107 | } |
2108 | |
2109 | // Break s_mul_u64 into 32-bit vector operations. |
2110 | void AMDGPURegisterBankInfo::applyMappingSMULU64( |
2111 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
2112 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2113 | SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1)); |
2114 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
2115 | |
2116 | // All inputs are SGPRs, nothing special to do. |
2117 | if (DefRegs.empty()) { |
2118 | assert(Src0Regs.empty() && Src1Regs.empty()); |
2119 | applyDefaultMapping(OpdMapper); |
2120 | return; |
2121 | } |
2122 | |
2123 | assert(DefRegs.size() == 2); |
2124 | assert(Src0Regs.size() == Src1Regs.size() && |
2125 | (Src0Regs.empty() || Src0Regs.size() == 2)); |
2126 | |
2127 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
2128 | MachineInstr &MI = OpdMapper.getMI(); |
2129 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2130 | LLT HalfTy = LLT::scalar(SizeInBits: 32); |
2131 | |
2132 | // Depending on where the source registers came from, the generic code may |
2133 | // have decided to split the inputs already or not. If not, we still need to |
2134 | // extract the values. |
2135 | |
2136 | if (Src0Regs.empty()) |
2137 | split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg()); |
2138 | else |
2139 | setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy); |
2140 | |
2141 | if (Src1Regs.empty()) |
2142 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
2143 | else |
2144 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
2145 | |
2146 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
2147 | |
2148 | // The multiplication is done as follows: |
2149 | // |
2150 | // Op1H Op1L |
2151 | // * Op0H Op0L |
2152 | // -------------------- |
2153 | // Op1H*Op0L Op1L*Op0L |
2154 | // + Op1H*Op0H Op1L*Op0H |
2155 | // ----------------------------------------- |
2156 | // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L |
2157 | // |
2158 | // We drop Op1H*Op0H because the result of the multiplication is a 64-bit |
2159 | // value and that would overflow. |
2160 | // The low 32-bit value is Op1L*Op0L. |
2161 | // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from |
2162 | // Op1L*Op0L). |
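  //
  // For example (a sketch): with Op0 = 0x00000001'00000002 and
  // Op1 = 0x00000003'00000004, the low half is 2 * 4 = 8 and the high half is
  // umulh(2, 4) + 3 * 2 + 4 * 1 = 10, giving 0x0000000A'00000008.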
2163 | |
2164 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); |
2165 | |
2166 | Register Hi = B.buildUMulH(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[0]).getReg(Idx: 0); |
2167 | Register MulLoHi = B.buildMul(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[1]).getReg(Idx: 0); |
2168 | Register Add = B.buildAdd(Dst: HalfTy, Src0: Hi, Src1: MulLoHi).getReg(Idx: 0); |
2169 | Register MulHiLo = B.buildMul(Dst: HalfTy, Src0: Src0Regs[1], Src1: Src1Regs[0]).getReg(Idx: 0); |
2170 | B.buildAdd(Dst: DefRegs[1], Src0: Add, Src1: MulHiLo); |
2171 | B.buildMul(Dst: DefRegs[0], Src0: Src0Regs[0], Src1: Src1Regs[0]); |
2172 | |
2173 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
2174 | MI.eraseFromParent(); |
2175 | } |
2176 | |
2177 | void AMDGPURegisterBankInfo::applyMappingImpl( |
2178 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
2179 | MachineInstr &MI = OpdMapper.getMI(); |
2180 | B.setInstrAndDebugLoc(MI); |
2181 | unsigned Opc = MI.getOpcode(); |
2182 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
2183 | switch (Opc) { |
2184 | case AMDGPU::G_CONSTANT: |
2185 | case AMDGPU::G_IMPLICIT_DEF: { |
2186 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2187 | LLT DstTy = MRI.getType(Reg: DstReg); |
2188 | if (DstTy != LLT::scalar(SizeInBits: 1)) |
2189 | break; |
2190 | |
2191 | const RegisterBank *DstBank = |
2192 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2193 | if (DstBank == &AMDGPU::VCCRegBank) |
2194 | break; |
2195 | SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2196 | if (DefRegs.empty()) |
2197 | DefRegs.push_back(Elt: DstReg); |
2198 | |
2199 | B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator()); |
2200 | |
2201 | Register NewDstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32)); |
2202 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); |
2203 | |
2204 | MI.getOperand(i: 0).setReg(NewDstReg); |
2205 | if (Opc != AMDGPU::G_IMPLICIT_DEF) { |
2206 | uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue(); |
2207 | MI.getOperand(i: 1).setCImm( |
2208 | ConstantInt::get(Ty: IntegerType::getInt32Ty(C&: Ctx), V: ConstVal)); |
2209 | } |
2210 | |
2211 | MRI.setRegBank(Reg: NewDstReg, RegBank: *DstBank); |
2212 | B.buildTrunc(Res: DefRegs[0], Op: NewDstReg); |
2213 | return; |
2214 | } |
2215 | case AMDGPU::G_PHI: { |
2216 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2217 | LLT DstTy = MRI.getType(Reg: DstReg); |
2218 | if (DstTy != LLT::scalar(SizeInBits: 1)) |
2219 | break; |
2220 | |
2221 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2222 | const RegisterBank *DstBank = |
2223 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2224 | if (DstBank == &AMDGPU::VCCRegBank) { |
2225 | applyDefaultMapping(OpdMapper); |
2226 | // The standard handling only considers the result register bank for |
2227 | // phis. For VCC, blindly inserting a copy when the phi is lowered will |
2228 | // produce an invalid copy. We can only copy with some kind of compare to |
2229 | // get a vector boolean result. Insert a register bank copy that will be |
2230 | // correctly lowered to a compare. |
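      //
      // For example (a sketch): an incoming %src:sgpr(s1) value gets
      //   %copy:vcc(s1) = COPY %src:sgpr(s1)
      // at the end of its predecessor block, and that copy is later lowered
      // to a compare that produces a lane mask.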
2231 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
2232 | Register SrcReg = MI.getOperand(i: I).getReg(); |
2233 | const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); |
2234 | |
2235 | if (SrcBank != &AMDGPU::VCCRegBank) { |
2236 | MachineBasicBlock *SrcMBB = MI.getOperand(i: I + 1).getMBB(); |
2237 | B.setInsertPt(MBB&: *SrcMBB, II: SrcMBB->getFirstTerminator()); |
2238 | |
2239 | auto Copy = B.buildCopy(Res: LLT::scalar(SizeInBits: 1), Op: SrcReg); |
2240 | MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); |
2241 | MI.getOperand(i: I).setReg(Copy.getReg(Idx: 0)); |
2242 | } |
2243 | } |
2244 | |
2245 | return; |
2246 | } |
2247 | |
2248 | // Phi handling is strange and only considers the bank of the destination. |
2249 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 0); |
2250 | |
2251 | // Promote SGPR/VGPR booleans to s32 |
2252 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
2253 | B.setInsertPt(MBB&: B.getMBB(), II: MI); |
2254 | LegalizerHelper Helper(B.getMF(), ApplyBank, B); |
2255 | |
2256 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized) |
2257 | llvm_unreachable("widen scalar should have succeeded" ); |
2258 | |
2259 | return; |
2260 | } |
2261 | case AMDGPU::G_FCMP: |
2262 | if (!Subtarget.hasSALUFloatInsts()) |
2263 | break; |
    [[fallthrough]];
2265 | case AMDGPU::G_ICMP: |
2266 | case AMDGPU::G_UADDO: |
2267 | case AMDGPU::G_USUBO: |
2268 | case AMDGPU::G_UADDE: |
2269 | case AMDGPU::G_SADDE: |
2270 | case AMDGPU::G_USUBE: |
2271 | case AMDGPU::G_SSUBE: { |
2272 | unsigned BoolDstOp = |
2273 | (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1; |
2274 | Register DstReg = MI.getOperand(i: BoolDstOp).getReg(); |
2275 | |
2276 | const RegisterBank *DstBank = |
2277 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2278 | if (DstBank != &AMDGPU::SGPRRegBank) |
2279 | break; |
2280 | |
2281 | const bool HasCarryIn = MI.getNumOperands() == 5; |
2282 | |
2283 | // If this is a scalar compare, promote the result to s32, as the selection |
2284 | // will end up using a copy to a 32-bit vreg. |
2285 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2286 | Register NewDstReg = MRI.createGenericVirtualRegister(Ty: S32); |
2287 | MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); |
2288 | MI.getOperand(i: BoolDstOp).setReg(NewDstReg); |
2289 | |
2290 | if (HasCarryIn) { |
2291 | Register NewSrcReg = MRI.createGenericVirtualRegister(Ty: S32); |
2292 | MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); |
2293 | B.buildZExt(Res: NewSrcReg, Op: MI.getOperand(i: 4).getReg()); |
2294 | MI.getOperand(i: 4).setReg(NewSrcReg); |
2295 | } |
2296 | |
2297 | MachineBasicBlock *MBB = MI.getParent(); |
2298 | B.setInsertPt(MBB&: *MBB, II: std::next(x: MI.getIterator())); |
2299 | |
2300 | // If we had a constrained VCC result register, a copy was inserted to VCC |
2301 | // from SGPR. |
2302 | SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2303 | if (DefRegs.empty()) |
2304 | DefRegs.push_back(Elt: DstReg); |
2305 | B.buildTrunc(Res: DefRegs[0], Op: NewDstReg); |
2306 | return; |
2307 | } |
2308 | case AMDGPU::G_SELECT: { |
2309 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2310 | LLT DstTy = MRI.getType(Reg: DstReg); |
2311 | |
2312 | SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(OpIdx: 1)); |
2313 | if (CondRegs.empty()) |
2314 | CondRegs.push_back(Elt: MI.getOperand(i: 1).getReg()); |
2315 | else { |
2316 | assert(CondRegs.size() == 1); |
2317 | } |
2318 | |
2319 | const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); |
2320 | if (CondBank == &AMDGPU::SGPRRegBank) { |
2321 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2322 | Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32); |
2323 | MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); |
2324 | |
2325 | MI.getOperand(i: 1).setReg(NewCondReg); |
2326 | B.buildZExt(Res: NewCondReg, Op: CondRegs[0]); |
2327 | } |
2328 | |
2329 | if (DstTy.getSizeInBits() != 64) |
2330 | break; |
2331 | |
2332 | LLT HalfTy = getHalfSizedType(Ty: DstTy); |
2333 | |
2334 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2335 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
2336 | SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(OpIdx: 3)); |
2337 | |
2338 | // All inputs are SGPRs, nothing special to do. |
2339 | if (DefRegs.empty()) { |
2340 | assert(Src1Regs.empty() && Src2Regs.empty()); |
2341 | break; |
2342 | } |
2343 | |
2344 | if (Src1Regs.empty()) |
2345 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
2346 | else { |
2347 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
2348 | } |
2349 | |
2350 | if (Src2Regs.empty()) |
2351 | split64BitValueForMapping(B, Regs&: Src2Regs, HalfTy, Reg: MI.getOperand(i: 3).getReg()); |
2352 | else |
2353 | setRegsToType(MRI, Regs: Src2Regs, NewTy: HalfTy); |
2354 | |
2355 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
2356 | |
2357 | B.buildSelect(Res: DefRegs[0], Tst: CondRegs[0], Op0: Src1Regs[0], Op1: Src2Regs[0]); |
2358 | B.buildSelect(Res: DefRegs[1], Tst: CondRegs[0], Op0: Src1Regs[1], Op1: Src2Regs[1]); |
2359 | |
2360 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
2361 | MI.eraseFromParent(); |
2362 | return; |
2363 | } |
2364 | case AMDGPU::G_BRCOND: { |
2365 | Register CondReg = MI.getOperand(i: 0).getReg(); |
2366 | // FIXME: Should use legalizer helper, but should change bool ext type. |
2367 | const RegisterBank *CondBank = |
2368 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2369 | |
2370 | if (CondBank == &AMDGPU::SGPRRegBank) { |
2371 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2372 | Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32); |
2373 | MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); |
2374 | |
2375 | MI.getOperand(i: 0).setReg(NewCondReg); |
2376 | B.buildZExt(Res: NewCondReg, Op: CondReg); |
2377 | return; |
2378 | } |
2379 | |
2380 | break; |
2381 | } |
2382 | case AMDGPU::G_AND: |
2383 | case AMDGPU::G_OR: |
2384 | case AMDGPU::G_XOR: { |
2385 | // 64-bit and is only available on the SALU, so split into 2 32-bit ops if |
2386 | // there is a VGPR input. |
2387 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2388 | LLT DstTy = MRI.getType(Reg: DstReg); |
2389 | |
2390 | if (DstTy.getSizeInBits() == 1) { |
2391 | const RegisterBank *DstBank = |
2392 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2393 | if (DstBank == &AMDGPU::VCCRegBank) |
2394 | break; |
2395 | |
2396 | MachineFunction *MF = MI.getParent()->getParent(); |
2397 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
2398 | LegalizerHelper Helper(*MF, ApplyBank, B); |
2399 | |
2400 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: LLT::scalar(SizeInBits: 32)) != |
2401 | LegalizerHelper::Legalized) |
2402 | llvm_unreachable("widen scalar should have succeeded" ); |
2403 | return; |
2404 | } |
2405 | |
2406 | if (DstTy.getSizeInBits() != 64) |
2407 | break; |
2408 | |
2409 | LLT HalfTy = getHalfSizedType(Ty: DstTy); |
2410 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2411 | SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1)); |
2412 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
2413 | |
2414 | // All inputs are SGPRs, nothing special to do. |
2415 | if (DefRegs.empty()) { |
2416 | assert(Src0Regs.empty() && Src1Regs.empty()); |
2417 | break; |
2418 | } |
2419 | |
2420 | assert(DefRegs.size() == 2); |
2421 | assert(Src0Regs.size() == Src1Regs.size() && |
2422 | (Src0Regs.empty() || Src0Regs.size() == 2)); |
2423 | |
2424 | // Depending on where the source registers came from, the generic code may |
2425 | // have decided to split the inputs already or not. If not, we still need to |
2426 | // extract the values. |
2427 | |
2428 | if (Src0Regs.empty()) |
2429 | split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg()); |
2430 | else |
2431 | setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy); |
2432 | |
2433 | if (Src1Regs.empty()) |
2434 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
2435 | else |
2436 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
2437 | |
2438 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
2439 | |
2440 | B.buildInstr(Opc, DstOps: {DefRegs[0]}, SrcOps: {Src0Regs[0], Src1Regs[0]}); |
2441 | B.buildInstr(Opc, DstOps: {DefRegs[1]}, SrcOps: {Src0Regs[1], Src1Regs[1]}); |
2442 | |
2443 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
2444 | MI.eraseFromParent(); |
2445 | return; |
2446 | } |
2447 | case AMDGPU::G_ABS: { |
2448 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2449 | const RegisterBank *SrcBank = MRI.getRegBankOrNull(Reg: SrcReg); |
2450 | |
2451 | // There is no VALU abs instruction so we need to replace it with a sub and |
2452 | // max combination. |
2453 | if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { |
2454 | MachineFunction *MF = MI.getParent()->getParent(); |
2455 | ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank); |
2456 | LegalizerHelper Helper(*MF, Apply, B); |
2457 | |
2458 | if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) |
2459 | llvm_unreachable("lowerAbsToMaxNeg should have succeeded" ); |
2460 | return; |
2461 | } |
2462 | [[fallthrough]]; |
2463 | } |
2464 | case AMDGPU::G_ADD: |
2465 | case AMDGPU::G_SUB: |
2466 | case AMDGPU::G_MUL: |
2467 | case AMDGPU::G_SHL: |
2468 | case AMDGPU::G_LSHR: |
2469 | case AMDGPU::G_ASHR: |
2470 | case AMDGPU::G_SMIN: |
2471 | case AMDGPU::G_SMAX: |
2472 | case AMDGPU::G_UMIN: |
2473 | case AMDGPU::G_UMAX: { |
2474 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2475 | LLT DstTy = MRI.getType(Reg: DstReg); |
2476 | |
    // Special case for s_mul_u64. There is no vector equivalent of s_mul_u64.
    // Hence, we have to break down s_mul_u64 into 32-bit vector
    // multiplications.
2480 | if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { |
2481 | applyMappingSMULU64(B, OpdMapper); |
2482 | return; |
2483 | } |
2484 | |
2485 | // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. |
2486 | // Packed 16-bit operations need to be scalarized and promoted. |
2487 | if (DstTy != LLT::scalar(SizeInBits: 16) && DstTy != LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) |
2488 | break; |
2489 | |
2490 | const RegisterBank *DstBank = |
2491 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2492 | if (DstBank == &AMDGPU::VGPRRegBank) |
2493 | break; |
2494 | |
2495 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2496 | MachineBasicBlock *MBB = MI.getParent(); |
2497 | MachineFunction *MF = MBB->getParent(); |
2498 | ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank); |
2499 | |
2500 | if (DstTy.isVector() && Opc == AMDGPU::G_ABS) { |
2501 | Register WideSrcLo, WideSrcHi; |
2502 | |
2503 | std::tie(args&: WideSrcLo, args&: WideSrcHi) = |
2504 | unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: TargetOpcode::G_SEXT); |
2505 | auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo}); |
2506 | auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi}); |
2507 | B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(0), Hi.getReg(0)}); |
2508 | MI.eraseFromParent(); |
2509 | return; |
2510 | } |
2511 | |
2512 | if (DstTy.isVector()) { |
2513 | Register WideSrc0Lo, WideSrc0Hi; |
2514 | Register WideSrc1Lo, WideSrc1Hi; |
2515 | |
2516 | unsigned ExtendOp = getExtendOp(Opc: MI.getOpcode()); |
2517 | std::tie(args&: WideSrc0Lo, args&: WideSrc0Hi) |
2518 | = unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: ExtendOp); |
2519 | std::tie(args&: WideSrc1Lo, args&: WideSrc1Hi) |
2520 | = unpackV2S16ToS32(B, Src: MI.getOperand(i: 2).getReg(), ExtOpcode: ExtendOp); |
2521 | auto Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Lo, WideSrc1Lo}); |
2522 | auto Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Hi, WideSrc1Hi}); |
2523 | B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)}); |
2524 | MI.eraseFromParent(); |
2525 | } else { |
2526 | LegalizerHelper Helper(*MF, ApplySALU, B); |
2527 | |
2528 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized) |
2529 | llvm_unreachable("widen scalar should have succeeded" ); |
2530 | |
2531 | // FIXME: s16 shift amounts should be legal. |
2532 | if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || |
2533 | Opc == AMDGPU::G_ASHR) { |
2534 | B.setInsertPt(MBB&: *MBB, II: MI.getIterator()); |
2535 | if (Helper.widenScalar(MI, TypeIdx: 1, WideTy: S32) != LegalizerHelper::Legalized) |
2536 | llvm_unreachable("widen scalar should have succeeded" ); |
2537 | } |
2538 | } |
2539 | |
2540 | return; |
2541 | } |
2542 | case AMDGPU::G_AMDGPU_S_MUL_I64_I32: |
2543 | case AMDGPU::G_AMDGPU_S_MUL_U64_U32: { |
2544 | // This is a special case for s_mul_u64. We use |
2545 | // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation |
2546 | // where the 33 higher bits are sign-extended and |
2547 | // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation |
2548 | // where the 32 higher bits are zero-extended. In case scalar registers are |
2549 | // selected, both opcodes are lowered as s_mul_u64. If the vector registers |
2550 | // are selected, then G_AMDGPU_S_MUL_I64_I32 and |
2551 | // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction. |
2552 | |
2553 | // Insert basic copies. |
2554 | applyDefaultMapping(OpdMapper); |
2555 | |
2556 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2557 | Register SrcReg0 = MI.getOperand(i: 1).getReg(); |
2558 | Register SrcReg1 = MI.getOperand(i: 2).getReg(); |
2559 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2560 | const LLT S64 = LLT::scalar(SizeInBits: 64); |
    assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
                                         "that handles only 64-bit operands.");
2563 | const RegisterBank *DstBank = |
2564 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2565 | |
2566 | // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 |
2567 | // with s_mul_u64 operation. |
2568 | if (DstBank == &AMDGPU::SGPRRegBank) { |
2569 | MI.setDesc(TII->get(AMDGPU::S_MUL_U64)); |
2570 | MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass); |
2571 | MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass); |
2572 | MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass); |
2573 | return; |
2574 | } |
2575 | |
2576 | // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 |
2577 | // with a vector mad. |
2578 | assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank && |
2579 | "The destination operand should be in vector registers." ); |
2580 | |
2581 | DebugLoc DL = MI.getDebugLoc(); |
2582 | |
2583 | // Extract the lower subregister from the first operand. |
2584 | Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
2585 | MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass); |
2586 | MRI.setType(VReg: Op0L, Ty: S32); |
2587 | B.buildTrunc(Res: Op0L, Op: SrcReg0); |
2588 | |
2589 | // Extract the lower subregister from the second operand. |
2590 | Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
2591 | MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass); |
2592 | MRI.setType(VReg: Op1L, Ty: S32); |
2593 | B.buildTrunc(Res: Op1L, Op: SrcReg1); |
2594 | |
2595 | unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32 |
2596 | ? AMDGPU::G_AMDGPU_MAD_U64_U32 |
2597 | : AMDGPU::G_AMDGPU_MAD_I64_I32; |
2598 | |
2600 | Register Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0); |
2601 | MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass); |
2602 | Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); |
2603 | MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass); |
2604 | B.buildInstr(Opc: NewOpc, DstOps: {DstReg, CarryOut}, SrcOps: {Op0L, Op1L, Zero64}); |
2605 | MI.eraseFromParent(); |
2606 | return; |
2607 | } |
2608 | case AMDGPU::G_SEXT_INREG: { |
2609 | SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1)); |
2610 | if (SrcRegs.empty()) |
2611 | break; // Nothing to repair |
2612 | |
2613 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2614 | ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); |
2615 | |
2616 | // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs |
2617 | // we would need to further expand, and doesn't let us directly set the |
2618 | // result registers. |
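    //
    // For example (a sketch): G_SEXT_INREG %x(s64), 16 on VGPRs becomes
    //   lo = G_SEXT_INREG (G_FREEZE x.lo), 16
    //   hi = G_ASHR lo, 31
    // while an amount > 32 keeps the low half unchanged and sign-extends
    // within the high half only.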
2619 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2620 | |
2621 | int Amt = MI.getOperand(i: 2).getImm(); |
2622 | if (Amt <= 32) { |
2623 | // Downstream users have expectations for the high bit behavior, so freeze |
2624 | // incoming undefined bits. |
2625 | if (Amt == 32) { |
2626 | // The low bits are unchanged. |
2627 | B.buildFreeze(Dst: DstRegs[0], Src: SrcRegs[0]); |
2628 | } else { |
2629 | auto Freeze = B.buildFreeze(Dst: S32, Src: SrcRegs[0]); |
2630 | // Extend in the low bits and propagate the sign bit to the high half. |
2631 | B.buildSExtInReg(Res: DstRegs[0], Op: Freeze, ImmOp: Amt); |
2632 | } |
2633 | |
2634 | B.buildAShr(Dst: DstRegs[1], Src0: DstRegs[0], Src1: B.buildConstant(Res: S32, Val: 31)); |
2635 | } else { |
2636 | // The low bits are unchanged, and extend in the high bits. |
2637 | // No freeze required |
2638 | B.buildCopy(Res: DstRegs[0], Op: SrcRegs[0]); |
2639 | B.buildSExtInReg(Res: DstRegs[1], Op: DstRegs[0], ImmOp: Amt - 32); |
2640 | } |
2641 | |
2642 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2643 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
2644 | MI.eraseFromParent(); |
2645 | return; |
2646 | } |
2647 | case AMDGPU::G_CTPOP: |
2648 | case AMDGPU::G_BITREVERSE: { |
2649 | const RegisterBank *DstBank = |
2650 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2651 | if (DstBank == &AMDGPU::SGPRRegBank) |
2652 | break; |
2653 | |
2654 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2655 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2656 | LLT Ty = MRI.getType(Reg: SrcReg); |
2657 | if (Ty == S32) |
2658 | break; |
2659 | |
2660 | ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); |
2661 | |
2662 | MachineFunction &MF = B.getMF(); |
2663 | LegalizerHelper Helper(MF, ApplyVALU, B); |
2664 | |
if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
  llvm_unreachable("narrowScalar should have succeeded");
2667 | return; |
2668 | } |
2669 | case AMDGPU::G_AMDGPU_FFBH_U32: |
2670 | case AMDGPU::G_AMDGPU_FFBL_B32: |
2671 | case AMDGPU::G_CTLZ_ZERO_UNDEF: |
2672 | case AMDGPU::G_CTTZ_ZERO_UNDEF: { |
2673 | const RegisterBank *DstBank = |
2674 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2675 | if (DstBank == &AMDGPU::SGPRRegBank) |
2676 | break; |
2677 | |
2678 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2679 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2680 | LLT Ty = MRI.getType(Reg: SrcReg); |
2681 | if (Ty == S32) |
2682 | break; |
2683 | |
2684 | // We can narrow this more efficiently than Helper can by using ffbh/ffbl |
2685 | // which return -1 when the input is zero: |
2686 | // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) |
2687 | // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) |
2688 | // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) |
// (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
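// As a quick check with a made-up value: for hi:lo = 0x00000000:0x00010000,
// ffbh(hi) = ffbh(0) = -1 (0xffffffff) and ffbh(lo) = 15, so
// ctlz_zero_undef = umin(0xffffffff, 15 + 32) = 47, matching the 47 leading
// zeros of the full 64-bit value.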
2690 | ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); |
2691 | SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1)); |
2692 | unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF |
2693 | ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 |
2694 | : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF |
2695 | ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 |
2696 | : Opc; |
2697 | unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; |
2698 | auto X = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx]}); |
2699 | auto Y = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx ^ 1]}); |
2700 | unsigned AddOpc = |
2701 | Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF |
2702 | ? AMDGPU::G_ADD |
2703 | : AMDGPU::G_UADDSAT; |
2704 | Y = B.buildInstr(Opc: AddOpc, DstOps: {S32}, SrcOps: {Y, B.buildConstant(Res: S32, Val: 32)}); |
2705 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2706 | B.buildUMin(Dst: DstReg, Src0: X, Src1: Y); |
2707 | MI.eraseFromParent(); |
2708 | return; |
2709 | } |
2710 | case AMDGPU::G_SEXT: |
2711 | case AMDGPU::G_ZEXT: |
2712 | case AMDGPU::G_ANYEXT: { |
2713 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2714 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
2715 | const bool Signed = Opc == AMDGPU::G_SEXT; |
2716 | |
2717 | assert(OpdMapper.getVRegs(1).empty()); |
2718 | |
2719 | const RegisterBank *SrcBank = |
2720 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
2721 | |
2722 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2723 | LLT DstTy = MRI.getType(Reg: DstReg); |
2724 | if (DstTy.isScalar() && |
2725 | SrcBank != &AMDGPU::SGPRRegBank && |
2726 | SrcBank != &AMDGPU::VCCRegBank && |
// FIXME: Should handle any type that rounds to s64 when irregular
// breakdowns are supported.
2729 | DstTy.getSizeInBits() == 64 && |
2730 | SrcTy.getSizeInBits() <= 32) { |
2731 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2732 | |
2733 | // Extend to 32-bit, and then extend the low half. |
2734 | if (Signed) { |
2735 | // TODO: Should really be buildSExtOrCopy |
2736 | B.buildSExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
2737 | } else if (Opc == AMDGPU::G_ZEXT) { |
2738 | B.buildZExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
2739 | } else { |
2740 | B.buildAnyExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
2741 | } |
2742 | |
2743 | extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank); |
2744 | MRI.setRegBank(Reg: DstReg, RegBank: *SrcBank); |
2745 | MI.eraseFromParent(); |
2746 | return; |
2747 | } |
2748 | |
2749 | if (SrcTy != LLT::scalar(SizeInBits: 1)) |
2750 | return; |
2751 | |
// It is not legal for a legalization artifact to have a VCC source. Rather
// than introducing a copy, directly insert the select that such a copy
// would have been lowered to during instruction selection.
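// For example (illustrative MIR, register names made up):
//   %d:_(s32) = G_ZEXT %c:vcc(s1)
// becomes
//   %d:vgpr(s32) = G_SELECT %c:vcc(s1), 1, 0
// with -1 as the true value for G_SEXT instead.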
2755 | if (SrcBank == &AMDGPU::VCCRegBank) { |
2756 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2757 | |
2758 | const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; |
2759 | |
2760 | unsigned DstSize = DstTy.getSizeInBits(); |
2761 | // 64-bit select is SGPR only |
2762 | const bool UseSel64 = DstSize > 32 && |
2763 | SrcBank->getID() == AMDGPU::SGPRRegBankID; |
2764 | |
2765 | // TODO: Should s16 select be legal? |
2766 | LLT SelType = UseSel64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32); |
2767 | auto True = B.buildConstant(Res: SelType, Val: Signed ? -1 : 1); |
2768 | auto False = B.buildConstant(Res: SelType, Val: 0); |
2769 | |
2770 | MRI.setRegBank(Reg: True.getReg(Idx: 0), RegBank: *DstBank); |
2771 | MRI.setRegBank(Reg: False.getReg(Idx: 0), RegBank: *DstBank); |
2772 | MRI.setRegBank(Reg: DstReg, RegBank: *DstBank); |
2773 | |
2774 | if (DstSize > 32) { |
2775 | B.buildSelect(Res: DefRegs[0], Tst: SrcReg, Op0: True, Op1: False); |
2776 | extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank, IsBooleanSrc: true); |
2777 | } else if (DstSize < 32) { |
2778 | auto Sel = B.buildSelect(Res: SelType, Tst: SrcReg, Op0: True, Op1: False); |
2779 | MRI.setRegBank(Reg: Sel.getReg(Idx: 0), RegBank: *DstBank); |
2780 | B.buildTrunc(Res: DstReg, Op: Sel); |
2781 | } else { |
2782 | B.buildSelect(Res: DstReg, Tst: SrcReg, Op0: True, Op1: False); |
2783 | } |
2784 | |
2785 | MI.eraseFromParent(); |
2786 | return; |
2787 | } |
2788 | |
2789 | break; |
2790 | } |
2791 | case AMDGPU::G_EXTRACT_VECTOR_ELT: { |
2792 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2793 | |
2794 | assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); |
2795 | |
2796 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2797 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2798 | |
2799 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2800 | LLT DstTy = MRI.getType(Reg: DstReg); |
2801 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
2802 | |
2803 | if (foldExtractEltToCmpSelect(B, MI, OpdMapper)) |
2804 | return; |
2805 | |
2806 | const ValueMapping &DstMapping |
2807 | = OpdMapper.getInstrMapping().getOperandMapping(i: 0); |
2808 | const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; |
2809 | const RegisterBank *SrcBank = |
2810 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
2811 | const RegisterBank *IdxBank = |
2812 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
2813 | |
2814 | Register BaseIdxReg; |
2815 | unsigned ConstOffset; |
2816 | std::tie(args&: BaseIdxReg, args&: ConstOffset) = |
2817 | AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 2).getReg()); |
2818 | |
// See if the index is an add of a constant, which would be foldable by
// moving only the base register of the index into the waterfall loop, if
// one is needed. This essentially reassociates the add of a constant with
// the readfirstlane.
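// For example (hypothetical index), extracting element (%idx + 2) only
// requires a readfirstlane of %idx; the "+ 2" is re-added to the uniform
// value inside the loop.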
2823 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && |
2824 | ConstOffset > 0 && |
2825 | ConstOffset < SrcTy.getNumElements(); |
2826 | |
2827 | // Move the base register. We'll re-insert the add later. |
2828 | if (ShouldMoveIndexIntoLoop) |
2829 | MI.getOperand(i: 2).setReg(BaseIdxReg); |
2830 | |
2831 | // If this is a VGPR result only because the index was a VGPR result, the |
2832 | // actual indexing will be done on the SGPR source vector, which will |
2833 | // produce a scalar result. We need to copy to the VGPR result inside the |
2834 | // waterfall loop. |
2835 | const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && |
2836 | SrcBank == &AMDGPU::SGPRRegBank; |
2837 | if (DstRegs.empty()) { |
2838 | applyDefaultMapping(OpdMapper); |
2839 | |
2840 | executeInWaterfallLoop(B, MI, OpIndices: {2}); |
2841 | |
2842 | if (NeedCopyToVGPR) { |
2843 | // We don't want a phi for this temporary reg. |
2844 | Register TmpReg = MRI.createGenericVirtualRegister(Ty: DstTy); |
2845 | MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); |
2846 | MI.getOperand(i: 0).setReg(TmpReg); |
2847 | B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator()); |
2848 | |
2849 | // Use a v_mov_b32 here to make the exec dependency explicit. |
2850 | buildVCopy(B, DstReg, SrcReg: TmpReg); |
2851 | } |
2852 | |
2853 | // Re-insert the constant offset add inside the waterfall loop. |
2854 | if (ShouldMoveIndexIntoLoop) |
2855 | reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 2, ConstOffset); |
2856 | |
2857 | return; |
2858 | } |
2859 | |
2860 | assert(DstTy.getSizeInBits() == 64); |
2861 | |
2862 | LLT Vec32 = LLT::fixed_vector(NumElements: 2 * SrcTy.getNumElements(), ScalarSizeInBits: 32); |
2863 | |
2864 | auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg); |
2865 | auto One = B.buildConstant(Res: S32, Val: 1); |
2866 | |
2867 | MachineBasicBlock::iterator MII = MI.getIterator(); |
2868 | |
2869 | // Split the vector index into 32-bit pieces. Prepare to move all of the |
2870 | // new instructions into a waterfall loop if necessary. |
2871 | // |
2872 | // Don't put the bitcast or constant in the loop. |
2873 | MachineInstrSpan Span(MII, &B.getMBB()); |
2874 | |
2875 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). |
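// E.g. for element 3 of a <4 x s64> viewed as <8 x s32>, the pieces are
// 32-bit elements 6 and 7 (illustrative values).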
2876 | auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One); |
2877 | auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One); |
2878 | |
auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2881 | |
2882 | MRI.setRegBank(Reg: DstReg, RegBank: *DstBank); |
2883 | MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank); |
2884 | MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); |
2885 | MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); |
2886 | MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); |
2887 | |
2888 | SmallSet<Register, 4> OpsToWaterfall; |
2889 | if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { |
2890 | MI.eraseFromParent(); |
2891 | return; |
2892 | } |
2893 | |
2894 | // Remove the original instruction to avoid potentially confusing the |
2895 | // waterfall loop logic. |
2896 | B.setInstr(*Span.begin()); |
2897 | MI.eraseFromParent(); |
2898 | executeInWaterfallLoop(B, make_range(x: Span.begin(), y: Span.end()), |
2899 | OpsToWaterfall); |
2900 | |
2901 | if (NeedCopyToVGPR) { |
2902 | MachineBasicBlock *LoopBB = Extract1->getParent(); |
2903 | Register TmpReg0 = MRI.createGenericVirtualRegister(Ty: S32); |
2904 | Register TmpReg1 = MRI.createGenericVirtualRegister(Ty: S32); |
2905 | MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); |
2906 | MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); |
2907 | |
2908 | Extract0->getOperand(i: 0).setReg(TmpReg0); |
2909 | Extract1->getOperand(i: 0).setReg(TmpReg1); |
2910 | |
2911 | B.setInsertPt(MBB&: *LoopBB, II: ++Extract1->getIterator()); |
2912 | |
2913 | buildVCopy(B, DstReg: DstRegs[0], SrcReg: TmpReg0); |
2914 | buildVCopy(B, DstReg: DstRegs[1], SrcReg: TmpReg1); |
2915 | } |
2916 | |
2917 | if (ShouldMoveIndexIntoLoop) |
2918 | reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset); |
2919 | |
2920 | return; |
2921 | } |
2922 | case AMDGPU::G_INSERT_VECTOR_ELT: { |
2923 | SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2)); |
2924 | |
2925 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2926 | LLT VecTy = MRI.getType(Reg: DstReg); |
2927 | |
2928 | assert(OpdMapper.getVRegs(0).empty()); |
2929 | assert(OpdMapper.getVRegs(3).empty()); |
2930 | |
2931 | if (substituteSimpleCopyRegs(OpdMapper, OpIdx: 1)) |
2932 | MRI.setType(VReg: MI.getOperand(i: 1).getReg(), Ty: VecTy); |
2933 | |
2934 | if (foldInsertEltToCmpSelect(B, MI, OpdMapper)) |
2935 | return; |
2936 | |
2937 | const RegisterBank *IdxBank = |
2938 | OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank; |
2939 | |
2940 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2941 | Register InsReg = MI.getOperand(i: 2).getReg(); |
2942 | LLT InsTy = MRI.getType(Reg: InsReg); |
2943 | (void)InsTy; |
2944 | |
2945 | Register BaseIdxReg; |
2946 | unsigned ConstOffset; |
2947 | std::tie(args&: BaseIdxReg, args&: ConstOffset) = |
2948 | AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 3).getReg()); |
2949 | |
// See if the index is an add of a constant, which would be foldable by
// moving only the base register of the index into the waterfall loop, if
// one is needed. This essentially reassociates the add of a constant with
// the readfirstlane.
2954 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && |
2955 | ConstOffset > 0 && |
2956 | ConstOffset < VecTy.getNumElements(); |
2957 | |
2958 | // Move the base register. We'll re-insert the add later. |
2959 | if (ShouldMoveIndexIntoLoop) |
2960 | MI.getOperand(i: 3).setReg(BaseIdxReg); |
2961 | |
2963 | if (InsRegs.empty()) { |
2964 | executeInWaterfallLoop(B, MI, OpIndices: {3}); |
2965 | |
2966 | // Re-insert the constant offset add inside the waterfall loop. |
2967 | if (ShouldMoveIndexIntoLoop) { |
2968 | reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 3, ConstOffset); |
2969 | } |
2970 | |
2971 | return; |
2972 | } |
2973 | |
2974 | assert(InsTy.getSizeInBits() == 64); |
2975 | |
2976 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2977 | LLT Vec32 = LLT::fixed_vector(NumElements: 2 * VecTy.getNumElements(), ScalarSizeInBits: 32); |
2978 | |
2979 | auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg); |
2980 | auto One = B.buildConstant(Res: S32, Val: 1); |
2981 | |
2982 | // Split the vector index into 32-bit pieces. Prepare to move all of the |
2983 | // new instructions into a waterfall loop if necessary. |
2984 | // |
2985 | // Don't put the bitcast or constant in the loop. |
2986 | MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); |
2987 | |
2988 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). |
2989 | auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One); |
2990 | auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One); |
2991 | |
2992 | auto InsLo = B.buildInsertVectorElement(Res: Vec32, Val: CastSrc, Elt: InsRegs[0], Idx: IdxLo); |
2993 | auto InsHi = B.buildInsertVectorElement(Res: Vec32, Val: InsLo, Elt: InsRegs[1], Idx: IdxHi); |
2994 | |
2995 | const RegisterBank *DstBank = |
2996 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2997 | const RegisterBank *SrcBank = |
2998 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
2999 | const RegisterBank *InsSrcBank = |
3000 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
3001 | |
3002 | MRI.setRegBank(Reg: InsReg, RegBank: *InsSrcBank); |
3003 | MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank); |
3004 | MRI.setRegBank(Reg: InsLo.getReg(Idx: 0), RegBank: *DstBank); |
3005 | MRI.setRegBank(Reg: InsHi.getReg(Idx: 0), RegBank: *DstBank); |
3006 | MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); |
3007 | MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); |
3008 | MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); |
3009 | |
3011 | SmallSet<Register, 4> OpsToWaterfall; |
3012 | if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { |
3013 | B.setInsertPt(MBB&: B.getMBB(), II: MI); |
3014 | B.buildBitcast(Dst: DstReg, Src: InsHi); |
3015 | MI.eraseFromParent(); |
3016 | return; |
3017 | } |
3018 | |
3019 | B.setInstr(*Span.begin()); |
3020 | MI.eraseFromParent(); |
3021 | |
3022 | // Figure out the point after the waterfall loop before mangling the control |
3023 | // flow. |
3024 | executeInWaterfallLoop(B, make_range(x: Span.begin(), y: Span.end()), |
3025 | OpsToWaterfall); |
3026 | |
3027 | // The insertion point is now right after the original instruction. |
3028 | // |
// Keep the bitcast to the original vector type out of the loop. Doing this
// saves an extra phi we don't need inside the loop.
3031 | B.buildBitcast(Dst: DstReg, Src: InsHi); |
3032 | |
3033 | // Re-insert the constant offset add inside the waterfall loop. |
3034 | if (ShouldMoveIndexIntoLoop) |
3035 | reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset); |
3036 | |
3037 | return; |
3038 | } |
3039 | case AMDGPU::G_AMDGPU_BUFFER_LOAD: |
3040 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: |
3041 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: |
3042 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: |
3043 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: |
3044 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: |
3045 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: |
3046 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: |
3047 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: |
3048 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: |
3049 | case AMDGPU::G_AMDGPU_BUFFER_STORE: |
3050 | case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: |
3051 | case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: |
3052 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: |
3053 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: |
3054 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: |
3055 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { |
3056 | applyDefaultMapping(OpdMapper); |
3057 | executeInWaterfallLoop(B, MI, OpIndices: {1, 4}); |
3058 | return; |
3059 | } |
3060 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: |
3061 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: |
3062 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: |
3063 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: |
3064 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: |
3065 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: |
3066 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: |
3067 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: |
3068 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: |
3069 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: |
3070 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: |
3071 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { |
3072 | applyDefaultMapping(OpdMapper); |
3073 | executeInWaterfallLoop(B, MI, OpIndices: {2, 5}); |
3074 | return; |
3075 | } |
3076 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: |
3077 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: |
3078 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: |
3079 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { |
3080 | applyDefaultMapping(OpdMapper); |
3081 | executeInWaterfallLoop(B, MI, OpIndices: {2, 5}); |
3082 | return; |
3083 | } |
3084 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { |
3085 | applyDefaultMapping(OpdMapper); |
3086 | executeInWaterfallLoop(B, MI, OpIndices: {3, 6}); |
3087 | return; |
3088 | } |
3089 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: |
3090 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: |
3091 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: |
3092 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: |
3093 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { |
3094 | applyMappingSBufferLoad(B, OpdMapper); |
3095 | return; |
3096 | } |
3097 | case AMDGPU::G_INTRINSIC: |
3098 | case AMDGPU::G_INTRINSIC_CONVERGENT: { |
3099 | switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) { |
3100 | case Intrinsic::amdgcn_readlane: { |
3101 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 2); |
3102 | |
3103 | assert(OpdMapper.getVRegs(0).empty()); |
3104 | assert(OpdMapper.getVRegs(3).empty()); |
3105 | |
3106 | // Make sure the index is an SGPR. It doesn't make sense to run this in a |
3107 | // waterfall loop, so assume it's a uniform value. |
3108 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index |
3109 | return; |
3110 | } |
3111 | case Intrinsic::amdgcn_writelane: { |
3112 | assert(OpdMapper.getVRegs(0).empty()); |
3113 | assert(OpdMapper.getVRegs(2).empty()); |
3114 | assert(OpdMapper.getVRegs(3).empty()); |
3115 | |
3116 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 4); // VGPR input val |
3117 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Source value |
3118 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index |
3119 | return; |
3120 | } |
3121 | case Intrinsic::amdgcn_interp_p1: |
3122 | case Intrinsic::amdgcn_interp_p2: |
3123 | case Intrinsic::amdgcn_interp_mov: |
3124 | case Intrinsic::amdgcn_interp_p1_f16: |
3125 | case Intrinsic::amdgcn_interp_p2_f16: |
3126 | case Intrinsic::amdgcn_lds_param_load: { |
3127 | applyDefaultMapping(OpdMapper); |
3128 | |
// Readfirstlane for the m0 value, which is always the last operand.
3130 | // FIXME: Should this be a waterfall loop instead? |
3131 | constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index |
3132 | return; |
3133 | } |
3134 | case Intrinsic::amdgcn_interp_inreg_p10: |
3135 | case Intrinsic::amdgcn_interp_inreg_p2: |
3136 | case Intrinsic::amdgcn_interp_inreg_p10_f16: |
3137 | case Intrinsic::amdgcn_interp_inreg_p2_f16: |
3138 | case Intrinsic::amdgcn_interp_p10_rtz_f16: |
3139 | case Intrinsic::amdgcn_interp_p2_rtz_f16: |
3140 | applyDefaultMapping(OpdMapper); |
3141 | return; |
3142 | case Intrinsic::amdgcn_permlane16: |
3143 | case Intrinsic::amdgcn_permlanex16: { |
3144 | // Doing a waterfall loop over these wouldn't make any sense. |
3145 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 2); |
3146 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 3); |
3147 | constrainOpWithReadfirstlane(B, MI, OpIdx: 4); |
3148 | constrainOpWithReadfirstlane(B, MI, OpIdx: 5); |
3149 | return; |
3150 | } |
3151 | case Intrinsic::amdgcn_sbfe: |
3152 | applyMappingBFE(B, OpdMapper, Signed: true); |
3153 | return; |
3154 | case Intrinsic::amdgcn_ubfe: |
3155 | applyMappingBFE(B, OpdMapper, Signed: false); |
3156 | return; |
3157 | case Intrinsic::amdgcn_inverse_ballot: |
3158 | case Intrinsic::amdgcn_s_bitreplicate: |
3159 | case Intrinsic::amdgcn_s_quadmask: |
3160 | case Intrinsic::amdgcn_s_wqm: |
3161 | applyDefaultMapping(OpdMapper); |
3162 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Mask |
3163 | return; |
3164 | case Intrinsic::amdgcn_ballot: |
3165 | // Use default handling and insert copy to vcc source. |
3166 | break; |
3167 | } |
3168 | break; |
3169 | } |
3170 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: |
3171 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: |
3172 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: |
3173 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { |
3174 | const AMDGPU::RsrcIntrinsic *RSrcIntrin = |
3175 | AMDGPU::lookupRsrcIntrinsic(Intr: AMDGPU::getIntrinsicID(I: MI)); |
3176 | assert(RSrcIntrin && RSrcIntrin->IsImage); |
3177 | // Non-images can have complications from operands that allow both SGPR |
3178 | // and VGPR. For now it's too complicated to figure out the final opcode |
3179 | // to derive the register bank from the MCInstrDesc. |
3180 | applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg); |
3181 | return; |
3182 | } |
3183 | case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { |
3184 | unsigned N = MI.getNumExplicitOperands() - 2; |
3185 | applyDefaultMapping(OpdMapper); |
3186 | executeInWaterfallLoop(B, MI, OpIndices: {N}); |
3187 | return; |
3188 | } |
3189 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
3190 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { |
3191 | auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID(); |
3192 | switch (IntrID) { |
3193 | case Intrinsic::amdgcn_ds_ordered_add: |
3194 | case Intrinsic::amdgcn_ds_ordered_swap: { |
3195 | // This is only allowed to execute with 1 lane, so readfirstlane is safe. |
3196 | assert(OpdMapper.getVRegs(0).empty()); |
3197 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 3); |
3198 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3199 | return; |
3200 | } |
3201 | case Intrinsic::amdgcn_ds_gws_init: |
3202 | case Intrinsic::amdgcn_ds_gws_barrier: |
3203 | case Intrinsic::amdgcn_ds_gws_sema_br: { |
// Only the first lane executes, so readfirstlane is safe.
3205 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 1); |
3206 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3207 | return; |
3208 | } |
3209 | case Intrinsic::amdgcn_ds_gws_sema_v: |
3210 | case Intrinsic::amdgcn_ds_gws_sema_p: |
3211 | case Intrinsic::amdgcn_ds_gws_sema_release_all: { |
// Only the first lane executes, so readfirstlane is safe.
3213 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0 |
3214 | return; |
3215 | } |
3216 | case Intrinsic::amdgcn_ds_append: |
3217 | case Intrinsic::amdgcn_ds_consume: { |
3218 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3219 | return; |
3220 | } |
3221 | case Intrinsic::amdgcn_s_sendmsg: |
3222 | case Intrinsic::amdgcn_s_sendmsghalt: { |
3223 | // FIXME: Should this use a waterfall loop? |
3224 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3225 | return; |
3226 | } |
3227 | case Intrinsic::amdgcn_s_setreg: { |
3228 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
3229 | return; |
3230 | } |
3231 | case Intrinsic::amdgcn_s_ttracedata: |
3232 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0 |
3233 | return; |
3234 | case Intrinsic::amdgcn_raw_buffer_load_lds: |
3235 | case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { |
3236 | applyDefaultMapping(OpdMapper); |
3237 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc |
3238 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3239 | constrainOpWithReadfirstlane(B, MI, OpIdx: 5); // soffset |
3240 | return; |
3241 | } |
3242 | case Intrinsic::amdgcn_struct_buffer_load_lds: |
3243 | case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { |
3244 | applyDefaultMapping(OpdMapper); |
3245 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc |
3246 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3247 | constrainOpWithReadfirstlane(B, MI, OpIdx: 6); // soffset |
3248 | return; |
3249 | } |
3250 | case Intrinsic::amdgcn_global_load_lds: { |
3251 | applyDefaultMapping(OpdMapper); |
3252 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
3253 | return; |
3254 | } |
3255 | case Intrinsic::amdgcn_lds_direct_load: { |
3256 | applyDefaultMapping(OpdMapper); |
// Readfirstlane for the m0 value, which is always the last operand.
3258 | constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index |
3259 | return; |
3260 | } |
3261 | case Intrinsic::amdgcn_exp_row: |
3262 | applyDefaultMapping(OpdMapper); |
3263 | constrainOpWithReadfirstlane(B, MI, OpIdx: 8); // M0 |
3264 | return; |
3265 | case Intrinsic::amdgcn_s_sleep_var: |
3266 | assert(OpdMapper.getVRegs(1).empty()); |
3267 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
3268 | return; |
3269 | case Intrinsic::amdgcn_s_barrier_signal_var: |
3270 | case Intrinsic::amdgcn_s_barrier_join: |
3271 | case Intrinsic::amdgcn_s_wakeup_barrier: |
3272 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
3273 | return; |
3274 | case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: |
3275 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
3276 | return; |
3277 | case Intrinsic::amdgcn_s_barrier_init: |
3278 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
3279 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
3280 | return; |
3281 | case Intrinsic::amdgcn_s_get_barrier_state: { |
3282 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
3283 | return; |
3284 | } |
3285 | default: { |
3286 | if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = |
3287 | AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) { |
3288 | // Non-images can have complications from operands that allow both SGPR |
3289 | // and VGPR. For now it's too complicated to figure out the final opcode |
3290 | // to derive the register bank from the MCInstrDesc. |
3291 | if (RSrcIntrin->IsImage) { |
3292 | applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg); |
3293 | return; |
3294 | } |
3295 | } |
3296 | |
3297 | break; |
3298 | } |
3299 | } |
3300 | break; |
3301 | } |
3302 | case AMDGPU::G_SI_CALL: { |
3303 | // Use a set to avoid extra readfirstlanes in the case where multiple |
3304 | // operands are the same register. |
3305 | SmallSet<Register, 4> SGPROperandRegs; |
3306 | |
3307 | if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1})) |
3308 | break; |
3309 | |
// Move all copies to physical SGPRs that are used by the call instruction
// into the loop block. Search backwards from the call for these copies
// until the ADJCALLSTACKUP is reached.
3313 | unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP; |
3314 | unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN; |
3315 | |
3316 | // Move all non-copies before the copies, so that a complete range can be |
3317 | // moved into the waterfall loop. |
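//
// A sketch of the layout this produces (illustrative; the exact registers
// and operands depend on the call):
//   ADJCALLSTACKUP ...   ; non-copies are hoisted out from between copies
//   $sgprN = COPY %argA  ; \
//   $sgprM = COPY %argB  ; | contiguous range that, together with the call,
//   G_SI_CALL ...        ; | is wrapped in the waterfall loop
//   %ret = COPY $vgprK   ; /
//   ADJCALLSTACKDOWN ...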
3318 | SmallVector<MachineInstr *, 4> NonCopyInstrs; |
3319 | // Count of NonCopyInstrs found until the current LastCopy. |
3320 | unsigned NonCopyInstrsLen = 0; |
3321 | MachineBasicBlock::iterator Start(&MI); |
3322 | MachineBasicBlock::iterator LastCopy = Start; |
3323 | MachineBasicBlock *MBB = MI.getParent(); |
3324 | const SIMachineFunctionInfo *Info = |
3325 | MBB->getParent()->getInfo<SIMachineFunctionInfo>(); |
3326 | while (Start->getOpcode() != FrameSetupOpcode) { |
3327 | --Start; |
3328 | bool IsCopy = false; |
3329 | if (Start->getOpcode() == AMDGPU::COPY) { |
3330 | auto &Dst = Start->getOperand(i: 0); |
3331 | if (Dst.isReg()) { |
3332 | Register Reg = Dst.getReg(); |
3333 | if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) { |
3334 | IsCopy = true; |
3335 | } else { |
3336 | // Also move the copy from the scratch rsrc descriptor into the loop |
3337 | // to allow it to be optimized away. |
3338 | auto &Src = Start->getOperand(i: 1); |
3339 | if (Src.isReg()) { |
3340 | Reg = Src.getReg(); |
3341 | IsCopy = Info->getScratchRSrcReg() == Reg; |
3342 | } |
3343 | } |
3344 | } |
3345 | } |
3346 | |
3347 | if (IsCopy) { |
3348 | LastCopy = Start; |
3349 | NonCopyInstrsLen = NonCopyInstrs.size(); |
3350 | } else { |
3351 | NonCopyInstrs.push_back(Elt: &*Start); |
3352 | } |
3353 | } |
3354 | NonCopyInstrs.resize(N: NonCopyInstrsLen); |
3355 | |
3356 | for (auto *NonCopy : reverse(C&: NonCopyInstrs)) { |
3357 | MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator()); |
3358 | } |
3359 | Start = LastCopy; |
3360 | |
3361 | // Do the same for copies after the loop |
3362 | NonCopyInstrs.clear(); |
3363 | NonCopyInstrsLen = 0; |
3364 | MachineBasicBlock::iterator End(&MI); |
3365 | LastCopy = End; |
3366 | while (End->getOpcode() != FrameDestroyOpcode) { |
3367 | ++End; |
3368 | bool IsCopy = false; |
3369 | if (End->getOpcode() == AMDGPU::COPY) { |
3370 | auto &Src = End->getOperand(i: 1); |
3371 | if (Src.isReg()) { |
3372 | Register Reg = Src.getReg(); |
3373 | IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); |
3374 | } |
3375 | } |
3376 | |
3377 | if (IsCopy) { |
3378 | LastCopy = End; |
3379 | NonCopyInstrsLen = NonCopyInstrs.size(); |
3380 | } else { |
3381 | NonCopyInstrs.push_back(Elt: &*End); |
3382 | } |
3383 | } |
3384 | NonCopyInstrs.resize(N: NonCopyInstrsLen); |
3385 | |
3386 | End = LastCopy; |
3387 | ++LastCopy; |
3388 | for (auto *NonCopy : reverse(C&: NonCopyInstrs)) { |
3389 | MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator()); |
3390 | } |
3391 | |
3392 | ++End; |
3393 | B.setInsertPt(MBB&: B.getMBB(), II: Start); |
3394 | executeInWaterfallLoop(B, make_range(x: Start, y: End), SGPROperandRegs); |
3395 | break; |
3396 | } |
3397 | case AMDGPU::G_LOAD: |
3398 | case AMDGPU::G_ZEXTLOAD: |
3399 | case AMDGPU::G_SEXTLOAD: { |
3400 | if (applyMappingLoad(B, OpdMapper, MI)) |
3401 | return; |
3402 | break; |
3403 | } |
3404 | case AMDGPU::G_DYN_STACKALLOC: |
3405 | applyMappingDynStackAlloc(B, OpdMapper, MI); |
3406 | return; |
3407 | case AMDGPU::G_STACKRESTORE: { |
3408 | applyDefaultMapping(OpdMapper); |
3409 | constrainOpWithReadfirstlane(B, MI, OpIdx: 0); |
3410 | return; |
3411 | } |
3412 | case AMDGPU::G_SBFX: |
3413 | applyMappingBFE(B, OpdMapper, /*Signed*/ true); |
3414 | return; |
3415 | case AMDGPU::G_UBFX: |
3416 | applyMappingBFE(B, OpdMapper, /*Signed*/ false); |
3417 | return; |
3418 | case AMDGPU::G_AMDGPU_MAD_U64_U32: |
3419 | case AMDGPU::G_AMDGPU_MAD_I64_I32: |
3420 | applyMappingMAD_64_32(B, OpdMapper); |
3421 | return; |
3422 | case AMDGPU::G_PREFETCH: { |
3423 | if (!Subtarget.hasPrefetch()) { |
3424 | MI.eraseFromParent(); |
3425 | return; |
3426 | } |
3427 | Register PtrReg = MI.getOperand(i: 0).getReg(); |
3428 | unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); |
3429 | if (PtrBank == AMDGPU::VGPRRegBankID) { |
3430 | MI.eraseFromParent(); |
3431 | return; |
3432 | } |
3433 | unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace(); |
3434 | if (!AMDGPU::isFlatGlobalAddrSpace(AS) && |
3435 | AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { |
3436 | MI.eraseFromParent(); |
3437 | return; |
3438 | } |
3439 | applyDefaultMapping(OpdMapper); |
3440 | return; |
3441 | } |
3442 | default: |
3443 | break; |
3444 | } |
3445 | |
3446 | return applyDefaultMapping(OpdMapper); |
3447 | } |
3448 | |
3449 | // vgpr, sgpr -> vgpr |
3450 | // vgpr, agpr -> vgpr |
3451 | // agpr, agpr -> agpr |
3452 | // agpr, sgpr -> vgpr |
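// For example, regBankUnion(AMDGPU::AGPRRegBankID, AMDGPU::SGPRRegBankID)
// returns AMDGPU::VGPRRegBankID, matching the agpr, sgpr -> vgpr row above.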
3453 | static unsigned regBankUnion(unsigned RB0, unsigned RB1) { |
3454 | if (RB0 == AMDGPU::InvalidRegBankID) |
3455 | return RB1; |
3456 | if (RB1 == AMDGPU::InvalidRegBankID) |
3457 | return RB0; |
3458 | |
3459 | if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) |
3460 | return AMDGPU::SGPRRegBankID; |
3461 | |
3462 | if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) |
3463 | return AMDGPU::AGPRRegBankID; |
3464 | |
3465 | return AMDGPU::VGPRRegBankID; |
3466 | } |
3467 | |
3468 | static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { |
3469 | if (RB0 == AMDGPU::InvalidRegBankID) |
3470 | return RB1; |
3471 | if (RB1 == AMDGPU::InvalidRegBankID) |
3472 | return RB0; |
3473 | |
3474 | // vcc, vcc -> vcc |
3475 | // vcc, sgpr -> vcc |
3476 | // vcc, vgpr -> vcc |
3477 | if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) |
3478 | return AMDGPU::VCCRegBankID; |
3479 | |
// Neither operand is vcc here, so fall back to the plain bank union.
return regBankUnion(RB0, RB1);
3482 | } |
3483 | |
3484 | unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, |
3485 | const MachineInstr &MI) const { |
3486 | unsigned RegBank = AMDGPU::InvalidRegBankID; |
3487 | |
3488 | for (const MachineOperand &MO : MI.operands()) { |
3489 | if (!MO.isReg()) |
3490 | continue; |
3491 | Register Reg = MO.getReg(); |
3492 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { |
3493 | RegBank = regBankUnion(RB0: RegBank, RB1: Bank->getID()); |
3494 | if (RegBank == AMDGPU::VGPRRegBankID) |
3495 | break; |
3496 | } |
3497 | } |
3498 | |
3499 | return RegBank; |
3500 | } |
3501 | |
3502 | bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { |
3503 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3504 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3505 | for (const MachineOperand &MO : MI.operands()) { |
3506 | if (!MO.isReg()) |
3507 | continue; |
3508 | Register Reg = MO.getReg(); |
3509 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { |
3510 | if (Bank->getID() != AMDGPU::SGPRRegBankID) |
3511 | return false; |
3512 | } |
3513 | } |
3514 | return true; |
3515 | } |
3516 | |
3517 | const RegisterBankInfo::InstructionMapping & |
3518 | AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { |
3519 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3520 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3521 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
3522 | |
3523 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
3524 | const MachineOperand &SrcOp = MI.getOperand(i); |
3525 | if (!SrcOp.isReg()) |
3526 | continue; |
3527 | |
3528 | unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); |
3529 | OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
3530 | } |
3531 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
3532 | NumOperands: MI.getNumOperands()); |
3533 | } |
3534 | |
3535 | const RegisterBankInfo::InstructionMapping & |
3536 | AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { |
3537 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3538 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3539 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
3540 | |
3541 | // Even though we technically could use SGPRs, this would require knowledge of |
3542 | // the constant bus restriction. Force all sources to VGPR (except for VCC). |
3543 | // |
3544 | // TODO: Unary ops are trivially OK, so accept SGPRs? |
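//
// As an illustrative result (made-up registers): for a 32-bit
// %d = G_ADD %x, %y this reports vgpr for all three operands, while an s1
// operand (e.g. the carry-out of G_UADDO) would be reported as vcc.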
3545 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
3546 | const MachineOperand &Src = MI.getOperand(i); |
3547 | if (!Src.isReg()) |
3548 | continue; |
3549 | |
3550 | unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); |
3551 | unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; |
3552 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); |
3553 | } |
3554 | |
3555 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
3556 | NumOperands: MI.getNumOperands()); |
3557 | } |
3558 | |
3559 | const RegisterBankInfo::InstructionMapping & |
3560 | AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { |
3561 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3562 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3563 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
3564 | |
3565 | for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { |
3566 | const MachineOperand &Op = MI.getOperand(i: I); |
3567 | if (!Op.isReg()) |
3568 | continue; |
3569 | |
3570 | unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); |
3571 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3572 | } |
3573 | |
3574 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
3575 | NumOperands: MI.getNumOperands()); |
3576 | } |
3577 | |
3578 | const RegisterBankInfo::InstructionMapping & |
3579 | AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, |
3580 | const MachineInstr &MI, |
3581 | int RsrcIdx) const { |
// The reported argument index is relative to the IR intrinsic call
// arguments, so we need to shift by the number of defs plus one for the
// intrinsic ID operand.
RsrcIdx += MI.getNumExplicitDefs() + 1;
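// For example (hypothetical operand layout): with one def and the intrinsic
// ID in operand 1, an IR argument index of 2 lands at machine operand
// 1 + 1 + 2 = 4.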
3585 | |
3586 | const int NumOps = MI.getNumOperands(); |
3587 | SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); |
3588 | |
3589 | // TODO: Should packed/unpacked D16 difference be reported here as part of |
3590 | // the value mapping? |
3591 | for (int I = 0; I != NumOps; ++I) { |
3592 | if (!MI.getOperand(i: I).isReg()) |
3593 | continue; |
3594 | |
3595 | Register OpReg = MI.getOperand(i: I).getReg(); |
3596 | // We replace some dead address operands with $noreg |
3597 | if (!OpReg) |
3598 | continue; |
3599 | |
3600 | unsigned Size = getSizeInBits(OpReg, MRI, *TRI); |
3601 | |
3602 | // FIXME: Probably need a new intrinsic register bank searchable table to |
3603 | // handle arbitrary intrinsics easily. |
3604 | // |
3605 | // If this has a sampler, it immediately follows rsrc. |
3606 | const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; |
3607 | |
if (MustBeSGPR) {
  // This must be an SGPR, but we have to report whatever bank it currently
  // has as legal; a divergent value is repaired later in applyMappingImpl.
  unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
  OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3612 | } else { |
3613 | // Some operands must be VGPR, and these are easy to copy to. |
3614 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3615 | } |
3616 | } |
3617 | |
3618 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: NumOps); |
3619 | } |
3620 | |
3621 | /// Return the mapping for a pointer argument. |
3622 | const RegisterBankInfo::ValueMapping * |
3623 | AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, |
3624 | Register PtrReg) const { |
3625 | LLT PtrTy = MRI.getType(Reg: PtrReg); |
3626 | unsigned Size = PtrTy.getSizeInBits(); |
3627 | if (Subtarget.useFlatForGlobal() || |
3628 | !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) |
3629 | return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3630 | |
3631 | // If we're using MUBUF instructions for global memory, an SGPR base register |
3632 | // is possible. Otherwise this needs to be a VGPR. |
3633 | const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); |
3634 | return AMDGPU::getValueMapping(BankID: PtrBank->getID(), Size); |
3635 | } |
3636 | |
3637 | const RegisterBankInfo::InstructionMapping & |
3638 | AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { |
3639 | |
3640 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3641 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3642 | SmallVector<const ValueMapping*, 2> OpdsMapping(2); |
3643 | unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI); |
3644 | Register PtrReg = MI.getOperand(i: 1).getReg(); |
3645 | LLT PtrTy = MRI.getType(Reg: PtrReg); |
3646 | unsigned AS = PtrTy.getAddressSpace(); |
3647 | unsigned PtrSize = PtrTy.getSizeInBits(); |
3648 | |
3649 | const ValueMapping *ValMapping; |
3650 | const ValueMapping *PtrMapping; |
3651 | |
3652 | const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); |
3653 | |
3654 | if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { |
3655 | if (isScalarLoadLegal(MI)) { |
// We have a uniform instruction, so we want to use an SMRD load.
3657 | ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
3658 | PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); |
3659 | } else { |
3660 | ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3661 | |
3662 | // If we're using MUBUF instructions for global memory, an SGPR base |
3663 | // register is possible. Otherwise this needs to be a VGPR. |
3664 | unsigned PtrBankID = Subtarget.useFlatForGlobal() ? |
3665 | AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; |
3666 | |
3667 | PtrMapping = AMDGPU::getValueMapping(BankID: PtrBankID, Size: PtrSize); |
3668 | } |
3669 | } else { |
3670 | ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3671 | PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); |
3672 | } |
3673 | |
3674 | OpdsMapping[0] = ValMapping; |
3675 | OpdsMapping[1] = PtrMapping; |
3676 | const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( |
3677 | ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: MI.getNumOperands()); |
3678 | return Mapping; |
3679 | |
3680 | // FIXME: Do we want to add a mapping for FLAT load, or should we just |
3681 | // handle that during instruction selection? |
3682 | } |
3683 | |
3684 | unsigned |
3685 | AMDGPURegisterBankInfo::getRegBankID(Register Reg, |
3686 | const MachineRegisterInfo &MRI, |
3687 | unsigned Default) const { |
3688 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); |
3689 | return Bank ? Bank->getID() : Default; |
3690 | } |
3691 | |
3692 | const RegisterBankInfo::ValueMapping * |
3693 | AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, |
3694 | const MachineRegisterInfo &MRI, |
3695 | const TargetRegisterInfo &TRI) const { |
// Lie and claim anything is legal, even though this needs to be an SGPR;
// applyMapping will have to deal with it as a waterfall loop.
3698 | unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID); |
3699 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
3700 | return AMDGPU::getValueMapping(BankID: Bank, Size); |
3701 | } |
3702 | |
3703 | const RegisterBankInfo::ValueMapping * |
3704 | AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, |
3705 | const MachineRegisterInfo &MRI, |
3706 | const TargetRegisterInfo &TRI) const { |
3707 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
3708 | return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3709 | } |
3710 | |
3711 | const RegisterBankInfo::ValueMapping * |
3712 | AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, |
3713 | const MachineRegisterInfo &MRI, |
3714 | const TargetRegisterInfo &TRI) const { |
3715 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
3716 | return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size); |
3717 | } |
3718 | |
3719 | /// |
3720 | /// This function must return a legal mapping, because |
3721 | /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called |
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
3724 | /// |
3725 | // Operands that must be SGPRs must accept potentially divergent VGPRs as |
3726 | // legal. These will be dealt with in applyMappingImpl. |
3727 | // |
3728 | const RegisterBankInfo::InstructionMapping & |
3729 | AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { |
3730 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3731 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3732 | |
3733 | if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { |
3734 | // The default logic bothers to analyze impossible alternative mappings. We |
3735 | // want the most straightforward mapping, so just directly handle this. |
3736 | const RegisterBank *DstBank = getRegBank(MI.getOperand(i: 0).getReg(), MRI, |
3737 | *TRI); |
3738 | const RegisterBank *SrcBank = getRegBank(MI.getOperand(i: 1).getReg(), MRI, |
3739 | *TRI); |
assert(SrcBank && "src bank should have been assigned already");
3741 | if (!DstBank) |
3742 | DstBank = SrcBank; |
3743 | |
3744 | unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI); |
3745 | if (MI.getOpcode() != AMDGPU::G_FREEZE && |
3746 | cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size))) |
3747 | return getInvalidInstructionMapping(); |
3748 | |
3749 | const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: *DstBank); |
3750 | unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2; |
3751 | SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); |
3752 | OpdsMapping[0] = &ValMap; |
3753 | if (MI.getOpcode() == AMDGPU::G_FREEZE) |
3754 | OpdsMapping[1] = &ValMap; |
3755 | |
3756 | return getInstructionMapping( |
3757 | ID: 1, /*Cost*/ 1, |
3758 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping), NumOperands: OpdsMappingSize); |
3759 | } |
3760 | |
3761 | if (MI.isRegSequence()) { |
3762 | // If any input is a VGPR, the result must be a VGPR. The default handling |
3763 | // assumes any copy between banks is legal. |
3764 | unsigned BankID = AMDGPU::SGPRRegBankID; |
3765 | |
3766 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
3767 | auto OpBank = getRegBankID(Reg: MI.getOperand(i: I).getReg(), MRI); |
3768 | // It doesn't make sense to use vcc or scc banks here, so just ignore |
3769 | // them. |
3770 | if (OpBank != AMDGPU::SGPRRegBankID) { |
3771 | BankID = AMDGPU::VGPRRegBankID; |
3772 | break; |
3773 | } |
3774 | } |
3775 | unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI); |
3776 | |
3777 | const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: BankID)); |
3778 | return getInstructionMapping( |
3779 | ID: 1, /*Cost*/ 1, |
3780 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1); |
3781 | } |
3782 | |
3783 | // The default handling is broken and doesn't handle illegal SGPR->VGPR copies |
3784 | // properly. |
3785 | // |
3786 | // TODO: There are additional exec masking dependencies to analyze. |
3787 | if (auto *PHI = dyn_cast<GPhi>(Val: &MI)) { |
3788 | unsigned ResultBank = AMDGPU::InvalidRegBankID; |
3789 | Register DstReg = PHI->getReg(Idx: 0); |
3790 | |
3791 | // Sometimes the result may have already been assigned a bank. |
3792 | if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) |
3793 | ResultBank = DstBank->getID(); |
3794 | |
3795 | for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) { |
3796 | Register Reg = PHI->getIncomingValue(I); |
3797 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); |
3798 | |
3799 | // FIXME: Assuming VGPR for any undetermined inputs. |
3800 | if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { |
3801 | ResultBank = AMDGPU::VGPRRegBankID; |
3802 | break; |
3803 | } |
3804 | |
3805 | // FIXME: Need to promote SGPR case to s32 |
3806 | unsigned OpBank = Bank->getID(); |
3807 | ResultBank = regBankBoolUnion(RB0: ResultBank, RB1: OpBank); |
3808 | } |
3809 | |
3810 | assert(ResultBank != AMDGPU::InvalidRegBankID); |
3811 | |
3812 | unsigned Size = MRI.getType(Reg: DstReg).getSizeInBits(); |
3813 | |
3814 | const ValueMapping &ValMap = |
3815 | getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: ResultBank)); |
3816 | return getInstructionMapping( |
3817 | ID: 1, /*Cost*/ 1, |
3818 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1); |
3819 | } |
3820 | |
3821 | const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); |
3822 | if (Mapping.isValid()) |
3823 | return Mapping; |
3824 | |
3825 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
3826 | |
3827 | switch (MI.getOpcode()) { |
3828 | default: |
3829 | return getInvalidInstructionMapping(); |
3830 | |
3831 | case AMDGPU::G_AND: |
3832 | case AMDGPU::G_OR: |
3833 | case AMDGPU::G_XOR: |
3834 | case AMDGPU::G_MUL: { |
3835 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
3836 | if (Size == 1) { |
3837 | const RegisterBank *DstBank |
3838 | = getRegBank(MI.getOperand(i: 0).getReg(), MRI, *TRI); |
3839 | |
3840 | unsigned TargetBankID = AMDGPU::InvalidRegBankID; |
3841 | unsigned BankLHS = AMDGPU::InvalidRegBankID; |
3842 | unsigned BankRHS = AMDGPU::InvalidRegBankID; |
3843 | if (DstBank) { |
3844 | TargetBankID = DstBank->getID(); |
3845 | if (DstBank == &AMDGPU::VCCRegBank) { |
3846 | TargetBankID = AMDGPU::VCCRegBankID; |
3847 | BankLHS = AMDGPU::VCCRegBankID; |
3848 | BankRHS = AMDGPU::VCCRegBankID; |
3849 | } else { |
3850 | BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, |
3851 | AMDGPU::SGPRRegBankID); |
3852 | BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, |
3853 | AMDGPU::SGPRRegBankID); |
3854 | } |
3855 | } else { |
3856 | BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, |
3857 | AMDGPU::VCCRegBankID); |
3858 | BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, |
3859 | AMDGPU::VCCRegBankID); |
3860 | |
3861 | // Both inputs should be true booleans to produce a boolean result. |
3862 | if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { |
3863 | TargetBankID = AMDGPU::VGPRRegBankID; |
3864 | } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { |
3865 | TargetBankID = AMDGPU::VCCRegBankID; |
3866 | BankLHS = AMDGPU::VCCRegBankID; |
3867 | BankRHS = AMDGPU::VCCRegBankID; |
3868 | } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { |
3869 | TargetBankID = AMDGPU::SGPRRegBankID; |
3870 | } |
3871 | } |
3872 | |
3873 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: TargetBankID, Size); |
3874 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: BankLHS, Size); |
3875 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: BankRHS, Size); |
3876 | break; |
3877 | } |
3878 | |
3879 | if (Size == 64) { |
3881 | if (isSALUMapping(MI)) { |
3882 | OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); |
3883 | OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; |
3884 | } else { |
3885 | OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); |
3886 | unsigned Bank1 = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI /*, DefaultBankID*/); |
3887 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank1, Size); |
3888 | |
3889 | unsigned Bank2 = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI /*, DefaultBankID*/); |
3890 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank2, Size); |
3891 | } |
3892 | |
3893 | break; |
3894 | } |
3895 | |
3896 | [[fallthrough]]; |
3897 | } |
3898 | case AMDGPU::G_PTR_ADD: |
3899 | case AMDGPU::G_PTRMASK: |
3900 | case AMDGPU::G_ADD: |
3901 | case AMDGPU::G_SUB: |
3902 | case AMDGPU::G_SHL: |
3903 | case AMDGPU::G_LSHR: |
3904 | case AMDGPU::G_ASHR: |
3905 | case AMDGPU::G_UADDO: |
3906 | case AMDGPU::G_USUBO: |
3907 | case AMDGPU::G_UADDE: |
3908 | case AMDGPU::G_SADDE: |
3909 | case AMDGPU::G_USUBE: |
3910 | case AMDGPU::G_SSUBE: |
3911 | case AMDGPU::G_SMIN: |
3912 | case AMDGPU::G_SMAX: |
3913 | case AMDGPU::G_UMIN: |
3914 | case AMDGPU::G_UMAX: |
3915 | case AMDGPU::G_ABS: |
3916 | case AMDGPU::G_SHUFFLE_VECTOR: |
3917 | case AMDGPU::G_SBFX: |
3918 | case AMDGPU::G_UBFX: |
3919 | case AMDGPU::G_AMDGPU_S_MUL_I64_I32: |
3920 | case AMDGPU::G_AMDGPU_S_MUL_U64_U32: |
3921 | if (isSALUMapping(MI)) |
3922 | return getDefaultMappingSOP(MI); |
3923 | return getDefaultMappingVOP(MI); |
3924 | case AMDGPU::G_FADD: |
3925 | case AMDGPU::G_FSUB: |
3926 | case AMDGPU::G_FMUL: |
3927 | case AMDGPU::G_FMA: |
3928 | case AMDGPU::G_FFLOOR: |
3929 | case AMDGPU::G_FCEIL: |
3930 | case AMDGPU::G_INTRINSIC_ROUNDEVEN: |
3931 | case AMDGPU::G_FMINNUM: |
3932 | case AMDGPU::G_FMAXNUM: |
3933 | case AMDGPU::G_FMINIMUM: |
3934 | case AMDGPU::G_FMAXIMUM: |
3935 | case AMDGPU::G_INTRINSIC_TRUNC: |
3936 | case AMDGPU::G_STRICT_FADD: |
3937 | case AMDGPU::G_STRICT_FSUB: |
3938 | case AMDGPU::G_STRICT_FMUL: |
3939 | case AMDGPU::G_STRICT_FMA: { |
3940 | LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg()); |
3941 | unsigned Size = Ty.getSizeInBits(); |
3942 | if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() && |
3943 | (Size == 32 || Size == 16) && isSALUMapping(MI)) |
3944 | return getDefaultMappingSOP(MI); |
3945 | return getDefaultMappingVOP(MI); |
3946 | } |
3947 | case AMDGPU::G_FPTOSI: |
3948 | case AMDGPU::G_FPTOUI: |
3949 | case AMDGPU::G_SITOFP: |
3950 | case AMDGPU::G_UITOFP: { |
3951 | unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
3952 | unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
3953 | if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 && |
3954 | isSALUMapping(MI)) |
3955 | return getDefaultMappingSOP(MI); |
3956 | return getDefaultMappingVOP(MI); |
3957 | } |
3958 | case AMDGPU::G_FPTRUNC: |
3959 | case AMDGPU::G_FPEXT: { |
3960 | unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
3961 | unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
3962 | if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 && |
3963 | isSALUMapping(MI)) |
3964 | return getDefaultMappingSOP(MI); |
3965 | return getDefaultMappingVOP(MI); |
3966 | } |
3967 | case AMDGPU::G_FSQRT: |
3968 | case AMDGPU::G_FEXP2: |
3969 | case AMDGPU::G_FLOG2: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3971 | if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && |
3972 | isSALUMapping(MI)) |
3973 | return getDefaultMappingSOP(MI); |
3974 | return getDefaultMappingVOP(MI); |
3975 | } |
3976 | case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU |
3977 | case AMDGPU::G_SSUBSAT: |
3978 | case AMDGPU::G_UADDSAT: |
3979 | case AMDGPU::G_USUBSAT: |
3980 | case AMDGPU::G_FMAD: |
3981 | case AMDGPU::G_FLDEXP: |
3982 | case AMDGPU::G_FMINNUM_IEEE: |
3983 | case AMDGPU::G_FMAXNUM_IEEE: |
3984 | case AMDGPU::G_FCANONICALIZE: |
3985 | case AMDGPU::G_STRICT_FLDEXP: |
3986 | case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? |
3987 | case AMDGPU::G_FSHR: // TODO: Expand for scalar |
3988 | case AMDGPU::G_AMDGPU_FMIN_LEGACY: |
3989 | case AMDGPU::G_AMDGPU_FMAX_LEGACY: |
3990 | case AMDGPU::G_AMDGPU_RCP_IFLAG: |
3991 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: |
3992 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: |
3993 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: |
3994 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: |
3995 | case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: |
3996 | case AMDGPU::G_AMDGPU_SMED3: |
3997 | case AMDGPU::G_AMDGPU_FMED3: |
3998 | return getDefaultMappingVOP(MI); |
3999 | case AMDGPU::G_UMULH: |
4000 | case AMDGPU::G_SMULH: { |
4001 | if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) |
4002 | return getDefaultMappingSOP(MI); |
4003 | return getDefaultMappingVOP(MI); |
4004 | } |
4005 | case AMDGPU::G_AMDGPU_MAD_U64_U32: |
4006 | case AMDGPU::G_AMDGPU_MAD_I64_I32: { |
4007 | // Three possible mappings: |
4008 | // |
4009 | // - Default SOP |
4010 | // - Default VOP |
4011 | // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. |
4012 | // |
4013 | // This allows instruction selection to keep the multiplication part of the |
4014 | // instruction on the SALU. |
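// Illustrative MIR sketch (register names are hypothetical): with uniform
// multiplicands and a divergent addend,
//   %d:vgpr(s64), %c:vcc(s1) = G_AMDGPU_MAD_U64_U32 %a:sgpr, %b:sgpr, %acc:vgpr
// the scalar-multiply mapping keeps the 32x32->64 multiply on the SALU and
// only the 64-bit accumulate on the VALU.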
4015 | bool AllSalu = true; |
4016 | bool MulSalu = true; |
4017 | for (unsigned i = 0; i < 5; ++i) { |
4018 | Register Reg = MI.getOperand(i).getReg(); |
4019 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { |
4020 | if (Bank->getID() != AMDGPU::SGPRRegBankID) { |
4021 | AllSalu = false; |
4022 | if (i == 2 || i == 3) { |
4023 | MulSalu = false; |
4024 | break; |
4025 | } |
4026 | } |
4027 | } |
4028 | } |
4029 | |
4030 | if (AllSalu) |
4031 | return getDefaultMappingSOP(MI); |
4032 | |
4033 | // If the multiply-add is full-rate in VALU, use that even if the |
4034 | // multiplication part is scalar. Accumulating separately on the VALU would |
4035 | // take two instructions. |
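// (E.g. subtargets reporting hasFullRate64Ops() can issue the 64-bit VALU
// MAD at full rate, so a single VALU instruction beats a scalar multiply
// plus a two-instruction VALU accumulate.)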
4036 | if (!MulSalu || Subtarget.hasFullRate64Ops()) |
4037 | return getDefaultMappingVOP(MI); |
4038 | |
4039 | // Keep the multiplication on the SALU, then accumulate on the VALU. |
4040 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); |
4041 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4042 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
4043 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
4044 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); |
4045 | break; |
4046 | } |
4047 | case AMDGPU::G_IMPLICIT_DEF: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4049 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4050 | break; |
4051 | } |
4052 | case AMDGPU::G_FCONSTANT: |
4053 | case AMDGPU::G_CONSTANT: |
4054 | case AMDGPU::G_GLOBAL_VALUE: |
4055 | case AMDGPU::G_BLOCK_ADDR: |
4056 | case AMDGPU::G_READSTEADYCOUNTER: |
4057 | case AMDGPU::G_READCYCLECOUNTER: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4059 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4060 | break; |
4061 | } |
4062 | case AMDGPU::G_FRAME_INDEX: { |
4063 | // TODO: This should be the same as other constants, but eliminateFrameIndex |
4064 | // currently assumes VALU uses. |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4066 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4067 | break; |
4068 | } |
4069 | case AMDGPU::G_DYN_STACKALLOC: { |
4070 | // Result is always uniform, and a wave reduction is needed for the source. |
4071 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
4074 | break; |
4075 | } |
4076 | case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { |
4077 | // This case is weird because we expect a physical register in the source, |
4078 | // but need to set a bank anyway. |
4079 | // |
4080 | // TODO: We could select the result to SGPR or VGPR |
4081 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
4082 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
4083 | break; |
4084 | } |
4085 | case AMDGPU::G_INSERT: { |
4086 | unsigned BankID = getMappingType(MRI, MI); |
unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
4093 | OpdsMapping[3] = nullptr; |
4094 | break; |
4095 | } |
4096 | case AMDGPU::G_EXTRACT: { |
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4102 | OpdsMapping[2] = nullptr; |
4103 | break; |
4104 | } |
4105 | case AMDGPU::G_BUILD_VECTOR: |
4106 | case AMDGPU::G_BUILD_VECTOR_TRUNC: { |
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
if (DstTy == LLT::fixed_vector(2, 16)) {
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);

OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
4118 | break; |
4119 | } |
4120 | |
4121 | [[fallthrough]]; |
4122 | } |
4123 | case AMDGPU::G_MERGE_VALUES: |
4124 | case AMDGPU::G_CONCAT_VECTORS: { |
4125 | unsigned Bank = getMappingType(MRI, MI); |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
// Op1 and Dst should use the same register bank.
for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
4133 | break; |
4134 | } |
4135 | case AMDGPU::G_BITREVERSE: |
4136 | case AMDGPU::G_BITCAST: |
4137 | case AMDGPU::G_INTTOPTR: |
4138 | case AMDGPU::G_PTRTOINT: |
4139 | case AMDGPU::G_FABS: |
4140 | case AMDGPU::G_FNEG: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4143 | OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); |
4144 | break; |
4145 | } |
4146 | case AMDGPU::G_AMDGPU_FFBH_U32: |
4147 | case AMDGPU::G_AMDGPU_FFBL_B32: |
4148 | case AMDGPU::G_CTLZ_ZERO_UNDEF: |
4149 | case AMDGPU::G_CTTZ_ZERO_UNDEF: { |
unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4153 | OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); |
4154 | break; |
4155 | } |
4156 | case AMDGPU::G_CTPOP: { |
unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4160 | |
4161 | // This should really be getValueMappingSGPR64Only, but allowing the generic |
4162 | // code to handle the register split just makes using LegalizerHelper more |
4163 | // difficult. |
4164 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); |
4165 | break; |
4166 | } |
4167 | case AMDGPU::G_TRUNC: { |
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
unsigned Bank = getRegBankID(Src, MRI);
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
4175 | break; |
4176 | } |
4177 | case AMDGPU::G_ZEXT: |
4178 | case AMDGPU::G_SEXT: |
4179 | case AMDGPU::G_ANYEXT: |
4180 | case AMDGPU::G_SEXT_INREG: { |
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
4183 | unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); |
4184 | unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); |
4185 | |
4186 | unsigned DstBank; |
4187 | const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); |
4188 | assert(SrcBank); |
4189 | switch (SrcBank->getID()) { |
4190 | case AMDGPU::SGPRRegBankID: |
4191 | DstBank = AMDGPU::SGPRRegBankID; |
4192 | break; |
4193 | default: |
4194 | DstBank = AMDGPU::VGPRRegBankID; |
4195 | break; |
4196 | } |
4197 | |
4198 | // Scalar extend can use 64-bit BFE, but VGPRs require extending to |
4199 | // 32-bits, and then to 64. |
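// Sketch (illustrative): a scalar s32 -> s64 sign extend can be selected as
// one 64-bit S_BFE_I64, whereas the VGPR path extends within 32 bits and
// then derives the high half, e.g. with a 31-bit V_ASHRREV_I32 of the low
// half for the sign bits.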
OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), SrcSize);
4203 | break; |
4204 | } |
4205 | case AMDGPU::G_IS_FPCLASS: { |
Register SrcReg = MI.getOperand(1).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4209 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); |
4210 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
4211 | break; |
4212 | } |
4213 | case AMDGPU::G_STORE: { |
4214 | assert(MI.getOperand(0).isReg()); |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4216 | |
4217 | // FIXME: We need to specify a different reg bank once scalar stores are |
4218 | // supported. |
4219 | const ValueMapping *ValMapping = |
4220 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4221 | OpdsMapping[0] = ValMapping; |
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4223 | break; |
4224 | } |
4225 | case AMDGPU::G_ICMP: |
4226 | case AMDGPU::G_FCMP: { |
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4228 | |
4229 | // See if the result register has already been constrained to vcc, which may |
4230 | // happen due to control flow intrinsic lowering. |
4231 | unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, |
4232 | AMDGPU::SGPRRegBankID); |
unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4235 | |
4236 | auto canUseSCCICMP = [&]() { |
4237 | auto Pred = |
static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4239 | return Size == 32 || |
4240 | (Size == 64 && |
4241 | (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && |
4242 | Subtarget.hasScalarCompareEq64()); |
4243 | }; |
4244 | auto canUseSCCFCMP = [&]() { |
4245 | return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16); |
4246 | }; |
4247 | |
4248 | bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP; |
4249 | bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && |
4250 | Op2Bank == AMDGPU::SGPRRegBankID && |
4251 | Op3Bank == AMDGPU::SGPRRegBankID && |
4252 | (isICMP ? canUseSCCICMP() : canUseSCCFCMP()); |
4253 | |
4254 | DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; |
4255 | unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
4256 | |
4257 | // TODO: Use 32-bit for scalar output size. |
4258 | // SCC results will need to be copied to a 32-bit SGPR virtual register. |
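// (Illustrative assumption: the copy out of SCC is typically materialized
// at selection time with a conditional move such as S_CSELECT_B32.)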
4259 | const unsigned ResultSize = 1; |
4260 | |
OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
OpdsMapping[1] = nullptr; // Predicate Operand.
OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4265 | break; |
4266 | } |
4267 | case AMDGPU::G_EXTRACT_VECTOR_ELT: { |
4268 | // VGPR index can be used for waterfall when indexing a SGPR vector. |
unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

// The index can be in either bank if the source vector is VGPR.
OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4281 | break; |
4282 | } |
4283 | case AMDGPU::G_INSERT_VECTOR_ELT: { |
4284 | unsigned OutputBankID = isSALUMapping(MI) ? |
4285 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
4286 | |
unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);

OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4295 | |
4296 | // This is a weird case, because we need to break down the mapping based on |
4297 | // the register bank of a different operand. |
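// Sketch of the split: a 64-bit insert into a VGPR vector is selected as
// two 32-bit indexed writes, so the inserted value is described as two
// 32-bit pieces, each of which may independently be an SGPR or VGPR.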
4298 | if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { |
OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID, InsertSize);
4301 | } else { |
4302 | assert(InsertSize == 32 || InsertSize == 64); |
OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4304 | } |
4305 | |
// The index can be in either bank if the source vector is VGPR.
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4308 | break; |
4309 | } |
4310 | case AMDGPU::G_UNMERGE_VALUES: { |
4311 | unsigned Bank = getMappingType(MRI, MI); |
4312 | |
4313 | // Op1 and Dst should use the same register bank. |
4314 | // FIXME: Shouldn't this be the default? Why do we need to handle this? |
4315 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
4316 | unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); |
OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4318 | } |
4319 | break; |
4320 | } |
4321 | case AMDGPU::G_AMDGPU_BUFFER_LOAD: |
4322 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: |
4323 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: |
4324 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: |
4325 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: |
4326 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: |
4327 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: |
4328 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: |
4329 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: |
4330 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: |
4331 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: |
4332 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: |
4333 | case AMDGPU::G_AMDGPU_BUFFER_STORE: |
4334 | case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: |
4335 | case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: |
4336 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: |
4337 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

// rsrc
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

// vindex
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

// voffset
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

// soffset
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4351 | |
4352 | // Any remaining operands are immediates and were correctly null |
4353 | // initialized. |
4354 | break; |
4355 | } |
4356 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: |
4357 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: |
4358 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: |
4359 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: |
4360 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: |
4361 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: |
4362 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: |
4363 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: |
4364 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: |
4365 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: |
4366 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: |
4367 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: |
4368 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: |
4369 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: |
4370 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: |
4371 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { |
4372 | // vdata_out |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

// vdata_in
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

// rsrc
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

// vindex
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

// voffset
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

// soffset
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4389 | |
4390 | // Any remaining operands are immediates and were correctly null |
4391 | // initialized. |
4392 | break; |
4393 | } |
4394 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { |
4395 | // vdata_out |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

// vdata_in
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

// cmp
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

// rsrc
OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

// vindex
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

// voffset
OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

// soffset
OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4415 | |
4416 | // Any remaining operands are immediates and were correctly null |
4417 | // initialized. |
4418 | break; |
4419 | } |
4420 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: |
4421 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: |
4422 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: |
4423 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: |
4424 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { |
4425 | // Lie and claim everything is legal, even though some need to be |
4426 | // SGPRs. applyMapping will have to deal with it as a waterfall loop. |
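// Illustrative consequence: if the descriptor or offset ends up divergent,
// applyMapping rewrites this as a buffer load inside a waterfall loop,
// using readfirstlane to form a uniform descriptor for each set of lanes.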
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4429 | |
// We need to convert this to a MUBUF if either the resource or the offset
// is VGPR.
4432 | unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); |
4433 | unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); |
unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4435 | |
unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4438 | break; |
4439 | } |
4440 | case AMDGPU::G_INTRINSIC: |
4441 | case AMDGPU::G_INTRINSIC_CONVERGENT: { |
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4443 | default: |
4444 | return getInvalidInstructionMapping(); |
4445 | case Intrinsic::amdgcn_div_fmas: |
4446 | case Intrinsic::amdgcn_div_fixup: |
4447 | case Intrinsic::amdgcn_trig_preop: |
4448 | case Intrinsic::amdgcn_sin: |
4449 | case Intrinsic::amdgcn_cos: |
4450 | case Intrinsic::amdgcn_log_clamp: |
4451 | case Intrinsic::amdgcn_rcp_legacy: |
4452 | case Intrinsic::amdgcn_rsq_legacy: |
4453 | case Intrinsic::amdgcn_rsq_clamp: |
4454 | case Intrinsic::amdgcn_fmul_legacy: |
4455 | case Intrinsic::amdgcn_fma_legacy: |
4456 | case Intrinsic::amdgcn_frexp_mant: |
4457 | case Intrinsic::amdgcn_frexp_exp: |
4458 | case Intrinsic::amdgcn_fract: |
4459 | case Intrinsic::amdgcn_cvt_pknorm_i16: |
4460 | case Intrinsic::amdgcn_cvt_pknorm_u16: |
4461 | case Intrinsic::amdgcn_cvt_pk_i16: |
4462 | case Intrinsic::amdgcn_cvt_pk_u16: |
4463 | case Intrinsic::amdgcn_fmed3: |
4464 | case Intrinsic::amdgcn_cubeid: |
4465 | case Intrinsic::amdgcn_cubema: |
4466 | case Intrinsic::amdgcn_cubesc: |
4467 | case Intrinsic::amdgcn_cubetc: |
4468 | case Intrinsic::amdgcn_sffbh: |
4469 | case Intrinsic::amdgcn_fmad_ftz: |
4470 | case Intrinsic::amdgcn_mbcnt_lo: |
4471 | case Intrinsic::amdgcn_mbcnt_hi: |
4472 | case Intrinsic::amdgcn_mul_u24: |
4473 | case Intrinsic::amdgcn_mul_i24: |
4474 | case Intrinsic::amdgcn_mulhi_u24: |
4475 | case Intrinsic::amdgcn_mulhi_i24: |
4476 | case Intrinsic::amdgcn_lerp: |
4477 | case Intrinsic::amdgcn_sad_u8: |
4478 | case Intrinsic::amdgcn_msad_u8: |
4479 | case Intrinsic::amdgcn_sad_hi_u8: |
4480 | case Intrinsic::amdgcn_sad_u16: |
4481 | case Intrinsic::amdgcn_qsad_pk_u16_u8: |
4482 | case Intrinsic::amdgcn_mqsad_pk_u16_u8: |
4483 | case Intrinsic::amdgcn_mqsad_u32_u8: |
4484 | case Intrinsic::amdgcn_cvt_pk_u8_f32: |
4485 | case Intrinsic::amdgcn_alignbyte: |
4486 | case Intrinsic::amdgcn_perm: |
4487 | case Intrinsic::amdgcn_fdot2: |
4488 | case Intrinsic::amdgcn_sdot2: |
4489 | case Intrinsic::amdgcn_udot2: |
4490 | case Intrinsic::amdgcn_sdot4: |
4491 | case Intrinsic::amdgcn_udot4: |
4492 | case Intrinsic::amdgcn_sdot8: |
4493 | case Intrinsic::amdgcn_udot8: |
4494 | case Intrinsic::amdgcn_fdot2_bf16_bf16: |
4495 | case Intrinsic::amdgcn_fdot2_f16_f16: |
4496 | case Intrinsic::amdgcn_fdot2_f32_bf16: |
4497 | case Intrinsic::amdgcn_sudot4: |
4498 | case Intrinsic::amdgcn_sudot8: |
4499 | case Intrinsic::amdgcn_dot4_f32_fp8_bf8: |
4500 | case Intrinsic::amdgcn_dot4_f32_bf8_fp8: |
4501 | case Intrinsic::amdgcn_dot4_f32_fp8_fp8: |
4502 | case Intrinsic::amdgcn_dot4_f32_bf8_bf8: |
4503 | case Intrinsic::amdgcn_cvt_f32_fp8: |
4504 | case Intrinsic::amdgcn_cvt_f32_bf8: |
4505 | case Intrinsic::amdgcn_cvt_pk_f32_fp8: |
4506 | case Intrinsic::amdgcn_cvt_pk_f32_bf8: |
4507 | case Intrinsic::amdgcn_cvt_pk_fp8_f32: |
4508 | case Intrinsic::amdgcn_cvt_pk_bf8_f32: |
4509 | case Intrinsic::amdgcn_cvt_sr_fp8_f32: |
4510 | case Intrinsic::amdgcn_cvt_sr_bf8_f32: |
4511 | case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: |
4512 | case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: |
4513 | case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: |
4514 | case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied: |
4515 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: |
4516 | case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: |
4517 | case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: |
4518 | case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: |
4519 | case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: |
4520 | case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: |
4521 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: |
4522 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: |
4523 | case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: |
4524 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: |
4525 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: |
4526 | case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: |
4527 | case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: |
4528 | case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: |
4529 | case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: |
4530 | case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: |
4531 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: |
4532 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: |
4533 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: |
4534 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: |
4535 | return getDefaultMappingVOP(MI); |
4536 | case Intrinsic::amdgcn_log: |
4537 | case Intrinsic::amdgcn_exp2: |
4538 | case Intrinsic::amdgcn_rcp: |
4539 | case Intrinsic::amdgcn_rsq: |
4540 | case Intrinsic::amdgcn_sqrt: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4542 | if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && |
4543 | isSALUMapping(MI)) |
4544 | return getDefaultMappingSOP(MI); |
4545 | return getDefaultMappingVOP(MI); |
4546 | } |
4547 | case Intrinsic::amdgcn_sbfe: |
4548 | case Intrinsic::amdgcn_ubfe: |
4549 | if (isSALUMapping(MI)) |
4550 | return getDefaultMappingSOP(MI); |
4551 | return getDefaultMappingVOP(MI); |
4552 | case Intrinsic::amdgcn_ds_swizzle: |
4553 | case Intrinsic::amdgcn_ds_permute: |
4554 | case Intrinsic::amdgcn_ds_bpermute: |
4555 | case Intrinsic::amdgcn_update_dpp: |
4556 | case Intrinsic::amdgcn_mov_dpp8: |
4557 | case Intrinsic::amdgcn_mov_dpp: |
4558 | case Intrinsic::amdgcn_strict_wwm: |
4559 | case Intrinsic::amdgcn_wwm: |
4560 | case Intrinsic::amdgcn_strict_wqm: |
4561 | case Intrinsic::amdgcn_wqm: |
4562 | case Intrinsic::amdgcn_softwqm: |
4563 | case Intrinsic::amdgcn_set_inactive: |
4564 | case Intrinsic::amdgcn_set_inactive_chain_arg: |
4565 | case Intrinsic::amdgcn_permlane64: |
4566 | return getDefaultMappingAllVGPR(MI); |
4567 | case Intrinsic::amdgcn_cvt_pkrtz: |
4568 | if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI)) |
4569 | return getDefaultMappingSOP(MI); |
4570 | return getDefaultMappingVOP(MI); |
4571 | case Intrinsic::amdgcn_kernarg_segment_ptr: |
4572 | case Intrinsic::amdgcn_s_getpc: |
4573 | case Intrinsic::amdgcn_groupstaticsize: |
4574 | case Intrinsic::amdgcn_reloc_constant: |
4575 | case Intrinsic::returnaddress: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4577 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4578 | break; |
4579 | } |
4580 | case Intrinsic::amdgcn_wqm_vote: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4582 | OpdsMapping[0] = OpdsMapping[2] |
4583 | = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); |
4584 | break; |
4585 | } |
4586 | case Intrinsic::amdgcn_ps_live: { |
4587 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4588 | break; |
4589 | } |
4590 | case Intrinsic::amdgcn_div_scale: { |
unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4593 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); |
4594 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); |
4595 | |
unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4597 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
4598 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
4599 | break; |
4600 | } |
4601 | case Intrinsic::amdgcn_class: { |
Register Src0Reg = MI.getOperand(2).getReg();
Register Src1Reg = MI.getOperand(3).getReg();
unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4607 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); |
4608 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); |
4609 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); |
4610 | break; |
4611 | } |
4612 | case Intrinsic::amdgcn_icmp: |
4613 | case Intrinsic::amdgcn_fcmp: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
// This is not VCCRegBank because this is not used in boolean contexts.
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4618 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); |
4619 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); |
4620 | break; |
4621 | } |
4622 | case Intrinsic::amdgcn_readlane: { |
4623 | // This must be an SGPR, but accept a VGPR. |
Register IdxReg = MI.getOperand(3).getReg();
unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4628 | [[fallthrough]]; |
4629 | } |
4630 | case Intrinsic::amdgcn_readfirstlane: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4633 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); |
4634 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
4635 | break; |
4636 | } |
4637 | case Intrinsic::amdgcn_writelane: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
Register SrcReg = MI.getOperand(2).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
Register IdxReg = MI.getOperand(3).getReg();
unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4644 | unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); |
4645 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); |
4646 | |
4647 | // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted |
4648 | // to legalize. |
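// E.g. (illustrative) a VGPR source here is legalized roughly as
//   %sgpr_src:sgpr(s32) = V_READFIRSTLANE_B32 %vgpr_src:vgpr(s32)
// before the lane write is selected.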
OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4651 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
4652 | break; |
4653 | } |
4654 | case Intrinsic::amdgcn_if_break: { |
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4656 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4657 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4658 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4659 | break; |
4660 | } |
4661 | case Intrinsic::amdgcn_permlane16: |
4662 | case Intrinsic::amdgcn_permlanex16: { |
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4664 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4665 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4666 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4669 | break; |
4670 | } |
4671 | case Intrinsic::amdgcn_permlane16_var: |
4672 | case Intrinsic::amdgcn_permlanex16_var: { |
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4674 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4675 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4676 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4677 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4678 | break; |
4679 | } |
4680 | case Intrinsic::amdgcn_mfma_f32_4x4x1f32: |
4681 | case Intrinsic::amdgcn_mfma_f32_4x4x4f16: |
4682 | case Intrinsic::amdgcn_mfma_i32_4x4x4i8: |
4683 | case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: |
4684 | case Intrinsic::amdgcn_mfma_f32_16x16x1f32: |
4685 | case Intrinsic::amdgcn_mfma_f32_16x16x4f32: |
4686 | case Intrinsic::amdgcn_mfma_f32_16x16x4f16: |
4687 | case Intrinsic::amdgcn_mfma_f32_16x16x16f16: |
4688 | case Intrinsic::amdgcn_mfma_i32_16x16x4i8: |
4689 | case Intrinsic::amdgcn_mfma_i32_16x16x16i8: |
4690 | case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: |
4691 | case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: |
4692 | case Intrinsic::amdgcn_mfma_f32_32x32x1f32: |
4693 | case Intrinsic::amdgcn_mfma_f32_32x32x2f32: |
4694 | case Intrinsic::amdgcn_mfma_f32_32x32x4f16: |
4695 | case Intrinsic::amdgcn_mfma_f32_32x32x8f16: |
4696 | case Intrinsic::amdgcn_mfma_i32_32x32x4i8: |
4697 | case Intrinsic::amdgcn_mfma_i32_32x32x8i8: |
4698 | case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: |
4699 | case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: |
4700 | case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: |
4701 | case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: |
4702 | case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: |
4703 | case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: |
4704 | case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: |
4705 | case Intrinsic::amdgcn_mfma_f64_16x16x4f64: |
4706 | case Intrinsic::amdgcn_mfma_f64_4x4x4f64: |
4707 | case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: |
4708 | case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: |
4709 | case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: |
4710 | case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: |
4711 | case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8: |
4712 | case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8: |
4713 | case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8: |
4714 | case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8: |
4715 | case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: |
4716 | case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: |
4717 | case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: |
4718 | case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: { |
4719 | // Default for MAI intrinsics. |
4720 | // srcC can also be an immediate which can be folded later. |
4721 | // FIXME: Should we eventually add an alternative mapping with AGPR src |
4722 | // for srcA/srcB? |
4723 | // |
4724 | // vdst, srcA, srcB, srcC |
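// Illustrative trade-off: when mayNeedAGPRs() is false, keeping vdst/srcC
// in VGPRs avoids AGPR<->VGPR copies around the MFMA; otherwise the
// accumulator operands must be placed in the AGPR bank.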
4725 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
OpdsMapping[0] =
Info->mayNeedAGPRs()
? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
: getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] =
Info->mayNeedAGPRs()
? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
: getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4736 | break; |
4737 | } |
4738 | case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: |
4739 | case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: |
4740 | case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: |
4741 | case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: |
4742 | case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: |
4743 | case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: |
4744 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: |
4745 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: |
4746 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: |
4747 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: |
4748 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: |
4749 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: |
4750 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: |
4751 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: { |
4752 | // vdst, srcA, srcB, srcC, idx |
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4758 | break; |
4759 | } |
4760 | case Intrinsic::amdgcn_interp_p1: |
4761 | case Intrinsic::amdgcn_interp_p2: |
4762 | case Intrinsic::amdgcn_interp_mov: |
4763 | case Intrinsic::amdgcn_interp_p1_f16: |
4764 | case Intrinsic::amdgcn_interp_p2_f16: |
4765 | case Intrinsic::amdgcn_lds_param_load: { |
4766 | const int M0Idx = MI.getNumOperands() - 1; |
Register M0Reg = MI.getOperand(M0Idx).getReg();
unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4770 | |
4771 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); |
4772 | for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) |
4773 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4774 | |
4775 | // Must be SGPR, but we must take whatever the original bank is and fix it |
4776 | // later. |
4777 | OpdsMapping[M0Idx] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
4778 | break; |
4779 | } |
4780 | case Intrinsic::amdgcn_interp_inreg_p10: |
4781 | case Intrinsic::amdgcn_interp_inreg_p2: |
4782 | case Intrinsic::amdgcn_interp_inreg_p10_f16: |
4783 | case Intrinsic::amdgcn_interp_inreg_p2_f16: |
4784 | case Intrinsic::amdgcn_interp_p10_rtz_f16: |
4785 | case Intrinsic::amdgcn_interp_p2_rtz_f16: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4787 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); |
4788 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4789 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4790 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4791 | break; |
4792 | } |
4793 | case Intrinsic::amdgcn_ballot: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4796 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); |
4797 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); |
4798 | break; |
4799 | } |
4800 | case Intrinsic::amdgcn_inverse_ballot: { |
4801 | // This must be an SGPR, but accept a VGPR. |
Register MaskReg = MI.getOperand(2).getReg();
unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4807 | break; |
4808 | } |
4809 | case Intrinsic::amdgcn_s_quadmask: |
4810 | case Intrinsic::amdgcn_s_wqm: { |
Register MaskReg = MI.getOperand(2).getReg();
unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4816 | break; |
4817 | } |
4818 | case Intrinsic::amdgcn_wave_reduce_umin: |
4819 | case Intrinsic::amdgcn_wave_reduce_umax: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
auto regBankID =
isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
4826 | break; |
4827 | } |
4828 | case Intrinsic::amdgcn_s_bitreplicate: |
Register MaskReg = MI.getOperand(2).getReg();
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
4833 | } |
4834 | break; |
4835 | } |
4836 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: |
4837 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: |
4838 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: |
4839 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { |
auto IntrID = AMDGPU::getIntrinsicID(MI);
const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4843 | // Non-images can have complications from operands that allow both SGPR |
4844 | // and VGPR. For now it's too complicated to figure out the final opcode |
4845 | // to derive the register bank from the MCInstrDesc. |
4846 | assert(RSrcIntrin->IsImage); |
return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4848 | } |
4849 | case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { |
4850 | unsigned N = MI.getNumExplicitOperands() - 2; |
4851 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); |
OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4853 | if (N == 3) { |
4854 | // Sequential form: all operands combined into VGPR256/VGPR512 |
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4856 | if (Size > 256) |
4857 | Size = 512; |
4858 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4859 | } else { |
4860 | // NSA form |
4861 | for (unsigned I = 2; I < N; ++I) { |
unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
4863 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4864 | } |
4865 | } |
4866 | break; |
4867 | } |
4868 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
4869 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { |
auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
4871 | switch (IntrID) { |
4872 | case Intrinsic::amdgcn_s_getreg: |
4873 | case Intrinsic::amdgcn_s_memtime: |
4874 | case Intrinsic::amdgcn_s_memrealtime: |
4875 | case Intrinsic::amdgcn_s_get_waveid_in_workgroup: |
4876 | case Intrinsic::amdgcn_s_sendmsg_rtn: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4878 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4879 | break; |
4880 | } |
4881 | case Intrinsic::amdgcn_global_atomic_fadd: |
4882 | case Intrinsic::amdgcn_global_atomic_csub: |
4883 | case Intrinsic::amdgcn_global_atomic_fmin: |
4884 | case Intrinsic::amdgcn_global_atomic_fmax: |
4885 | case Intrinsic::amdgcn_global_atomic_fmin_num: |
4886 | case Intrinsic::amdgcn_global_atomic_fmax_num: |
4887 | case Intrinsic::amdgcn_flat_atomic_fadd: |
4888 | case Intrinsic::amdgcn_flat_atomic_fmin: |
4889 | case Intrinsic::amdgcn_flat_atomic_fmax: |
4890 | case Intrinsic::amdgcn_flat_atomic_fmin_num: |
4891 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
4892 | case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: |
4893 | case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: |
4894 | case Intrinsic::amdgcn_atomic_cond_sub_u32: |
4895 | case Intrinsic::amdgcn_global_atomic_ordered_add_b64: |
4896 | case Intrinsic::amdgcn_global_load_tr_b64: |
4897 | case Intrinsic::amdgcn_global_load_tr_b128: |
4898 | return getDefaultMappingAllVGPR(MI); |
4899 | case Intrinsic::amdgcn_ds_ordered_add: |
4900 | case Intrinsic::amdgcn_ds_ordered_swap: |
4901 | case Intrinsic::amdgcn_ds_fadd_v2bf16: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
AMDGPU::SGPRRegBankID);
OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4907 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4908 | break; |
4909 | } |
4910 | case Intrinsic::amdgcn_ds_append: |
4911 | case Intrinsic::amdgcn_ds_consume: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4915 | break; |
4916 | } |
4917 | case Intrinsic::amdgcn_exp_compr: |
4918 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4919 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4920 | break; |
4921 | case Intrinsic::amdgcn_exp: |
4922 | // FIXME: Could we support packed types here? |
4923 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4924 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4925 | OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4926 | OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4927 | break; |
4928 | case Intrinsic::amdgcn_exp_row: |
4929 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4930 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4931 | OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4932 | OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4934 | break; |
4935 | case Intrinsic::amdgcn_s_sendmsg: |
4936 | case Intrinsic::amdgcn_s_sendmsghalt: { |
4937 | // This must be an SGPR, but accept a VGPR. |
4938 | unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, |
4939 | AMDGPU::SGPRRegBankID); |
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4941 | break; |
4942 | } |
4943 | case Intrinsic::amdgcn_s_setreg: { |
4944 | // This must be an SGPR, but accept a VGPR. |
4945 | unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, |
4946 | AMDGPU::SGPRRegBankID); |
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4948 | break; |
4949 | } |
4950 | case Intrinsic::amdgcn_s_ttracedata: { |
4951 | // This must be an SGPR, but accept a VGPR. |
4952 | unsigned Bank = |
4953 | getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID); |
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4955 | break; |
4956 | } |
4957 | case Intrinsic::amdgcn_end_cf: { |
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4959 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4960 | break; |
4961 | } |
4962 | case Intrinsic::amdgcn_else: { |
unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4964 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4965 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); |
4966 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); |
4967 | break; |
4968 | } |
4969 | case Intrinsic::amdgcn_live_mask: { |
4970 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4971 | break; |
4972 | } |
4973 | case Intrinsic::amdgcn_wqm_demote: |
4974 | case Intrinsic::amdgcn_kill: { |
4975 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4976 | break; |
4977 | } |
4978 | case Intrinsic::amdgcn_raw_buffer_load: |
4979 | case Intrinsic::amdgcn_raw_ptr_buffer_load: |
4980 | case Intrinsic::amdgcn_raw_tbuffer_load: |
4981 | case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { |
4982 | // FIXME: Should make intrinsic ID the last operand of the instruction, |
4983 | // then this would be the same as store |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4988 | break; |
4989 | } |
4990 | case Intrinsic::amdgcn_raw_buffer_load_lds: |
4991 | case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { |
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4996 | break; |
4997 | } |
4998 | case Intrinsic::amdgcn_raw_buffer_store: |
4999 | case Intrinsic::amdgcn_raw_ptr_buffer_store: |
5000 | case Intrinsic::amdgcn_raw_buffer_store_format: |
5001 | case Intrinsic::amdgcn_raw_ptr_buffer_store_format: |
5002 | case Intrinsic::amdgcn_raw_tbuffer_store: |
5003 | case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { |
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5008 | break; |
5009 | } |
5010 | case Intrinsic::amdgcn_struct_buffer_load: |
5011 | case Intrinsic::amdgcn_struct_ptr_buffer_load: |
5012 | case Intrinsic::amdgcn_struct_tbuffer_load: |
5013 | case Intrinsic::amdgcn_struct_ptr_tbuffer_load: { |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5019 | break; |
5020 | } |
5021 | case Intrinsic::amdgcn_struct_buffer_load_lds: |
5022 | case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { |
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
5028 | break; |
5029 | } |
5030 | case Intrinsic::amdgcn_struct_buffer_store: |
5031 | case Intrinsic::amdgcn_struct_ptr_buffer_store: |
5032 | case Intrinsic::amdgcn_struct_tbuffer_store: |
5033 | case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { |
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5039 | break; |
5040 | } |
5041 | case Intrinsic::amdgcn_init_exec_from_input: { |
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5043 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
5044 | break; |
5045 | } |
5046 | case Intrinsic::amdgcn_ds_gws_init: |
5047 | case Intrinsic::amdgcn_ds_gws_barrier: |
5048 | case Intrinsic::amdgcn_ds_gws_sema_br: { |
5049 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
5050 | |
5051 | // This must be an SGPR, but accept a VGPR. |
5052 | unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, |
5053 | AMDGPU::SGPRRegBankID); |
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5055 | break; |
5056 | } |
5057 | case Intrinsic::amdgcn_ds_gws_sema_v: |
5058 | case Intrinsic::amdgcn_ds_gws_sema_p: |
5059 | case Intrinsic::amdgcn_ds_gws_sema_release_all: { |
5060 | // This must be an SGPR, but accept a VGPR. |
5061 | unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, |
5062 | AMDGPU::SGPRRegBankID); |
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
5064 | break; |
5065 | } |
5066 | case Intrinsic::amdgcn_global_load_lds: { |
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5069 | break; |
5070 | } |
5071 | case Intrinsic::amdgcn_lds_direct_load: { |
5072 | const int M0Idx = MI.getNumOperands() - 1; |
Register M0Reg = MI.getOperand(M0Idx).getReg();
unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5076 | |
5077 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); |
5078 | for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) |
5079 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
5080 | |
5081 | // Must be SGPR, but we must take whatever the original bank is and fix it |
5082 | // later. |
OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
5084 | break; |
5085 | } |
5086 | case Intrinsic::amdgcn_ds_add_gs_reg_rtn: |
5087 | case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5090 | break; |
5091 | case Intrinsic::amdgcn_ds_bvh_stack_rtn: { |
OpdsMapping[0] =
getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
OpdsMapping[1] =
getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
OpdsMapping[3] =
getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
OpdsMapping[4] =
getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
OpdsMapping[5] =
getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
5102 | break; |
5103 | } |
5104 | case Intrinsic::amdgcn_s_sleep_var: |
5105 | OpdsMapping[1] = getSGPROpMapping(MI.getOperand(i: 1).getReg(), MRI, *TRI); |
5106 | break; |
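    // Named barrier operands (the barrier ID, and the member count for
    // s_barrier_init) must be uniform, so map them to SGPRs.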
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_wakeup_barrier:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_init:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
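    // The "is first" and "leave" results are uniform 1-bit values, so
    // report them in the SGPR bank, consistent with the SALU boolean rules.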
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_leave: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      break;
    }
    case Intrinsic::amdgcn_s_get_barrier_state: {
      OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
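  // For G_SELECT, the condition bank depends on the value operands: an
  // all-SGPR select can use a scalar condition, while any VGPR involvement
  // forces a VCC condition and VGPR results, e.g.
  //   %d:vgpr(s32) = G_SELECT %c:vcc(s1), %a:vgpr(s32), %b:vgpr(s32)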
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

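  // Operand 0 of G_SI_CALL defs the 64-bit SGPR pair holding the return
  // address, and the callee pointer must also end up in SGPRs.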
  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments.
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

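  // Atomics produce and consume their values in VGPRs. The pointer keeps
  // its original bank: getValueMappingForPtr only preserves an SGPR base
  // where the addressing mode supports one.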
  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMICRMW_UINC_WRAP:
  case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
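  // A branch condition is either a scalar condition (SGPR bank, read from
  // SCC) or a per-lane VCC mask; a plain VGPR boolean is normalized to VCC.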
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
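  // Prefetches are scalar instructions, so the address must be an SGPR.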
  case AMDGPU::G_PREFETCH:
    OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    break;
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
