//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks,
/// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
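///
/// As a rough, illustrative MIR sketch (register names made up), a uniform
/// compare of two SGPR values produces a widened 32-bit SGPR boolean after
/// regbank selection, while a divergent compare of VGPR values keeps an s1
/// result in the VCC bank:
///
///   %ucmp:sgpr(s32) = G_ICMP intpred(eq), %a(s32), %b
///   %dcmp:vcc(s1) = G_ICMP intpred(eq), %x(s32), %y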
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
/// instructions). This limit is on unique SGPRs, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
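///
/// As an illustrative example (assuming a target with a single constant bus
/// read per instruction), "v_add_f32 v0, s0, s1" would be rejected because it
/// reads two different SGPRs, while "v_add_f32 v0, s0, s0" is fine because
/// only one unique SGPR is read.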
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
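///
/// A rough sketch of this mapping for a divergent G_ADD with one uniform
/// input, where %s is an SGPR value and %v is a VGPR value (names are
/// illustrative):
///
///   %copy:vgpr(s32) = COPY %s(s32)
///   %sum:vgpr(s32) = G_ADD %copy, %v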
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  MachineIRBuilder &B;
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
      : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
    assert(!B.isObservingChanges());
    B.setChangeObserver(*this);
  }

  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);

    B.stopObservingChanges();
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};
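
// A minimal usage sketch, mirroring how this file uses the observer later on
// (e.g. in applyMappingLoad): construct it for the bank that new registers
// should be assigned to, then run a LegalizerHelper with it so every register
// the helper creates lands in that bank. NarrowTy here stands for whatever
// narrower type the caller wants.
//
//   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
//   LegalizerHelper Helper(B.getMF(), ApplyBank, B);
//   if (Helper.narrowScalar(MI, 0, NarrowTy) != LegalizerHelper::Legalized)
//     return false;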

} // anonymous namespace

204AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
205 : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
206 TII(Subtarget.getInstrInfo()) {
207
208 // HACK: Until this is fully tablegen'd.
209 static llvm::once_flag InitializeRegisterBankFlag;
210
211 static auto InitializeRegisterBankOnce = [this]() {
212 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
213 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
214 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
215 (void)this;
216 };
217
218 llvm::call_once(flag&: InitializeRegisterBankFlag, F&: InitializeRegisterBankOnce);
219}
220
221static bool isVectorRegisterBank(const RegisterBank &Bank) {
222 unsigned BankID = Bank.getID();
223 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
224}
225
226bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
227 return RB != &AMDGPU::SGPRRegBank;
228}
229
230unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
231 const RegisterBank &Src,
232 TypeSize Size) const {
233 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
234 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
235 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
236 return std::numeric_limits<unsigned>::max();
237 }
238
239 // Bool values are tricky, because the meaning is based on context. The SCC
240 // and VCC banks are for the natural scalar and vector conditions produced by
241 // a compare.
242 //
243 // Legalization doesn't know about the necessary context, so an s1 use may
244 // have been a truncate from an arbitrary value, in which case a copy (lowered
245 // as a compare with 0) needs to be inserted.
246 if (Size == 1 &&
247 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
248 (isVectorRegisterBank(Src) ||
249 Src.getID() == AMDGPU::SGPRRegBankID ||
250 Src.getID() == AMDGPU::VCCRegBankID))
251 return std::numeric_limits<unsigned>::max();
252
253 // There is no direct copy between AGPRs.
254 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
255 Src.getID() == AMDGPU::AGPRRegBankID)
256 return 4;
257
258 return RegisterBankInfo::copyCost(A: Dst, B: Src, Size);
259}
260
261unsigned AMDGPURegisterBankInfo::getBreakDownCost(
262 const ValueMapping &ValMapping,
263 const RegisterBank *CurBank) const {
264 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
265 // VGPR.
266 // FIXME: Is there a better way to do this?
267 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
268 return 10; // This is expensive.
269
270 assert(ValMapping.NumBreakDowns == 2 &&
271 ValMapping.BreakDown[0].Length == 32 &&
272 ValMapping.BreakDown[0].StartIdx == 0 &&
273 ValMapping.BreakDown[1].Length == 32 &&
274 ValMapping.BreakDown[1].StartIdx == 32 &&
275 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
276
277 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
278 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
279 // want.
280
281 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
282 // alignment restrictions, but this probably isn't important.
283 return 1;
284}
285
286const RegisterBank &
287AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
288 LLT Ty) const {
289 if (&RC == &AMDGPU::SReg_1RegClass)
290 return AMDGPU::VCCRegBank;
291
292 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
293 // VCC-like use.
294 if (TRI->isSGPRClass(RC: &RC)) {
295 // FIXME: This probably came from a copy from a physical register, which
296 // should be inferable from the copied to-type. We don't have many boolean
297 // physical register constraints so just assume a normal SGPR for now.
298 if (!Ty.isValid())
299 return AMDGPU::SGPRRegBank;
300
301 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
302 }
303
304 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
305}
306
307template <unsigned NumOps>
308RegisterBankInfo::InstructionMappings
309AMDGPURegisterBankInfo::addMappingFromTable(
310 const MachineInstr &MI, const MachineRegisterInfo &MRI,
311 const std::array<unsigned, NumOps> RegSrcOpIdx,
312 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
313
314 InstructionMappings AltMappings;
315
316 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
317
318 unsigned Sizes[NumOps];
319 for (unsigned I = 0; I < NumOps; ++I) {
320 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
321 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
322 }
323
324 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
325 unsigned SizeI = getSizeInBits(MI.getOperand(i: I).getReg(), MRI, *TRI);
326 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
327 }
328
329 // getInstrMapping's default mapping uses ID 1, so start at 2.
330 unsigned MappingID = 2;
331 for (const auto &Entry : Table) {
332 for (unsigned I = 0; I < NumOps; ++I) {
333 int OpIdx = RegSrcOpIdx[I];
334 Operands[OpIdx] = AMDGPU::getValueMapping(BankID: Entry.RegBanks[I], Size: Sizes[I]);
335 }
336
337 AltMappings.push_back(Elt: &getInstructionMapping(ID: MappingID++, Cost: Entry.Cost,
338 OperandsMapping: getOperandsMapping(OpdsMapping: Operands),
339 NumOperands: Operands.size()));
340 }
341
342 return AltMappings;
343}
344
345RegisterBankInfo::InstructionMappings
346AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
347 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
348 switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) {
349 case Intrinsic::amdgcn_readlane: {
350 static const OpRegBankEntry<3> Table[2] = {
351 // Perfectly legal.
352 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
353
354 // Need a readfirstlane for the index.
355 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
356 };
357
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
359 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
360 }
361 case Intrinsic::amdgcn_writelane: {
362 static const OpRegBankEntry<4> Table[4] = {
363 // Perfectly legal.
364 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
365
366 // Need readfirstlane of first op
367 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
368
369 // Need readfirstlane of second op
370 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
371
372 // Need readfirstlane of both ops
373 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
374 };
375
    // dst, data, lane select, vdst_in
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
378 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
379 }
380 default:
381 return RegisterBankInfo::getInstrAlternativeMappings(MI);
382 }
383}
384
385RegisterBankInfo::InstructionMappings
386AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
387 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
388
389 switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) {
390 case Intrinsic::amdgcn_s_buffer_load: {
391 static const OpRegBankEntry<2> Table[4] = {
392 // Perfectly legal.
393 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
394
395 // Only need 1 register in loop
396 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
397
398 // Have to waterfall the resource.
399 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
400
401 // Have to waterfall the resource, and the offset.
402 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
403 };
404
405 // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
407 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
408 }
409 case Intrinsic::amdgcn_ds_ordered_add:
410 case Intrinsic::amdgcn_ds_ordered_swap: {
411 // VGPR = M0, VGPR
412 static const OpRegBankEntry<3> Table[2] = {
413 // Perfectly legal.
414 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
415
416 // Need a readfirstlane for m0
417 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
418 };
419
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
421 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
422 }
423 case Intrinsic::amdgcn_s_sendmsg:
424 case Intrinsic::amdgcn_s_sendmsghalt: {
425 // FIXME: Should have no register for immediate
426 static const OpRegBankEntry<1> Table[2] = {
427 // Perfectly legal.
428 { { AMDGPU::SGPRRegBankID }, 1 },
429
430 // Need readlane
431 { { AMDGPU::VGPRRegBankID }, 3 }
432 };
433
    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
435 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
436 }
437 default:
438 return RegisterBankInfo::getInstrAlternativeMappings(MI);
439 }
440}
441
442// FIXME: Returns uniform if there's no source value information. This is
443// probably wrong.
444bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
445 if (!MI.hasOneMemOperand())
446 return false;
447
448 const MachineMemOperand *MMO = *MI.memoperands_begin();
449 const unsigned AS = MMO->getAddrSpace();
450 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
451 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
452 const unsigned MemSize = 8 * MMO->getSize().getValue();
453
454 // Require 4-byte alignment.
455 return (MMO->getAlign() >= Align(4) ||
456 (Subtarget.hasScalarSubwordLoads() &&
457 ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
458 (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
459 // Can't do a scalar atomic load.
460 !MMO->isAtomic() &&
461 // Don't use scalar loads for volatile accesses to non-constant address
462 // spaces.
463 (IsConst || !MMO->isVolatile()) &&
464 // Memory must be known constant, or not written before this load.
465 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
466 AMDGPUInstrInfo::isUniformMMO(MMO);
467}
468
469RegisterBankInfo::InstructionMappings
470AMDGPURegisterBankInfo::getInstrAlternativeMappings(
471 const MachineInstr &MI) const {
472
473 const MachineFunction &MF = *MI.getParent()->getParent();
474 const MachineRegisterInfo &MRI = MF.getRegInfo();
475
476
477 InstructionMappings AltMappings;
478 switch (MI.getOpcode()) {
479 case TargetOpcode::G_CONSTANT:
480 case TargetOpcode::G_IMPLICIT_DEF: {
481 unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI);
482 if (Size == 1) {
483 static const OpRegBankEntry<1> Table[3] = {
484 { { AMDGPU::VGPRRegBankID }, 1 },
485 { { AMDGPU::SGPRRegBankID }, 1 },
486 { { AMDGPU::VCCRegBankID }, 1 }
487 };
488
      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
490 }
491
492 [[fallthrough]];
493 }
494 case TargetOpcode::G_FCONSTANT:
495 case TargetOpcode::G_FRAME_INDEX:
496 case TargetOpcode::G_GLOBAL_VALUE: {
497 static const OpRegBankEntry<1> Table[2] = {
498 { { AMDGPU::VGPRRegBankID }, 1 },
499 { { AMDGPU::SGPRRegBankID }, 1 }
500 };
501
    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
503 }
504 case TargetOpcode::G_AND:
505 case TargetOpcode::G_OR:
506 case TargetOpcode::G_XOR: {
507 unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI);
508
509 if (Size == 1) {
510 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
511 const InstructionMapping &SCCMapping = getInstructionMapping(
512 1, 1, getOperandsMapping(
513 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
514 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
515 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
516 3); // Num Operands
517 AltMappings.push_back(Elt: &SCCMapping);
518
519 const InstructionMapping &VCCMapping0 = getInstructionMapping(
520 2, 1, getOperandsMapping(
521 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
522 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
523 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
524 3); // Num Operands
525 AltMappings.push_back(Elt: &VCCMapping0);
526 return AltMappings;
527 }
528
529 if (Size != 64)
530 break;
531
532 const InstructionMapping &SSMapping = getInstructionMapping(
533 1, 1, getOperandsMapping(
534 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
535 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
536 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
537 3); // Num Operands
538 AltMappings.push_back(Elt: &SSMapping);
539
540 const InstructionMapping &VVMapping = getInstructionMapping(
541 2, 2, getOperandsMapping(
542 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
543 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
544 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
545 3); // Num Operands
546 AltMappings.push_back(Elt: &VVMapping);
547 break;
548 }
549 case TargetOpcode::G_LOAD:
550 case TargetOpcode::G_ZEXTLOAD:
551 case TargetOpcode::G_SEXTLOAD: {
552 unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI);
553 LLT PtrTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
554 unsigned PtrSize = PtrTy.getSizeInBits();
555 unsigned AS = PtrTy.getAddressSpace();
556
557 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
558 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
559 isScalarLoadLegal(MI)) {
560 const InstructionMapping &SSMapping = getInstructionMapping(
561 1, 1, getOperandsMapping(
562 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
563 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
564 2); // Num Operands
565 AltMappings.push_back(Elt: &SSMapping);
566 }
567
568 const InstructionMapping &VVMapping = getInstructionMapping(
569 2, 1,
570 getOperandsMapping(
571 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
572 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
573 2); // Num Operands
574 AltMappings.push_back(Elt: &VVMapping);
575
576 // It may be possible to have a vgpr = load sgpr mapping here, because
577 // the mubuf instructions support this kind of load, but probably for only
578 // gfx7 and older. However, the addressing mode matching in the instruction
579 // selector should be able to do a better job of detecting and selecting
580 // these kinds of loads from the vgpr = load vgpr mapping.
581
582 return AltMappings;
583
584 }
585 case TargetOpcode::G_SELECT: {
586 unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI);
587 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
588 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
589 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
590 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
591 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
592 4); // Num Operands
593 AltMappings.push_back(Elt: &SSMapping);
594
595 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
596 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
597 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
598 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
599 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
600 4); // Num Operands
601 AltMappings.push_back(Elt: &VVMapping);
602
603 return AltMappings;
604 }
605 case TargetOpcode::G_UADDE:
606 case TargetOpcode::G_USUBE:
607 case TargetOpcode::G_SADDE:
608 case TargetOpcode::G_SSUBE: {
609 unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI);
610 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
611 getOperandsMapping(
612 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
613 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
614 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
615 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
616 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
617 5); // Num Operands
618 AltMappings.push_back(Elt: &SSMapping);
619
620 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
621 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
622 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
623 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
624 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
625 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
626 5); // Num Operands
627 AltMappings.push_back(Elt: &VVMapping);
628 return AltMappings;
629 }
630 case AMDGPU::G_BRCOND: {
631 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
632
633 // TODO: Change type to 32 for scalar
634 const InstructionMapping &SMapping = getInstructionMapping(
635 1, 1, getOperandsMapping(
636 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
637 2); // Num Operands
638 AltMappings.push_back(Elt: &SMapping);
639
640 const InstructionMapping &VMapping = getInstructionMapping(
641 1, 1, getOperandsMapping(
642 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
643 2); // Num Operands
644 AltMappings.push_back(Elt: &VMapping);
645 return AltMappings;
646 }
647 case AMDGPU::G_INTRINSIC:
648 case AMDGPU::G_INTRINSIC_CONVERGENT:
649 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
650 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
651 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
652 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
653 default:
654 break;
655 }
656 return RegisterBankInfo::getInstrAlternativeMappings(MI);
657}
658
659void AMDGPURegisterBankInfo::split64BitValueForMapping(
660 MachineIRBuilder &B,
661 SmallVector<Register, 2> &Regs,
662 LLT HalfTy,
663 Register Reg) const {
664 assert(HalfTy.getSizeInBits() == 32);
665 MachineRegisterInfo *MRI = B.getMRI();
666 Register LoLHS = MRI->createGenericVirtualRegister(Ty: HalfTy);
667 Register HiLHS = MRI->createGenericVirtualRegister(Ty: HalfTy);
668 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
669 MRI->setRegBank(Reg: LoLHS, RegBank: *Bank);
670 MRI->setRegBank(Reg: HiLHS, RegBank: *Bank);
671
672 Regs.push_back(Elt: LoLHS);
673 Regs.push_back(Elt: HiLHS);
674
675 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
676 .addDef(LoLHS)
677 .addDef(HiLHS)
678 .addUse(Reg);
679}
680
681/// Replace the current type each register in \p Regs has with \p NewTy
682static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
683 LLT NewTy) {
684 for (Register Reg : Regs) {
685 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
686 MRI.setType(VReg: Reg, Ty: NewTy);
687 }
688}
689
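// Return a type half the size of Ty: vectors are split by element count and
// scalars by bit width, e.g. s64 -> s32 and <4 x s32> -> <2 x s32>.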
690static LLT getHalfSizedType(LLT Ty) {
691 if (Ty.isVector()) {
692 assert(Ty.getElementCount().isKnownMultipleOf(2));
693 return LLT::scalarOrVector(EC: Ty.getElementCount().divideCoefficientBy(RHS: 2),
694 ScalarTy: Ty.getElementType());
695 }
696
697 assert(Ty.getScalarSizeInBits() % 2 == 0);
698 return LLT::scalar(SizeInBits: Ty.getScalarSizeInBits() / 2);
699}
700
701// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
702// source value into a scalar register.
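// For example, a 64-bit VGPR source is unmerged into two 32-bit pieces, each
// piece gets its own V_READFIRSTLANE_B32, and the results are merged back into
// a 64-bit SGPR value. A source that is already in the SGPR bank is returned
// unchanged.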
703Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
704 MachineRegisterInfo &MRI,
705 Register Src) const {
706 LLT Ty = MRI.getType(Reg: Src);
707 const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
708
709 if (Bank == &AMDGPU::SGPRRegBank)
710 return Src;
711
712 unsigned Bits = Ty.getSizeInBits();
713 assert(Bits % 32 == 0);
714
715 if (Bank != &AMDGPU::VGPRRegBank) {
716 // We need to copy from AGPR to VGPR
717 Src = B.buildCopy(Res: Ty, Op: Src).getReg(Idx: 0);
718 MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
719 }
720
721 LLT S32 = LLT::scalar(SizeInBits: 32);
722 unsigned NumParts = Bits / 32;
723 SmallVector<Register, 8> SrcParts;
724 SmallVector<Register, 8> DstParts;
725
726 if (Bits == 32) {
727 SrcParts.push_back(Elt: Src);
728 } else {
729 auto Unmerge = B.buildUnmerge(Res: S32, Op: Src);
730 for (unsigned i = 0; i < NumParts; ++i)
731 SrcParts.push_back(Elt: Unmerge.getReg(Idx: i));
732 }
733
734 for (unsigned i = 0; i < NumParts; ++i) {
735 Register SrcPart = SrcParts[i];
736 Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
737 MRI.setType(VReg: DstPart, Ty: NumParts == 1 ? Ty : S32);
738
739 const TargetRegisterClass *Constrained =
740 constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
741 (void)Constrained;
742 assert(Constrained && "Failed to constrain readfirstlane src reg");
743
744 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
745
746 DstParts.push_back(Elt: DstPart);
747 }
748
749 if (Bits == 32)
750 return DstParts[0];
751
752 Register Dst = B.buildMergeLikeInstr(Res: Ty, Ops: DstParts).getReg(Idx: 0);
753 MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
754 return Dst;
755}
756
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity from comparing the operand values to
/// identify the unique values used.
775bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
776 MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
777 SmallSet<Register, 4> &SGPROperandRegs) const {
778 // Track use registers which have already been expanded with a readfirstlane
779 // sequence. This may have multiple uses if moving a sequence.
780 DenseMap<Register, Register> WaterfalledRegMap;
781
782 MachineBasicBlock &MBB = B.getMBB();
783 MachineFunction *MF = &B.getMF();
784
785 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
786 const unsigned MovExecOpc =
787 Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
788 const unsigned MovExecTermOpc =
789 Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
790
791 const unsigned XorTermOpc = Subtarget.isWave32() ?
792 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
793 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
794 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
795 const unsigned ExecReg = Subtarget.isWave32() ?
796 AMDGPU::EXEC_LO : AMDGPU::EXEC;
797
798#ifndef NDEBUG
799 const int OrigRangeSize = std::distance(first: Range.begin(), last: Range.end());
800#endif
801
802 MachineRegisterInfo &MRI = *B.getMRI();
803 Register SaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
804 Register InitSaveExecReg = MRI.createVirtualRegister(RegClass: WaveRC);
805
806 // Don't bother using generic instructions/registers for the exec mask.
807 B.buildInstr(Opcode: TargetOpcode::IMPLICIT_DEF)
808 .addDef(RegNo: InitSaveExecReg);
809
810 Register PhiExec = MRI.createVirtualRegister(RegClass: WaveRC);
811 Register NewExec = MRI.createVirtualRegister(RegClass: WaveRC);
812
813 // To insert the loop we need to split the block. Move everything before this
814 // point to a new block, and insert a new empty block before this instruction.
815 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
816 MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
817 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
818 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
819 MachineFunction::iterator MBBI(MBB);
820 ++MBBI;
821 MF->insert(MBBI, MBB: LoopBB);
822 MF->insert(MBBI, MBB: BodyBB);
823 MF->insert(MBBI, MBB: RestoreExecBB);
824 MF->insert(MBBI, MBB: RemainderBB);
825
826 LoopBB->addSuccessor(Succ: BodyBB);
827 BodyBB->addSuccessor(Succ: RestoreExecBB);
828 BodyBB->addSuccessor(Succ: LoopBB);
829
830 // Move the rest of the block into a new block.
831 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
832 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Range.end(), To: MBB.end());
833
834 MBB.addSuccessor(Succ: LoopBB);
835 RestoreExecBB->addSuccessor(Succ: RemainderBB);
836
837 B.setInsertPt(MBB&: *LoopBB, II: LoopBB->end());
838
839 B.buildInstr(Opcode: TargetOpcode::PHI)
840 .addDef(RegNo: PhiExec)
841 .addReg(RegNo: InitSaveExecReg)
842 .addMBB(MBB: &MBB)
843 .addReg(RegNo: NewExec)
844 .addMBB(MBB: BodyBB);
845
846 const DebugLoc &DL = B.getDL();
847
848 MachineInstr &FirstInst = *Range.begin();
849
850 // Move the instruction into the loop body. Note we moved everything after
851 // Range.end() already into a new block, so Range.end() is no longer valid.
852 BodyBB->splice(Where: BodyBB->end(), Other: &MBB, From: Range.begin(), To: MBB.end());
853
854 // Figure out the iterator range after splicing the instructions.
855 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
856 auto NewEnd = BodyBB->end();
857
858 B.setMBB(*LoopBB);
859
860 LLT S1 = LLT::scalar(SizeInBits: 1);
861 Register CondReg;
862
863 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
864
865 for (MachineInstr &MI : make_range(x: NewBegin, y: NewEnd)) {
866 for (MachineOperand &Op : MI.all_uses()) {
867 Register OldReg = Op.getReg();
868 if (!SGPROperandRegs.count(V: OldReg))
869 continue;
870
871 // See if we already processed this register in another instruction in the
872 // sequence.
873 auto OldVal = WaterfalledRegMap.find(Val: OldReg);
874 if (OldVal != WaterfalledRegMap.end()) {
875 Op.setReg(OldVal->second);
876 continue;
877 }
878
879 Register OpReg = Op.getReg();
880 LLT OpTy = MRI.getType(Reg: OpReg);
881
882 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
883 if (OpBank != &AMDGPU::VGPRRegBank) {
884 // Insert copy from AGPR to VGPR before the loop.
885 B.setMBB(MBB);
886 OpReg = B.buildCopy(Res: OpTy, Op: OpReg).getReg(Idx: 0);
887 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
888 B.setMBB(*LoopBB);
889 }
890
891 Register CurrentLaneReg = buildReadFirstLane(B, MRI, Src: OpReg);
892
893 // Build the comparison(s).
894 unsigned OpSize = OpTy.getSizeInBits();
895 bool Is64 = OpSize % 64 == 0;
896 unsigned PartSize = Is64 ? 64 : 32;
897 LLT PartTy = LLT::scalar(SizeInBits: PartSize);
898 unsigned NumParts = OpSize / PartSize;
899 SmallVector<Register, 8> OpParts;
900 SmallVector<Register, 8> CurrentLaneParts;
901
902 if (NumParts == 1) {
903 OpParts.push_back(Elt: OpReg);
904 CurrentLaneParts.push_back(Elt: CurrentLaneReg);
905 } else {
906 auto UnmergeOp = B.buildUnmerge(Res: PartTy, Op: OpReg);
907 auto UnmergeCurrentLane = B.buildUnmerge(Res: PartTy, Op: CurrentLaneReg);
908 for (unsigned i = 0; i < NumParts; ++i) {
909 OpParts.push_back(Elt: UnmergeOp.getReg(Idx: i));
910 CurrentLaneParts.push_back(Elt: UnmergeCurrentLane.getReg(Idx: i));
911 MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
912 MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
913 }
914 }
915
916 for (unsigned i = 0; i < NumParts; ++i) {
917 auto CmpReg = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: CurrentLaneParts[i],
918 Op1: OpParts[i]).getReg(Idx: 0);
919 MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
920
921 if (!CondReg) {
922 CondReg = CmpReg;
923 } else {
924 CondReg = B.buildAnd(Dst: S1, Src0: CondReg, Src1: CmpReg).getReg(Idx: 0);
925 MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
926 }
927 }
928
929 Op.setReg(CurrentLaneReg);
930
931 // Make sure we don't re-process this register again.
932 WaterfalledRegMap.insert(KV: std::pair(OldReg, Op.getReg()));
933 }
934 }
935
936 // The ballot becomes a no-op during instruction selection.
937 CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
938 {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
939 .addReg(CondReg)
940 .getReg(0);
941 MRI.setRegClass(Reg: CondReg, RC: WaveRC);
942
943 // Update EXEC, save the original EXEC value to VCC.
944 B.buildInstr(Opcode: AndSaveExecOpc)
945 .addDef(RegNo: NewExec)
946 .addReg(RegNo: CondReg, flags: RegState::Kill);
947
948 MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);
949
950 B.setInsertPt(MBB&: *BodyBB, II: BodyBB->end());
951
952 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
953 B.buildInstr(Opcode: XorTermOpc)
954 .addDef(RegNo: ExecReg)
955 .addReg(RegNo: ExecReg)
956 .addReg(RegNo: NewExec);
957
958 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
959 // s_cbranch_scc0?
960
961 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
962 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
963
964 // Save the EXEC mask before the loop.
965 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
966 .addReg(ExecReg);
967
968 // Restore the EXEC mask after the loop.
969 B.setMBB(*RestoreExecBB);
970 B.buildInstr(Opcode: MovExecTermOpc)
971 .addDef(RegNo: ExecReg)
972 .addReg(RegNo: SaveExecReg);
973
974 // Set the insert point after the original instruction, so any new
975 // instructions will be in the remainder.
976 B.setInsertPt(MBB&: *RemainderBB, II: RemainderBB->begin());
977
978 return true;
979}
980
981// Return any unique registers used by \p MI at \p OpIndices that need to be
982// handled in a waterfall loop. Returns these registers in \p
983// SGPROperandRegs. Returns true if there are any operands to handle and a
984// waterfall loop is necessary.
985bool AMDGPURegisterBankInfo::collectWaterfallOperands(
986 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
987 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
988 for (unsigned Op : OpIndices) {
989 assert(MI.getOperand(Op).isUse());
990 Register Reg = MI.getOperand(i: Op).getReg();
991 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
992 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
993 SGPROperandRegs.insert(V: Reg);
994 }
995
996 // No operands need to be replaced, so no need to loop.
997 return !SGPROperandRegs.empty();
998}
999
1000bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1001 MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
1002 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1003 // are the same register.
1004 SmallSet<Register, 4> SGPROperandRegs;
1005
1006 if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
1007 return false;
1008
1009 MachineBasicBlock::iterator I = MI.getIterator();
1010 return executeInWaterfallLoop(B, make_range(x: I, y: std::next(x: I)),
1011 SGPROperandRegs);
1012}
1013
1014// Legalize an operand that must be an SGPR by inserting a readfirstlane.
1015void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1016 MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
1017 Register Reg = MI.getOperand(i: OpIdx).getReg();
1018 MachineRegisterInfo &MRI = *B.getMRI();
1019 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1020 if (Bank == &AMDGPU::SGPRRegBank)
1021 return;
1022
1023 Reg = buildReadFirstLane(B, MRI, Src: Reg);
1024 MI.getOperand(i: OpIdx).setReg(Reg);
1025}
1026
1027/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1028/// rest will be in the remainder.
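/// For example, splitting <3 x s32> with \p FirstSize = 64 yields
/// {<2 x s32>, s32}, and splitting s96 with \p FirstSize = 64 yields
/// {s64, s32}.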
1029static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1030 unsigned TotalSize = Ty.getSizeInBits();
1031 if (!Ty.isVector())
1032 return {LLT::scalar(SizeInBits: FirstSize), LLT::scalar(SizeInBits: TotalSize - FirstSize)};
1033
1034 LLT EltTy = Ty.getElementType();
1035 unsigned EltSize = EltTy.getSizeInBits();
1036 assert(FirstSize % EltSize == 0);
1037
1038 unsigned FirstPartNumElts = FirstSize / EltSize;
1039 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1040
1041 return {LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: FirstPartNumElts), ScalarTy: EltTy),
1042 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: RemainderElts), ScalarTy: EltTy)};
1043}
1044
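// Widen a 96-bit type to 128 bits, e.g. s96 -> s128 and <3 x s32> -> <4 x s32>.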
1045static LLT widen96To128(LLT Ty) {
1046 if (!Ty.isVector())
1047 return LLT::scalar(SizeInBits: 128);
1048
1049 LLT EltTy = Ty.getElementType();
1050 assert(128 % EltTy.getSizeInBits() == 0);
1051 return LLT::fixed_vector(NumElements: 128 / EltTy.getSizeInBits(), ScalarTy: EltTy);
1052}
1053
1054bool AMDGPURegisterBankInfo::applyMappingLoad(
1055 MachineIRBuilder &B,
1056 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1057 MachineInstr &MI) const {
1058 MachineRegisterInfo &MRI = *B.getMRI();
1059 Register DstReg = MI.getOperand(i: 0).getReg();
1060 const LLT LoadTy = MRI.getType(Reg: DstReg);
1061 unsigned LoadSize = LoadTy.getSizeInBits();
1062 const unsigned MaxNonSmrdLoadSize = 128;
1063
1064 const RegisterBank *DstBank =
1065 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
1066 if (DstBank == &AMDGPU::SGPRRegBank) {
1067 // There are some special cases that we need to look at for 32 bit and 96
1068 // bit SGPR loads otherwise we have nothing to do.
1069 if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
1070 return false;
1071
1072 MachineMemOperand *MMO = *MI.memoperands_begin();
1073 const unsigned MemSize = 8 * MMO->getSize().getValue();
    // Scalar loads of 8 or 16 bits with proper alignment may be widened to
    // 32 bits. Check whether we need to widen the memory access: 8 and 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
1078 if (LoadSize == 32 &&
1079 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1080 return false;
1081
1082 if (LoadSize == 32 &&
1083 ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
1084 (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
1085 isScalarLoadLegal(MI) &&
1086 Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
1087 return false;
1088
1089 Register PtrReg = MI.getOperand(i: 1).getReg();
1090
1091 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
1092
1093 if (LoadSize == 32) {
1094 // This is an extending load from a sub-dword size. Widen the memory
1095 // access size to 4 bytes and clear the extra high bits appropriately
1096 const LLT S32 = LLT::scalar(SizeInBits: 32);
1097 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1098 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1099 auto WideLoad = B.buildLoadFromOffset(Dst: S32, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1100 B.buildSExtInReg(Res: MI.getOperand(i: 0), Op: WideLoad, ImmOp: MemSize);
1101 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1102 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1103 auto WideLoad = B.buildLoadFromOffset(Dst: S32, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1104 B.buildZExtInReg(Res: MI.getOperand(i: 0), Op: WideLoad, ImmOp: MemSize);
1105 } else
1106 // We do not need to touch the higher bits for regular loads.
1107 B.buildLoadFromOffset(Dst: MI.getOperand(i: 0), BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1108 } else {
      // 96-bit loads are only available for vector loads. We need to split this
      // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit
      // load).
1111 if (MMO->getAlign() < Align(16)) {
1112 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
1113 LLT Part64, Part32;
1114 std::tie(args&: Part64, args&: Part32) = splitUnequalType(Ty: LoadTy, FirstSize: 64);
1115 if (Helper.reduceLoadStoreWidth(MI&: cast<GAnyLoad>(Val&: MI), TypeIdx: 0, NarrowTy: Part64) !=
1116 LegalizerHelper::Legalized)
1117 return false;
1118 return true;
1119 } else {
1120 LLT WiderTy = widen96To128(Ty: LoadTy);
1121 auto WideLoad = B.buildLoadFromOffset(Dst: WiderTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0);
1122 if (WiderTy.isScalar())
1123 B.buildTrunc(Res: MI.getOperand(i: 0), Op: WideLoad);
1124 else {
1125 B.buildDeleteTrailingVectorElements(Res: MI.getOperand(i: 0).getReg(),
1126 Op0: WideLoad);
1127 }
1128 }
1129 }
1130
1131 MI.eraseFromParent();
1132 return true;
1133 }
1134
1135 // 128-bit loads are supported for all instruction types.
1136 if (LoadSize <= MaxNonSmrdLoadSize)
1137 return false;
1138
1139 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
1140 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));
1141
1142 if (SrcRegs.empty())
1143 SrcRegs.push_back(Elt: MI.getOperand(i: 1).getReg());
1144
1145 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1146
1147 // RegBankSelect only emits scalar types, so we need to reset the pointer
1148 // operand to a pointer type.
1149 Register BasePtrReg = SrcRegs[0];
1150 LLT PtrTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
1151 MRI.setType(VReg: BasePtrReg, Ty: PtrTy);
1152
1153 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1154 const LLT LoadSplitTy = LoadTy.divide(Factor: NumSplitParts);
1155 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
1156 LegalizerHelper Helper(B.getMF(), O, B);
1157
1158 if (LoadTy.isVector()) {
1159 if (Helper.fewerElementsVector(MI, TypeIdx: 0, NarrowTy: LoadSplitTy) != LegalizerHelper::Legalized)
1160 return false;
1161 } else {
1162 if (Helper.narrowScalar(MI, TypeIdx: 0, NarrowTy: LoadSplitTy) != LegalizerHelper::Legalized)
1163 return false;
1164 }
1165
1166 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1167 return true;
1168}
1169
1170bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1171 MachineIRBuilder &B,
1172 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1173 MachineInstr &MI) const {
1174 MachineRegisterInfo &MRI = *B.getMRI();
1175 const MachineFunction &MF = B.getMF();
1176 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1177 const auto &TFI = *ST.getFrameLowering();
1178
1179 // Guard in case the stack growth direction ever changes with scratch
1180 // instructions.
1181 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1182 return false;
1183
1184 Register Dst = MI.getOperand(i: 0).getReg();
1185 Register AllocSize = MI.getOperand(i: 1).getReg();
1186 Align Alignment = assumeAligned(Value: MI.getOperand(i: 2).getImm());
1187
1188 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1189
1190 // TODO: Need to emit a wave reduction to get the maximum size.
1191 if (SizeBank != &AMDGPU::SGPRRegBank)
1192 return false;
1193
1194 LLT PtrTy = MRI.getType(Reg: Dst);
1195 LLT IntPtrTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
1196
1197 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1198 Register SPReg = Info->getStackPtrOffsetReg();
1199 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1200
1201 auto WaveSize = B.buildConstant(LLT::scalar(SizeInBits: 32), ST.getWavefrontSizeLog2());
1202 auto ScaledSize = B.buildShl(Dst: IntPtrTy, Src0: AllocSize, Src1: WaveSize);
1203
1204 auto SPCopy = B.buildCopy(Res: PtrTy, Op: SPReg);
1205 if (Alignment > TFI.getStackAlign()) {
1206 auto PtrAdd = B.buildPtrAdd(Res: PtrTy, Op0: SPCopy, Op1: ScaledSize);
1207 B.buildMaskLowPtrBits(Res: Dst, Op0: PtrAdd,
1208 NumBits: Log2(A: Alignment) + ST.getWavefrontSizeLog2());
1209 } else {
1210 B.buildPtrAdd(Res: Dst, Op0: SPCopy, Op1: ScaledSize);
1211 }
1212
1213 MI.eraseFromParent();
1214 return true;
1215}
1216
1217bool AMDGPURegisterBankInfo::applyMappingImage(
1218 MachineIRBuilder &B, MachineInstr &MI,
1219 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1220 int RsrcIdx) const {
1221 const int NumDefs = MI.getNumExplicitDefs();
1222
1223 // The reported argument index is relative to the IR intrinsic call arguments,
1224 // so we need to shift by the number of defs and the intrinsic ID.
1225 RsrcIdx += NumDefs + 1;
1226
1227 // Insert copies to VGPR arguments.
1228 applyDefaultMapping(OpdMapper);
1229
1230 // Fixup any SGPR arguments.
1231 SmallVector<unsigned, 4> SGPRIndexes;
1232 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1233 if (!MI.getOperand(i: I).isReg())
1234 continue;
1235
1236 // If this intrinsic has a sampler, it immediately follows rsrc.
1237 if (I == RsrcIdx || I == RsrcIdx + 1)
1238 SGPRIndexes.push_back(Elt: I);
1239 }
1240
1241 executeInWaterfallLoop(B, MI, OpIndices: SGPRIndexes);
1242 return true;
1243}
1244
1245// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1246// the three offsets (voffset, soffset and instoffset)
1247unsigned AMDGPURegisterBankInfo::setBufferOffsets(
1248 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1249 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1250 const LLT S32 = LLT::scalar(SizeInBits: 32);
1251 MachineRegisterInfo *MRI = B.getMRI();
1252
1253 if (std::optional<int64_t> Imm =
1254 getIConstantVRegSExtVal(VReg: CombinedOffset, MRI: *MRI)) {
1255 uint32_t SOffset, ImmOffset;
1256 if (TII->splitMUBUFOffset(Imm: *Imm, SOffset, ImmOffset, Alignment)) {
1257 VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1258 SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0);
1259 InstOffsetVal = ImmOffset;
1260
1261 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1262 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1263 return SOffset + ImmOffset;
1264 }
1265 }
1266
1267 Register Base;
1268 unsigned Offset;
1269
1270 std::tie(args&: Base, args&: Offset) =
1271 AMDGPU::getBaseWithConstantOffset(MRI&: *MRI, Reg: CombinedOffset);
1272
1273 uint32_t SOffset, ImmOffset;
1274 if ((int)Offset > 0 &&
1275 TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
1276 if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1277 VOffsetReg = Base;
1278 SOffsetReg = B.buildConstant(Res: S32, Val: SOffset).getReg(Idx: 0);
1279 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1280 InstOffsetVal = ImmOffset;
1281 return 0; // XXX - Why is this 0?
1282 }
1283
1284 // If we have SGPR base, we can use it for soffset.
1285 if (SOffset == 0) {
1286 VOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1287 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1288 SOffsetReg = Base;
1289 InstOffsetVal = ImmOffset;
1290 return 0; // XXX - Why is this 0?
1291 }
1292 }
1293
1294 // Handle the variable sgpr + vgpr case.
1295 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1296 if (Add && (int)Offset >= 0) {
1297 Register Src0 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 1).getReg(), MRI: *MRI);
1298 Register Src1 = getSrcRegIgnoringCopies(Reg: Add->getOperand(i: 2).getReg(), MRI: *MRI);
1299
1300 const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
1301 const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
1302
1303 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1304 VOffsetReg = Src0;
1305 SOffsetReg = Src1;
1306 return 0;
1307 }
1308
1309 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1310 VOffsetReg = Src1;
1311 SOffsetReg = Src0;
1312 return 0;
1313 }
1314 }
1315
1316 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1317 // have an SGPR offset and a VGPR resource.
1318 if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1319 VOffsetReg = CombinedOffset;
1320 } else {
1321 VOffsetReg = B.buildCopy(Res: S32, Op: CombinedOffset).getReg(Idx: 0);
1322 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1323 }
1324
1325 SOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1326 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1327 return 0;
1328}
1329
1330bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1331 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1332 MachineInstr &MI = OpdMapper.getMI();
1333 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1334
1335 const LLT S32 = LLT::scalar(SizeInBits: 32);
1336 Register Dst = MI.getOperand(i: 0).getReg();
1337 LLT Ty = MRI.getType(Reg: Dst);
1338
1339 const RegisterBank *RSrcBank =
1340 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
1341 const RegisterBank *OffsetBank =
1342 OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
1343 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1344 OffsetBank == &AMDGPU::SGPRRegBank)
1345 return true; // Legal mapping
1346
1347 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1348 // here but don't have an MMO.
1349
1350 unsigned LoadSize = Ty.getSizeInBits();
1351 int NumLoads = 1;
1352 if (LoadSize == 256 || LoadSize == 512) {
1353 NumLoads = LoadSize / 128;
1354 Ty = Ty.divide(Factor: NumLoads);
1355 }
1356
1357 // Use the alignment to ensure that the required offsets will fit into the
1358 // immediate offsets.
1359 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1360
1361 MachineFunction &MF = B.getMF();
1362
1363 Register SOffset;
1364 Register VOffset;
1365 int64_t ImmOffset = 0;
1366
1367 unsigned MMOOffset = setBufferOffsets(B, CombinedOffset: MI.getOperand(i: 2).getReg(), VOffsetReg&: VOffset,
1368 SOffsetReg&: SOffset, InstOffsetVal&: ImmOffset, Alignment);
1369
1370 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1371 // can, but we need to track an MMO for that.
1372 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1373 const Align MemAlign(4); // FIXME: ABI type alignment?
1374 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1375 PtrInfo: MachinePointerInfo(),
1376 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1377 MachineMemOperand::MOInvariant,
1378 Size: MemSize, BaseAlignment: MemAlign);
1379 if (MMOOffset != 0)
1380 BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset, Size: MemSize);
1381
1382 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1383 // assume that the buffer is unswizzled.
1384
1385 Register RSrc = MI.getOperand(i: 1).getReg();
1386 Register VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1387 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1388
1389 SmallVector<Register, 4> LoadParts(NumLoads);
1390
1391 MachineBasicBlock::iterator MII = MI.getIterator();
1392 MachineInstrSpan Span(MII, &B.getMBB());
1393
1394 for (int i = 0; i < NumLoads; ++i) {
1395 if (NumLoads == 1) {
1396 LoadParts[i] = Dst;
1397 } else {
1398 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1399 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1400 }
1401
1402 MachineMemOperand *MMO = BaseMMO;
1403 if (i != 0)
1404 BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset + 16 * i, Size: MemSize);
1405
1406 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1407 .addDef(LoadParts[i]) // vdata
1408 .addUse(RSrc) // rsrc
1409 .addUse(VIndex) // vindex
1410 .addUse(VOffset) // voffset
1411 .addUse(SOffset) // soffset
1412 .addImm(ImmOffset + 16 * i) // offset(imm)
1413 .addImm(0) // cachepolicy, swizzled buffer(imm)
1414 .addImm(0) // idxen(imm)
1415 .addMemOperand(MMO);
1416 }
1417
1418 // TODO: If only the resource is a VGPR, it may be better to execute the
1419 // scalar load in the waterfall loop if the resource is expected to frequently
1420 // be dynamically uniform.
1421 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1422 // Remove the original instruction to avoid potentially confusing the
1423 // waterfall loop logic.
1424 B.setInstr(*Span.begin());
1425 MI.eraseFromParent();
1426
1427 SmallSet<Register, 4> OpsToWaterfall;
1428
1429 OpsToWaterfall.insert(V: RSrc);
1430 executeInWaterfallLoop(B, make_range(x: Span.begin(), y: Span.end()),
1431 OpsToWaterfall);
1432 }
1433
1434 if (NumLoads != 1) {
1435 if (Ty.isVector())
1436 B.buildConcatVectors(Res: Dst, Ops: LoadParts);
1437 else
1438 B.buildMergeLikeInstr(Res: Dst, Ops: LoadParts);
1439 }
1440
1441 // We removed the instruction earlier with a waterfall loop.
1442 if (RSrcBank == &AMDGPU::SGPRRegBank)
1443 MI.eraseFromParent();
1444
1445 return true;
1446}
1447
1448bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
1449 const OperandsMapper &OpdMapper,
1450 bool Signed) const {
1451 MachineInstr &MI = OpdMapper.getMI();
1452 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1453
1454 // Insert basic copies
1455 applyDefaultMapping(OpdMapper);
1456
1457 Register DstReg = MI.getOperand(i: 0).getReg();
1458 LLT Ty = MRI.getType(Reg: DstReg);
1459
1460 const LLT S32 = LLT::scalar(SizeInBits: 32);
1461
1462 unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1;
1463 Register SrcReg = MI.getOperand(i: FirstOpnd).getReg();
1464 Register OffsetReg = MI.getOperand(i: FirstOpnd + 1).getReg();
1465 Register WidthReg = MI.getOperand(i: FirstOpnd + 2).getReg();
1466
1467 const RegisterBank *DstBank =
1468 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
1469 if (DstBank == &AMDGPU::VGPRRegBank) {
1470 if (Ty == S32)
1471 return true;
1472
1473 // There is no 64-bit VGPR bitfield extract instruction, so the operation
1474 // is expanded to a sequence of instructions that implement it.
1475 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
1476
1477 const LLT S64 = LLT::scalar(SizeInBits: 64);
1478 // Shift the source operand so that extracted bits start at bit 0.
1479 auto ShiftOffset = Signed ? B.buildAShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg)
1480 : B.buildLShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg);
1481 auto UnmergeSOffset = B.buildUnmerge(Res: {S32, S32}, Op: ShiftOffset);
1482
1483 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1484 // if the width is a constant.
1485 if (auto ConstWidth = getIConstantVRegValWithLookThrough(VReg: WidthReg, MRI)) {
1486 // Depending on the width, operate on either the low or the high 32 bits.
1488 auto Zero = B.buildConstant(Res: S32, Val: 0);
1489 auto WidthImm = ConstWidth->Value.getZExtValue();
1490 if (WidthImm <= 32) {
1491 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1492 // or clear the upper 32-bits.
1493 auto Extract =
1494 Signed ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg)
1495 : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg);
1496 auto Extend =
1497 Signed ? B.buildAShr(Dst: S32, Src0: Extract, Src1: B.buildConstant(Res: S32, Val: 31)) : Zero;
1498 B.buildMergeLikeInstr(Res: DstReg, Ops: {Extract, Extend});
1499 } else {
1500 // Use bitfield extract on upper 32-bit source, and combine with lower
1501 // 32-bit source.
1502 auto UpperWidth = B.buildConstant(Res: S32, Val: WidthImm - 32);
1503 auto Extract =
1504 Signed
1505 ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth)
1506 : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth);
1507 B.buildMergeLikeInstr(Res: DstReg, Ops: {UnmergeSOffset.getReg(Idx: 0), Extract});
1508 }
1509 MI.eraseFromParent();
1510 return true;
1511 }
1512
1513 // Expand to ((Src >> Offset) << (64 - Width)) >> (64 - Width) using 64-bit
1514 // operations.
1515 auto ExtShift = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 64), Src1: WidthReg);
1516 auto SignBit = B.buildShl(Dst: S64, Src0: ShiftOffset, Src1: ExtShift);
1517 if (Signed)
1518 B.buildAShr(Dst: S64, Src0: SignBit, Src1: ExtShift);
1519 else
1520 B.buildLShr(Dst: S64, Src0: SignBit, Src1: ExtShift);
1521 MI.eraseFromParent();
1522 return true;
1523 }
1524
1525 // The scalar form packs the offset and width in a single operand.
1526
1527 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1528
1529 // Ensure the high bits are clear to insert the offset.
1530 auto OffsetMask = B.buildConstant(Res: S32, Val: maskTrailingOnes<unsigned>(N: 6));
1531 auto ClampOffset = B.buildAnd(Dst: S32, Src0: OffsetReg, Src1: OffsetMask);
1532
1533 // The shift left by 16 zeros out the low bits, so don't bother clamping the width value.
1534 auto ShiftWidth = B.buildShl(Dst: S32, Src0: WidthReg, Src1: B.buildConstant(Res: S32, Val: 16));
1535
1536 // Pack the offset and width of the BFE into the format expected by
1537 // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
1538 // the offset and bits [22:16] the width.
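// For example, offset = 8 and width = 16 pack to (16 << 16) | 8 = 0x100008.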
1539 auto MergedInputs = B.buildOr(Dst: S32, Src0: ClampOffset, Src1: ShiftWidth);
1540
1541 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1542 // register class constraints.
1543 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1544 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1545
1546 auto MIB = B.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {SrcReg, MergedInputs});
1547 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1548 llvm_unreachable("failed to constrain BFE");
1549
1550 MI.eraseFromParent();
1551 return true;
1552}
1553
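// Apply the register bank mapping for G_AMDGPU_MAD_U64_U32/I64_I32 when the
// multiply can stay on the SALU: form the 32x32->64 product with scalar
// mul/mul-hi (falling back to a VALU mul-hi plus readfirstlane on subtargets
// without s_mul_hi), and emulate the 64-bit accumulate and carry-out with
// 32-bit adds.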
1554bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1555 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1556 MachineInstr &MI = OpdMapper.getMI();
1557 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1558
1559 // Insert basic copies.
1560 applyDefaultMapping(OpdMapper);
1561
1562 Register Dst0 = MI.getOperand(i: 0).getReg();
1563 Register Dst1 = MI.getOperand(i: 1).getReg();
1564 Register Src0 = MI.getOperand(i: 2).getReg();
1565 Register Src1 = MI.getOperand(i: 3).getReg();
1566 Register Src2 = MI.getOperand(i: 4).getReg();
1567
1568 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1569 return true;
1570
1571 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1572 LLT S1 = LLT::scalar(SizeInBits: 1);
1573 LLT S32 = LLT::scalar(SizeInBits: 32);
1574
1575 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1576 bool Accumulate = true;
1577
1578 if (!DstOnValu) {
1579 if (mi_match(R: Src2, MRI, P: m_ZeroInt()))
1580 Accumulate = false;
1581 }
1582
1583 // Keep the multiplication on the SALU.
1584 Register DstHi;
1585 Register DstLo = B.buildMul(Dst: S32, Src0, Src1).getReg(Idx: 0);
1586 bool MulHiInVgpr = false;
1587
1588 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1589
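// Compute the high half of the product. With s_mul_hi the whole multiply
// stays on the SALU; otherwise do the mul-hi on the VALU and, when a scalar
// result is needed, read it back with readfirstlane.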
1590 if (Subtarget.hasSMulHi()) {
1591 DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0, Src1).getReg(Idx: 0)
1592 : B.buildSMulH(Dst: S32, Src0, Src1).getReg(Idx: 0);
1593 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1594 } else {
1595 Register VSrc0 = B.buildCopy(Res: S32, Op: Src0).getReg(Idx: 0);
1596 Register VSrc1 = B.buildCopy(Res: S32, Op: Src1).getReg(Idx: 0);
1597
1598 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1599 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1600
1601 DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0)
1602 : B.buildSMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0);
1603 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1604
1605 if (!DstOnValu) {
1606 DstHi = buildReadFirstLane(B, MRI, Src: DstHi);
1607 } else {
1608 MulHiInVgpr = true;
1609 }
1610 }
1611
1612 // Accumulate and produce the "carry-out" bit.
1613 //
1614 // The "carry-out" is defined as bit 64 of the result when computed as a
1615 // big integer. For unsigned multiply-add, this matches the usual definition
1616 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1617 // result, which is determined as:
1618 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
1619 LLT CarryType = DstOnValu ? S1 : S32;
1620 const RegisterBank &CarryBank =
1621 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1622 const RegisterBank &DstBank =
1623 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1624 Register Carry;
1625 Register Zero;
1626
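// For the signed form, bit 64 of the product is its sign, so seed the carry
// with a signed compare of the product's high half against zero.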
1627 if (!IsUnsigned) {
1628 Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1629 MRI.setRegBank(Zero,
1630 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1631
1632 Carry = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: MulHiInVgpr ? S1 : S32, Op0: DstHi, Op1: Zero)
1633 .getReg(Idx: 0);
1634 MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1635 : AMDGPU::SGPRRegBank);
1636
1637 if (DstOnValu && !MulHiInVgpr) {
1638 Carry = B.buildTrunc(Res: S1, Op: Carry).getReg(Idx: 0);
1639 MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1640 }
1641 }
1642
1643 if (Accumulate) {
1644 if (DstOnValu) {
1645 DstLo = B.buildCopy(Res: S32, Op: DstLo).getReg(Idx: 0);
1646 DstHi = B.buildCopy(Res: S32, Op: DstHi).getReg(Idx: 0);
1647 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1648 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1649 }
1650
1651 auto Unmerge = B.buildUnmerge(Res: S32, Op: Src2);
1652 Register Src2Lo = Unmerge.getReg(Idx: 0);
1653 Register Src2Hi = Unmerge.getReg(Idx: 1);
1654 MRI.setRegBank(Reg: Src2Lo, RegBank: DstBank);
1655 MRI.setRegBank(Reg: Src2Hi, RegBank: DstBank);
1656
1657 if (!IsUnsigned) {
1658 auto Src2Sign = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: CarryType, Op0: Src2Hi, Op1: Zero);
1659 MRI.setRegBank(Reg: Src2Sign.getReg(Idx: 0), RegBank: CarryBank);
1660
1661 Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: Src2Sign).getReg(Idx: 0);
1662 MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
1663 }
1664
1665 auto AddLo = B.buildUAddo(Res: S32, CarryOut: CarryType, Op0: DstLo, Op1: Src2Lo);
1666 DstLo = AddLo.getReg(Idx: 0);
1667 Register CarryLo = AddLo.getReg(Idx: 1);
1668 MRI.setRegBank(Reg: DstLo, RegBank: DstBank);
1669 MRI.setRegBank(Reg: CarryLo, RegBank: CarryBank);
1670
1671 auto AddHi = B.buildUAdde(Res: S32, CarryOut: CarryType, Op0: DstHi, Op1: Src2Hi, CarryIn: CarryLo);
1672 DstHi = AddHi.getReg(Idx: 0);
1673 MRI.setRegBank(Reg: DstHi, RegBank: DstBank);
1674
1675 Register CarryHi = AddHi.getReg(Idx: 1);
1676 MRI.setRegBank(Reg: CarryHi, RegBank: CarryBank);
1677
1678 if (IsUnsigned) {
1679 Carry = CarryHi;
1680 } else {
1681 Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: CarryHi).getReg(Idx: 0);
1682 MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
1683 }
1684 } else {
1685 if (IsUnsigned) {
1686 Carry = B.buildConstant(Res: CarryType, Val: 0).getReg(Idx: 0);
1687 MRI.setRegBank(Reg: Carry, RegBank: CarryBank);
1688 }
1689 }
1690
1691 B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi});
1692
1693 if (DstOnValu) {
1694 B.buildCopy(Res: Dst1, Op: Carry);
1695 } else {
1696 B.buildTrunc(Res: Dst1, Op: Carry);
1697 }
1698
1699 MI.eraseFromParent();
1700 return true;
1701}
1702
1703// Return a suitable opcode for extending the operands of Opc when widening.
1704static unsigned getExtendOp(unsigned Opc) {
1705 switch (Opc) {
1706 case TargetOpcode::G_ASHR:
1707 case TargetOpcode::G_SMIN:
1708 case TargetOpcode::G_SMAX:
1709 return TargetOpcode::G_SEXT;
1710 case TargetOpcode::G_LSHR:
1711 case TargetOpcode::G_UMIN:
1712 case TargetOpcode::G_UMAX:
1713 return TargetOpcode::G_ZEXT;
1714 default:
1715 return TargetOpcode::G_ANYEXT;
1716 }
1717}
1718
1719// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1720// any illegal vector extend or unmerge operations.
1721static std::pair<Register, Register>
1722unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1723 const LLT S32 = LLT::scalar(SizeInBits: 32);
1724 auto Bitcast = B.buildBitcast(Dst: S32, Src);
1725
1726 if (ExtOpcode == TargetOpcode::G_SEXT) {
1727 auto ExtLo = B.buildSExtInReg(Res: S32, Op: Bitcast, ImmOp: 16);
1728 auto ShiftHi = B.buildAShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16));
1729 return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1730 }
1731
1732 auto ShiftHi = B.buildLShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16));
1733 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1734 auto ExtLo = B.buildAnd(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 0xffff));
1735 return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1736 }
1737
1738 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1739 return std::pair(Bitcast.getReg(Idx: 0), ShiftHi.getReg(Idx: 0));
1740}
1741
1742 // For cases where only a single copy is inserted to repair the register
1743 // banks, replace the register in the instruction operand with that copy.
1744static bool substituteSimpleCopyRegs(
1745 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1746 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1747 if (!SrcReg.empty()) {
1748 assert(SrcReg.size() == 1);
1749 OpdMapper.getMI().getOperand(i: OpIdx).setReg(SrcReg[0]);
1750 return true;
1751 }
1752
1753 return false;
1754}
1755
1756/// Handle register layout difference for f16 images for some subtargets.
1757Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1758 MachineRegisterInfo &MRI,
1759 Register Reg) const {
1760 if (!Subtarget.hasUnpackedD16VMem())
1761 return Reg;
1762
1763 const LLT S16 = LLT::scalar(SizeInBits: 16);
1764 LLT StoreVT = MRI.getType(Reg);
1765 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1766 return Reg;
1767
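// In the unpacked layout each 16-bit element occupies its own 32-bit
// register, so split out the s16 elements and re-merge them as an s32
// vector, one element per 32-bit lane.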
1768 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
1769
1771 SmallVector<Register, 4> WideRegs;
1772 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1773 WideRegs.push_back(Elt: Unmerge.getReg(Idx: I));
1774
1775 const LLT S32 = LLT::scalar(SizeInBits: 32);
1776 int NumElts = StoreVT.getNumElements();
1777
1778 return B.buildMergeLikeInstr(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
1779 .getReg(Idx: 0);
1780}
1781
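// Match Reg against (base + constant) and return the pair. A plain constant
// yields an empty base register; anything else is returned unchanged with a
// zero offset.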
1782static std::pair<Register, unsigned>
1783getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1784 int64_t Const;
1785 if (mi_match(R: Reg, MRI, P: m_ICst(Cst&: Const)))
1786 return std::pair(Register(), Const);
1787
1788 Register Base;
1789 if (mi_match(R: Reg, MRI, P: m_GAdd(L: m_Reg(R&: Base), R: m_ICst(Cst&: Const))))
1790 return std::pair(Base, Const);
1791
1792 // TODO: Handle G_OR used for add case
1793 return std::pair(Reg, 0);
1794}
1795
1796std::pair<Register, unsigned>
1797AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1798 Register OrigOffset) const {
1799 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: Subtarget);
1800 Register BaseReg;
1801 unsigned ImmOffset;
1802 const LLT S32 = LLT::scalar(SizeInBits: 32);
1803
1804 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1805 std::tie(args&: BaseReg, args&: ImmOffset) = getBaseWithConstantOffset(MRI&: *B.getMRI(),
1806 Reg: OrigOffset);
1807
1808 unsigned C1 = 0;
1809 if (ImmOffset != 0) {
1810 // If the immediate value is too big for the immoffset field, keep only the
1811 // bits that would normally fit in the immoffset field. The remaining value
1812 // that is copied/added for the voffset field is a large power of 2, and it
1813 // stands a better chance of being CSEd with the copy/add for another
1814 // similar load/store.
1815 // However, do not round down if the remainder would be negative, as it
1816 // appears to be illegal to have a negative offset in the vgpr, even if
1817 // adding the immediate offset makes it positive.
1818 unsigned Overflow = ImmOffset & ~MaxImm;
1819 ImmOffset -= Overflow;
1820 if ((int32_t)Overflow < 0) {
1821 Overflow += ImmOffset;
1822 ImmOffset = 0;
1823 }
1824
1825 C1 = ImmOffset;
1826 if (Overflow != 0) {
1827 if (!BaseReg)
1828 BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
1829 else {
1830 auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
1831 BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
1832 }
1833 }
1834 }
1835
1836 if (!BaseReg)
1837 BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
1838
1839 return {BaseReg, C1};
1840}
1841
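// Copy the SGPR SrcReg into the VGPR DstReg using explicit V_MOV_B32s so the
// EXEC dependency is visible. 64-bit values are copied as two 32-bit halves
// and reassembled with a REG_SEQUENCE.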
1842bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1843 Register SrcReg) const {
1844 MachineRegisterInfo &MRI = *B.getMRI();
1845 LLT SrcTy = MRI.getType(Reg: SrcReg);
1846 if (SrcTy.getSizeInBits() == 32) {
1847 // Use a v_mov_b32 here to make the exec dependency explicit.
1848 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1849 .addDef(DstReg)
1850 .addUse(SrcReg);
1851 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1852 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1853 }
1854
1855 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1856 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1857
1858 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1859 .addDef(TmpReg0)
1860 .addUse(SrcReg, 0, AMDGPU::sub0);
1861 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1862 .addDef(TmpReg1)
1863 .addUse(SrcReg, 0, AMDGPU::sub1);
1864 B.buildInstr(AMDGPU::REG_SEQUENCE)
1865 .addDef(DstReg)
1866 .addUse(TmpReg0)
1867 .addImm(AMDGPU::sub0)
1868 .addUse(TmpReg1)
1869 .addImm(AMDGPU::sub1);
1870
1871 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1872 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1873}
1874
1875/// Utility function for pushing dynamic vector indexes with a constant offset
1876/// into waterfall loops.
1877static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1878 MachineInstr &IdxUseInstr,
1879 unsigned OpIdx,
1880 unsigned ConstOffset) {
1881 MachineRegisterInfo &MRI = *B.getMRI();
1882 const LLT S32 = LLT::scalar(SizeInBits: 32);
1883 Register WaterfallIdx = IdxUseInstr.getOperand(i: OpIdx).getReg();
1884 B.setInsertPt(MBB&: *IdxUseInstr.getParent(), II: IdxUseInstr.getIterator());
1885
1886 auto MaterializedOffset = B.buildConstant(Res: S32, Val: ConstOffset);
1887
1888 auto Add = B.buildAdd(Dst: S32, Src0: WaterfallIdx, Src1: MaterializedOffset);
1889 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1890 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1891 IdxUseInstr.getOperand(i: OpIdx).setReg(Add.getReg(Idx: 0));
1892}
1893
1894/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1895/// original 32-bit source value (to be inserted in the low part of the combined
1896/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1897/// value.
1898static void extendLow32IntoHigh32(MachineIRBuilder &B,
1899 Register Hi32Reg, Register Lo32Reg,
1900 unsigned ExtOpc,
1901 const RegisterBank &RegBank,
1902 bool IsBooleanSrc = false) {
1903 if (ExtOpc == AMDGPU::G_ZEXT) {
1904 B.buildConstant(Res: Hi32Reg, Val: 0);
1905 } else if (ExtOpc == AMDGPU::G_SEXT) {
1906 if (IsBooleanSrc) {
1907 // If we know the original source was an s1, the high half is the same as
1908 // the low.
1909 B.buildCopy(Res: Hi32Reg, Op: Lo32Reg);
1910 } else {
1911 // Replicate sign bit from 32-bit extended part.
1912 auto ShiftAmt = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 31);
1913 B.getMRI()->setRegBank(Reg: ShiftAmt.getReg(Idx: 0), RegBank);
1914 B.buildAShr(Dst: Hi32Reg, Src0: Lo32Reg, Src1: ShiftAmt);
1915 }
1916 } else {
1917 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1918 B.buildUndef(Res: Hi32Reg);
1919 }
1920}
1921
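// Try to lower a dynamic G_EXTRACT_VECTOR_ELT into a chain of compare and
// select over the vector elements when SITargetLowering considers the
// expansion profitable; returns false if the expansion does not apply.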
1922bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1923 MachineIRBuilder &B, MachineInstr &MI,
1924 const OperandsMapper &OpdMapper) const {
1925 MachineRegisterInfo &MRI = *B.getMRI();
1926
1927 Register VecReg = MI.getOperand(i: 1).getReg();
1928 Register Idx = MI.getOperand(i: 2).getReg();
1929
1930 const RegisterBank &IdxBank =
1931 *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
1932
1933 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1934
1935 LLT VecTy = MRI.getType(Reg: VecReg);
1936 unsigned EltSize = VecTy.getScalarSizeInBits();
1937 unsigned NumElem = VecTy.getNumElements();
1938
1939 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1940 IsDivergentIdx, Subtarget: &Subtarget))
1941 return false;
1942
1943 LLT S32 = LLT::scalar(SizeInBits: 32);
1944
1945 const RegisterBank &DstBank =
1946 *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
1947 const RegisterBank &SrcBank =
1948 *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
1949
1950 const RegisterBank &CCBank =
1951 (DstBank == AMDGPU::SGPRRegBank &&
1952 SrcBank == AMDGPU::SGPRRegBank &&
1953 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1954 : AMDGPU::VCCRegBank;
1955 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1956
1957 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1958 Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg();
1959 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1960 }
1961
1962 LLT EltTy = VecTy.getScalarType();
1963 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));
1964 unsigned NumLanes = DstRegs.size();
1965 if (!NumLanes)
1966 NumLanes = 1;
1967 else
1968 EltTy = MRI.getType(Reg: DstRegs[0]);
1969
1970 auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg);
1971 SmallVector<Register, 2> Res(NumLanes);
1972 for (unsigned L = 0; L < NumLanes; ++L)
1973 Res[L] = UnmergeToEltTy.getReg(Idx: L);
1974
1975 for (unsigned I = 1; I < NumElem; ++I) {
1976 auto IC = B.buildConstant(Res: S32, Val: I);
1977 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1978 auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC);
1979 MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank);
1980
1981 for (unsigned L = 0; L < NumLanes; ++L) {
1982 auto S = B.buildSelect(Res: EltTy, Tst: Cmp,
1983 Op0: UnmergeToEltTy.getReg(Idx: I * NumLanes + L), Op1: Res[L]);
1984
1985 for (unsigned N : { 0, 2, 3 })
1986 MRI.setRegBank(Reg: S->getOperand(i: N).getReg(), RegBank: DstBank);
1987
1988 Res[L] = S->getOperand(i: 0).getReg();
1989 }
1990 }
1991
1992 for (unsigned L = 0; L < NumLanes; ++L) {
1993 Register DstReg = (NumLanes == 1) ? MI.getOperand(i: 0).getReg() : DstRegs[L];
1994 B.buildCopy(Res: DstReg, Op: Res[L]);
1995 MRI.setRegBank(Reg: DstReg, RegBank: DstBank);
1996 }
1997
1998 MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank);
1999 MI.eraseFromParent();
2000
2001 return true;
2002}
2003
2004// Insert a cross regbank copy for a register if it already has a bank that
2005// differs from the one we want to set.
2006static Register constrainRegToBank(MachineRegisterInfo &MRI,
2007 MachineIRBuilder &B, Register &Reg,
2008 const RegisterBank &Bank) {
2009 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2010 if (CurrBank && *CurrBank != Bank) {
2011 Register Copy = B.buildCopy(Res: MRI.getType(Reg), Op: Reg).getReg(Idx: 0);
2012 MRI.setRegBank(Reg: Copy, RegBank: Bank);
2013 return Copy;
2014 }
2015
2016 MRI.setRegBank(Reg, RegBank: Bank);
2017 return Reg;
2018}
2019
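// Same idea as foldExtractEltToCmpSelect, but for a dynamic
// G_INSERT_VECTOR_ELT: select between the inserted value and each original
// element, then rebuild the result with a build_vector.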
2020bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2021 MachineIRBuilder &B, MachineInstr &MI,
2022 const OperandsMapper &OpdMapper) const {
2023
2024 MachineRegisterInfo &MRI = *B.getMRI();
2025 Register VecReg = MI.getOperand(i: 1).getReg();
2026 Register Idx = MI.getOperand(i: 3).getReg();
2027
2028 const RegisterBank &IdxBank =
2029 *OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank;
2030
2031 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2032
2033 LLT VecTy = MRI.getType(Reg: VecReg);
2034 unsigned EltSize = VecTy.getScalarSizeInBits();
2035 unsigned NumElem = VecTy.getNumElements();
2036
2037 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2038 IsDivergentIdx, Subtarget: &Subtarget))
2039 return false;
2040
2041 LLT S32 = LLT::scalar(SizeInBits: 32);
2042
2043 const RegisterBank &DstBank =
2044 *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2045 const RegisterBank &SrcBank =
2046 *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
2047 const RegisterBank &InsBank =
2048 *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
2049
2050 const RegisterBank &CCBank =
2051 (DstBank == AMDGPU::SGPRRegBank &&
2052 SrcBank == AMDGPU::SGPRRegBank &&
2053 InsBank == AMDGPU::SGPRRegBank &&
2054 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2055 : AMDGPU::VCCRegBank;
2056 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2057
2058 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2059 Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg();
2060 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2061 }
2062
2063 LLT EltTy = VecTy.getScalarType();
2064 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2));
2065 unsigned NumLanes = InsRegs.size();
2066 if (!NumLanes) {
2067 NumLanes = 1;
2068 InsRegs.push_back(Elt: MI.getOperand(i: 2).getReg());
2069 } else {
2070 EltTy = MRI.getType(Reg: InsRegs[0]);
2071 }
2072
2073 auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg);
2074 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2075
2076 for (unsigned I = 0; I < NumElem; ++I) {
2077 auto IC = B.buildConstant(Res: S32, Val: I);
2078 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2079 auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC);
2080 MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank);
2081
2082 for (unsigned L = 0; L < NumLanes; ++L) {
2083 Register Op0 = constrainRegToBank(MRI, B, Reg&: InsRegs[L], Bank: DstBank);
2084 Register Op1 = UnmergeToEltTy.getReg(Idx: I * NumLanes + L);
2085 Op1 = constrainRegToBank(MRI, B, Reg&: Op1, Bank: DstBank);
2086
2087 Register Select = B.buildSelect(Res: EltTy, Tst: Cmp, Op0, Op1).getReg(Idx: 0);
2088 MRI.setRegBank(Reg: Select, RegBank: DstBank);
2089
2090 Ops[I * NumLanes + L] = Select;
2091 }
2092 }
2093
2094 LLT MergeTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarTy: EltTy);
2095 if (MergeTy == MRI.getType(Reg: MI.getOperand(i: 0).getReg())) {
2096 B.buildBuildVector(Res: MI.getOperand(i: 0), Ops);
2097 } else {
2098 auto Vec = B.buildBuildVector(Res: MergeTy, Ops);
2099 MRI.setRegBank(Reg: Vec->getOperand(i: 0).getReg(), RegBank: DstBank);
2100 B.buildBitcast(Dst: MI.getOperand(i: 0).getReg(), Src: Vec);
2101 }
2102
2103 MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank);
2104 MI.eraseFromParent();
2105
2106 return true;
2107}
2108
2109// Break s_mul_u64 into 32-bit vector operations.
2110void AMDGPURegisterBankInfo::applyMappingSMULU64(
2111 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2112 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2113 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1));
2114 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));
2115
2116 // All inputs are SGPRs, nothing special to do.
2117 if (DefRegs.empty()) {
2118 assert(Src0Regs.empty() && Src1Regs.empty());
2119 applyDefaultMapping(OpdMapper);
2120 return;
2121 }
2122
2123 assert(DefRegs.size() == 2);
2124 assert(Src0Regs.size() == Src1Regs.size() &&
2125 (Src0Regs.empty() || Src0Regs.size() == 2));
2126
2127 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2128 MachineInstr &MI = OpdMapper.getMI();
2129 Register DstReg = MI.getOperand(i: 0).getReg();
2130 LLT HalfTy = LLT::scalar(SizeInBits: 32);
2131
2132 // Depending on where the source registers came from, the generic code may
2133 // have decided to split the inputs already or not. If not, we still need to
2134 // extract the values.
2135
2136 if (Src0Regs.empty())
2137 split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg());
2138 else
2139 setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy);
2140
2141 if (Src1Regs.empty())
2142 split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
2143 else
2144 setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);
2145
2146 setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);
2147
2148 // The multiplication is done as follows:
2149 //
2150 // Op1H Op1L
2151 // * Op0H Op0L
2152 // --------------------
2153 // Op1H*Op0L Op1L*Op0L
2154 // + Op1H*Op0H Op1L*Op0H
2155 // -----------------------------------------
2156 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
2157 //
2158 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
2159 // value and that would overflow.
2160 // The low 32-bit value is Op1L*Op0L.
2161 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
2162 // Op1L*Op0L).
2163
2164 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
2165
2166 Register Hi = B.buildUMulH(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[0]).getReg(Idx: 0);
2167 Register MulLoHi = B.buildMul(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[1]).getReg(Idx: 0);
2168 Register Add = B.buildAdd(Dst: HalfTy, Src0: Hi, Src1: MulLoHi).getReg(Idx: 0);
2169 Register MulHiLo = B.buildMul(Dst: HalfTy, Src0: Src0Regs[1], Src1: Src1Regs[0]).getReg(Idx: 0);
2170 B.buildAdd(Dst: DefRegs[1], Src0: Add, Src1: MulHiLo);
2171 B.buildMul(Dst: DefRegs[0], Src0: Src0Regs[0], Src1: Src1Regs[0]);
2172
2173 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2174 MI.eraseFromParent();
2175}
2176
2177void AMDGPURegisterBankInfo::applyMappingImpl(
2178 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2179 MachineInstr &MI = OpdMapper.getMI();
2180 B.setInstrAndDebugLoc(MI);
2181 unsigned Opc = MI.getOpcode();
2182 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2183 switch (Opc) {
2184 case AMDGPU::G_CONSTANT:
2185 case AMDGPU::G_IMPLICIT_DEF: {
2186 Register DstReg = MI.getOperand(i: 0).getReg();
2187 LLT DstTy = MRI.getType(Reg: DstReg);
2188 if (DstTy != LLT::scalar(SizeInBits: 1))
2189 break;
2190
2191 const RegisterBank *DstBank =
2192 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2193 if (DstBank == &AMDGPU::VCCRegBank)
2194 break;
2195 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2196 if (DefRegs.empty())
2197 DefRegs.push_back(Elt: DstReg);
2198
2199 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
2200
2201 Register NewDstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
2202 LLVMContext &Ctx = B.getMF().getFunction().getContext();
2203
2204 MI.getOperand(i: 0).setReg(NewDstReg);
2205 if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2206 uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue();
2207 MI.getOperand(i: 1).setCImm(
2208 ConstantInt::get(Ty: IntegerType::getInt32Ty(C&: Ctx), V: ConstVal));
2209 }
2210
2211 MRI.setRegBank(Reg: NewDstReg, RegBank: *DstBank);
2212 B.buildTrunc(Res: DefRegs[0], Op: NewDstReg);
2213 return;
2214 }
2215 case AMDGPU::G_PHI: {
2216 Register DstReg = MI.getOperand(i: 0).getReg();
2217 LLT DstTy = MRI.getType(Reg: DstReg);
2218 if (DstTy != LLT::scalar(SizeInBits: 1))
2219 break;
2220
2221 const LLT S32 = LLT::scalar(SizeInBits: 32);
2222 const RegisterBank *DstBank =
2223 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2224 if (DstBank == &AMDGPU::VCCRegBank) {
2225 applyDefaultMapping(OpdMapper);
2226 // The standard handling only considers the result register bank for
2227 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2228 // produce an invalid copy. We can only copy with some kind of compare to
2229 // get a vector boolean result. Insert a register bank copy that will be
2230 // correctly lowered to a compare.
2231 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2232 Register SrcReg = MI.getOperand(i: I).getReg();
2233 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2234
2235 if (SrcBank != &AMDGPU::VCCRegBank) {
2236 MachineBasicBlock *SrcMBB = MI.getOperand(i: I + 1).getMBB();
2237 B.setInsertPt(MBB&: *SrcMBB, II: SrcMBB->getFirstTerminator());
2238
2239 auto Copy = B.buildCopy(Res: LLT::scalar(SizeInBits: 1), Op: SrcReg);
2240 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2241 MI.getOperand(i: I).setReg(Copy.getReg(Idx: 0));
2242 }
2243 }
2244
2245 return;
2246 }
2247
2248 // Phi handling is strange and only considers the bank of the destination.
2249 substituteSimpleCopyRegs(OpdMapper, OpIdx: 0);
2250
2251 // Promote SGPR/VGPR booleans to s32
2252 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2253 B.setInsertPt(MBB&: B.getMBB(), II: MI);
2254 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2255
2256 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized)
2257 llvm_unreachable("widen scalar should have succeeded");
2258
2259 return;
2260 }
2261 case AMDGPU::G_FCMP:
2262 if (!Subtarget.hasSALUFloatInsts())
2263 break;
2264 [[fallthrough]];
2265 case AMDGPU::G_ICMP:
2266 case AMDGPU::G_UADDO:
2267 case AMDGPU::G_USUBO:
2268 case AMDGPU::G_UADDE:
2269 case AMDGPU::G_SADDE:
2270 case AMDGPU::G_USUBE:
2271 case AMDGPU::G_SSUBE: {
2272 unsigned BoolDstOp =
2273 (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2274 Register DstReg = MI.getOperand(i: BoolDstOp).getReg();
2275
2276 const RegisterBank *DstBank =
2277 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2278 if (DstBank != &AMDGPU::SGPRRegBank)
2279 break;
2280
2281 const bool HasCarryIn = MI.getNumOperands() == 5;
2282
2283 // If this is a scalar compare, promote the result to s32, as the selection
2284 // will end up using a copy to a 32-bit vreg.
2285 const LLT S32 = LLT::scalar(SizeInBits: 32);
2286 Register NewDstReg = MRI.createGenericVirtualRegister(Ty: S32);
2287 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2288 MI.getOperand(i: BoolDstOp).setReg(NewDstReg);
2289
2290 if (HasCarryIn) {
2291 Register NewSrcReg = MRI.createGenericVirtualRegister(Ty: S32);
2292 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2293 B.buildZExt(Res: NewSrcReg, Op: MI.getOperand(i: 4).getReg());
2294 MI.getOperand(i: 4).setReg(NewSrcReg);
2295 }
2296
2297 MachineBasicBlock *MBB = MI.getParent();
2298 B.setInsertPt(MBB&: *MBB, II: std::next(x: MI.getIterator()));
2299
2300 // If we had a constrained VCC result register, a copy was inserted to VCC
2301 // from SGPR.
2302 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2303 if (DefRegs.empty())
2304 DefRegs.push_back(Elt: DstReg);
2305 B.buildTrunc(Res: DefRegs[0], Op: NewDstReg);
2306 return;
2307 }
2308 case AMDGPU::G_SELECT: {
2309 Register DstReg = MI.getOperand(i: 0).getReg();
2310 LLT DstTy = MRI.getType(Reg: DstReg);
2311
2312 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(OpIdx: 1));
2313 if (CondRegs.empty())
2314 CondRegs.push_back(Elt: MI.getOperand(i: 1).getReg());
2315 else {
2316 assert(CondRegs.size() == 1);
2317 }
2318
2319 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2320 if (CondBank == &AMDGPU::SGPRRegBank) {
2321 const LLT S32 = LLT::scalar(SizeInBits: 32);
2322 Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32);
2323 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2324
2325 MI.getOperand(i: 1).setReg(NewCondReg);
2326 B.buildZExt(Res: NewCondReg, Op: CondRegs[0]);
2327 }
2328
2329 if (DstTy.getSizeInBits() != 64)
2330 break;
2331
2332 LLT HalfTy = getHalfSizedType(Ty: DstTy);
2333
2334 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2335 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));
2336 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(OpIdx: 3));
2337
2338 // All inputs are SGPRs, nothing special to do.
2339 if (DefRegs.empty()) {
2340 assert(Src1Regs.empty() && Src2Regs.empty());
2341 break;
2342 }
2343
2344 if (Src1Regs.empty())
2345 split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
2346 else {
2347 setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);
2348 }
2349
2350 if (Src2Regs.empty())
2351 split64BitValueForMapping(B, Regs&: Src2Regs, HalfTy, Reg: MI.getOperand(i: 3).getReg());
2352 else
2353 setRegsToType(MRI, Regs: Src2Regs, NewTy: HalfTy);
2354
2355 setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);
2356
2357 B.buildSelect(Res: DefRegs[0], Tst: CondRegs[0], Op0: Src1Regs[0], Op1: Src2Regs[0]);
2358 B.buildSelect(Res: DefRegs[1], Tst: CondRegs[0], Op0: Src1Regs[1], Op1: Src2Regs[1]);
2359
2360 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2361 MI.eraseFromParent();
2362 return;
2363 }
2364 case AMDGPU::G_BRCOND: {
2365 Register CondReg = MI.getOperand(i: 0).getReg();
2366 // FIXME: Should use legalizer helper, but should change bool ext type.
2367 const RegisterBank *CondBank =
2368 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2369
2370 if (CondBank == &AMDGPU::SGPRRegBank) {
2371 const LLT S32 = LLT::scalar(SizeInBits: 32);
2372 Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32);
2373 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2374
2375 MI.getOperand(i: 0).setReg(NewCondReg);
2376 B.buildZExt(Res: NewCondReg, Op: CondReg);
2377 return;
2378 }
2379
2380 break;
2381 }
2382 case AMDGPU::G_AND:
2383 case AMDGPU::G_OR:
2384 case AMDGPU::G_XOR: {
2385 // 64-bit and/or/xor is only available on the SALU, so split into 2 32-bit
2386 // ops if there is a VGPR input.
2387 Register DstReg = MI.getOperand(i: 0).getReg();
2388 LLT DstTy = MRI.getType(Reg: DstReg);
2389
2390 if (DstTy.getSizeInBits() == 1) {
2391 const RegisterBank *DstBank =
2392 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2393 if (DstBank == &AMDGPU::VCCRegBank)
2394 break;
2395
2396 MachineFunction *MF = MI.getParent()->getParent();
2397 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2398 LegalizerHelper Helper(*MF, ApplyBank, B);
2399
2400 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: LLT::scalar(SizeInBits: 32)) !=
2401 LegalizerHelper::Legalized)
2402 llvm_unreachable("widen scalar should have succeeded");
2403 return;
2404 }
2405
2406 if (DstTy.getSizeInBits() != 64)
2407 break;
2408
2409 LLT HalfTy = getHalfSizedType(Ty: DstTy);
2410 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2411 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1));
2412 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2));
2413
2414 // All inputs are SGPRs, nothing special to do.
2415 if (DefRegs.empty()) {
2416 assert(Src0Regs.empty() && Src1Regs.empty());
2417 break;
2418 }
2419
2420 assert(DefRegs.size() == 2);
2421 assert(Src0Regs.size() == Src1Regs.size() &&
2422 (Src0Regs.empty() || Src0Regs.size() == 2));
2423
2424 // Depending on where the source registers came from, the generic code may
2425 // have decided to split the inputs already or not. If not, we still need to
2426 // extract the values.
2427
2428 if (Src0Regs.empty())
2429 split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg());
2430 else
2431 setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy);
2432
2433 if (Src1Regs.empty())
2434 split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg());
2435 else
2436 setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy);
2437
2438 setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy);
2439
2440 B.buildInstr(Opc, DstOps: {DefRegs[0]}, SrcOps: {Src0Regs[0], Src1Regs[0]});
2441 B.buildInstr(Opc, DstOps: {DefRegs[1]}, SrcOps: {Src0Regs[1], Src1Regs[1]});
2442
2443 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2444 MI.eraseFromParent();
2445 return;
2446 }
2447 case AMDGPU::G_ABS: {
2448 Register SrcReg = MI.getOperand(i: 1).getReg();
2449 const RegisterBank *SrcBank = MRI.getRegBankOrNull(Reg: SrcReg);
2450
2451 // There is no VALU abs instruction, so we need to replace it with a sub and
2452 // max combination.
2453 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2454 MachineFunction *MF = MI.getParent()->getParent();
2455 ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
2456 LegalizerHelper Helper(*MF, Apply, B);
2457
2458 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2459 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2460 return;
2461 }
2462 [[fallthrough]];
2463 }
2464 case AMDGPU::G_ADD:
2465 case AMDGPU::G_SUB:
2466 case AMDGPU::G_MUL:
2467 case AMDGPU::G_SHL:
2468 case AMDGPU::G_LSHR:
2469 case AMDGPU::G_ASHR:
2470 case AMDGPU::G_SMIN:
2471 case AMDGPU::G_SMAX:
2472 case AMDGPU::G_UMIN:
2473 case AMDGPU::G_UMAX: {
2474 Register DstReg = MI.getOperand(i: 0).getReg();
2475 LLT DstTy = MRI.getType(Reg: DstReg);
2476
2477 // Special case for s_mul_u64: there is no vector equivalent, so we have to
2478 // break down s_mul_u64 into 32-bit vector multiplications.
2480 if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
2481 applyMappingSMULU64(B, OpdMapper);
2482 return;
2483 }
2484
2485 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2486 // Packed 16-bit operations need to be scalarized and promoted.
2487 if (DstTy != LLT::scalar(SizeInBits: 16) && DstTy != LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16))
2488 break;
2489
2490 const RegisterBank *DstBank =
2491 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2492 if (DstBank == &AMDGPU::VGPRRegBank)
2493 break;
2494
2495 const LLT S32 = LLT::scalar(SizeInBits: 32);
2496 MachineBasicBlock *MBB = MI.getParent();
2497 MachineFunction *MF = MBB->getParent();
2498 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2499
2500 if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
2501 Register WideSrcLo, WideSrcHi;
2502
2503 std::tie(args&: WideSrcLo, args&: WideSrcHi) =
2504 unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: TargetOpcode::G_SEXT);
2505 auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
2506 auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
2507 B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(0), Hi.getReg(0)});
2508 MI.eraseFromParent();
2509 return;
2510 }
2511
2512 if (DstTy.isVector()) {
2513 Register WideSrc0Lo, WideSrc0Hi;
2514 Register WideSrc1Lo, WideSrc1Hi;
2515
2516 unsigned ExtendOp = getExtendOp(Opc: MI.getOpcode());
2517 std::tie(args&: WideSrc0Lo, args&: WideSrc0Hi)
2518 = unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: ExtendOp);
2519 std::tie(args&: WideSrc1Lo, args&: WideSrc1Hi)
2520 = unpackV2S16ToS32(B, Src: MI.getOperand(i: 2).getReg(), ExtOpcode: ExtendOp);
2521 auto Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Lo, WideSrc1Lo});
2522 auto Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Hi, WideSrc1Hi});
2523 B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)});
2524 MI.eraseFromParent();
2525 } else {
2526 LegalizerHelper Helper(*MF, ApplySALU, B);
2527
2528 if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized)
2529 llvm_unreachable("widen scalar should have succeeded");
2530
2531 // FIXME: s16 shift amounts should be legal.
2532 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2533 Opc == AMDGPU::G_ASHR) {
2534 B.setInsertPt(MBB&: *MBB, II: MI.getIterator());
2535 if (Helper.widenScalar(MI, TypeIdx: 1, WideTy: S32) != LegalizerHelper::Legalized)
2536 llvm_unreachable("widen scalar should have succeeded");
2537 }
2538 }
2539
2540 return;
2541 }
2542 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
2543 case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
2544 // This is a special case for s_mul_u64. We use the
2545 // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
2546 // where the 33 higher bits are sign-extended and the
2547 // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
2548 // where the 32 higher bits are zero-extended. If scalar registers are
2549 // selected, both opcodes are lowered as s_mul_u64. If vector registers
2550 // are selected, G_AMDGPU_S_MUL_I64_I32 and
2551 // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.
2552
2553 // Insert basic copies.
2554 applyDefaultMapping(OpdMapper);
2555
2556 Register DstReg = MI.getOperand(i: 0).getReg();
2557 Register SrcReg0 = MI.getOperand(i: 1).getReg();
2558 Register SrcReg1 = MI.getOperand(i: 2).getReg();
2559 const LLT S32 = LLT::scalar(SizeInBits: 32);
2560 const LLT S64 = LLT::scalar(SizeInBits: 64);
2561 assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
2562 "that handles only 64-bit operands.");
2563 const RegisterBank *DstBank =
2564 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2565
2566 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2567 // with s_mul_u64 operation.
2568 if (DstBank == &AMDGPU::SGPRRegBank) {
2569 MI.setDesc(TII->get(AMDGPU::S_MUL_U64));
2570 MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);
2571 MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);
2572 MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);
2573 return;
2574 }
2575
2576 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2577 // with a vector mad.
2578 assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
2579 "The destination operand should be in vector registers.");
2580
2581 DebugLoc DL = MI.getDebugLoc();
2582
2583 // Extract the lower subregister from the first operand.
2584 Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2585 MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
2586 MRI.setType(VReg: Op0L, Ty: S32);
2587 B.buildTrunc(Res: Op0L, Op: SrcReg0);
2588
2589 // Extract the lower subregister from the second operand.
2590 Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2591 MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
2592 MRI.setType(VReg: Op1L, Ty: S32);
2593 B.buildTrunc(Res: Op1L, Op: SrcReg1);
2594
2595 unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
2596 ? AMDGPU::G_AMDGPU_MAD_U64_U32
2597 : AMDGPU::G_AMDGPU_MAD_I64_I32;
2598
2599 MachineIRBuilder B(MI);
2600 Register Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
2601 MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);
2602 Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2603 MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);
2604 B.buildInstr(Opc: NewOpc, DstOps: {DstReg, CarryOut}, SrcOps: {Op0L, Op1L, Zero64});
2605 MI.eraseFromParent();
2606 return;
2607 }
2608 case AMDGPU::G_SEXT_INREG: {
2609 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));
2610 if (SrcRegs.empty())
2611 break; // Nothing to repair
2612
2613 const LLT S32 = LLT::scalar(SizeInBits: 32);
2614 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
2615
2616 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2617 // we would need to further expand, and doesn't let us directly set the
2618 // result registers.
2619 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));
2620
2621 int Amt = MI.getOperand(i: 2).getImm();
2622 if (Amt <= 32) {
2623 // Downstream users have expectations for the high bit behavior, so freeze
2624 // incoming undefined bits.
2625 if (Amt == 32) {
2626 // The low bits are unchanged.
2627 B.buildFreeze(Dst: DstRegs[0], Src: SrcRegs[0]);
2628 } else {
2629 auto Freeze = B.buildFreeze(Dst: S32, Src: SrcRegs[0]);
2630 // Extend in the low bits and propagate the sign bit to the high half.
2631 B.buildSExtInReg(Res: DstRegs[0], Op: Freeze, ImmOp: Amt);
2632 }
2633
2634 B.buildAShr(Dst: DstRegs[1], Src0: DstRegs[0], Src1: B.buildConstant(Res: S32, Val: 31));
2635 } else {
2636 // The low bits are unchanged, and extend in the high bits.
2637 // No freeze required
2638 B.buildCopy(Res: DstRegs[0], Op: SrcRegs[0]);
2639 B.buildSExtInReg(Res: DstRegs[1], Op: DstRegs[0], ImmOp: Amt - 32);
2640 }
2641
2642 Register DstReg = MI.getOperand(i: 0).getReg();
2643 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2644 MI.eraseFromParent();
2645 return;
2646 }
2647 case AMDGPU::G_CTPOP:
2648 case AMDGPU::G_BITREVERSE: {
2649 const RegisterBank *DstBank =
2650 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2651 if (DstBank == &AMDGPU::SGPRRegBank)
2652 break;
2653
2654 Register SrcReg = MI.getOperand(i: 1).getReg();
2655 const LLT S32 = LLT::scalar(SizeInBits: 32);
2656 LLT Ty = MRI.getType(Reg: SrcReg);
2657 if (Ty == S32)
2658 break;
2659
2660 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2661
2662 MachineFunction &MF = B.getMF();
2663 LegalizerHelper Helper(MF, ApplyVALU, B);
2664
2665 if (Helper.narrowScalar(MI, TypeIdx: 1, NarrowTy: S32) != LegalizerHelper::Legalized)
2666 llvm_unreachable("narrowScalar should have succeeded");
2667 return;
2668 }
2669 case AMDGPU::G_AMDGPU_FFBH_U32:
2670 case AMDGPU::G_AMDGPU_FFBL_B32:
2671 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2672 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2673 const RegisterBank *DstBank =
2674 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2675 if (DstBank == &AMDGPU::SGPRRegBank)
2676 break;
2677
2678 Register SrcReg = MI.getOperand(i: 1).getReg();
2679 const LLT S32 = LLT::scalar(SizeInBits: 32);
2680 LLT Ty = MRI.getType(Reg: SrcReg);
2681 if (Ty == S32)
2682 break;
2683
2684 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2685 // which return -1 when the input is zero:
2686 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2687 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2688 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2689 // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
2690 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2691 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1));
2692 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2693 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2694 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2695 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2696 : Opc;
2697 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2698 auto X = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx]});
2699 auto Y = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx ^ 1]});
2700 unsigned AddOpc =
2701 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2702 ? AMDGPU::G_ADD
2703 : AMDGPU::G_UADDSAT;
2704 Y = B.buildInstr(Opc: AddOpc, DstOps: {S32}, SrcOps: {Y, B.buildConstant(Res: S32, Val: 32)});
2705 Register DstReg = MI.getOperand(i: 0).getReg();
2706 B.buildUMin(Dst: DstReg, Src0: X, Src1: Y);
2707 MI.eraseFromParent();
2708 return;
2709 }
2710 case AMDGPU::G_SEXT:
2711 case AMDGPU::G_ZEXT:
2712 case AMDGPU::G_ANYEXT: {
2713 Register SrcReg = MI.getOperand(i: 1).getReg();
2714 LLT SrcTy = MRI.getType(Reg: SrcReg);
2715 const bool Signed = Opc == AMDGPU::G_SEXT;
2716
2717 assert(OpdMapper.getVRegs(1).empty());
2718
2719 const RegisterBank *SrcBank =
2720 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
2721
2722 Register DstReg = MI.getOperand(i: 0).getReg();
2723 LLT DstTy = MRI.getType(Reg: DstReg);
2724 if (DstTy.isScalar() &&
2725 SrcBank != &AMDGPU::SGPRRegBank &&
2726 SrcBank != &AMDGPU::VCCRegBank &&
2727 // FIXME: Should handle any type that rounds to s64 when irregular
2728 // breakdowns are supported.
2729 DstTy.getSizeInBits() == 64 &&
2730 SrcTy.getSizeInBits() <= 32) {
2731 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2732
2733 // Extend to 32-bit, and then extend the low half.
2734 if (Signed) {
2735 // TODO: Should really be buildSExtOrCopy
2736 B.buildSExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
2737 } else if (Opc == AMDGPU::G_ZEXT) {
2738 B.buildZExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
2739 } else {
2740 B.buildAnyExtOrTrunc(Res: DefRegs[0], Op: SrcReg);
2741 }
2742
2743 extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank);
2744 MRI.setRegBank(Reg: DstReg, RegBank: *SrcBank);
2745 MI.eraseFromParent();
2746 return;
2747 }
2748
2749 if (SrcTy != LLT::scalar(SizeInBits: 1))
2750 return;
2751
2752 // It is not legal to have a legalization artifact with a VCC source. Rather
2753 // than introducing a copy, directly insert the select that such a copy
2754 // would have to be selected to.
2755 if (SrcBank == &AMDGPU::VCCRegBank) {
2756 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0));
2757
2758 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2759
2760 unsigned DstSize = DstTy.getSizeInBits();
2761 // 64-bit select is SGPR only
2762 const bool UseSel64 = DstSize > 32 &&
2763 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2764
2765 // TODO: Should s16 select be legal?
2766 LLT SelType = UseSel64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32);
2767 auto True = B.buildConstant(Res: SelType, Val: Signed ? -1 : 1);
2768 auto False = B.buildConstant(Res: SelType, Val: 0);
2769
2770 MRI.setRegBank(Reg: True.getReg(Idx: 0), RegBank: *DstBank);
2771 MRI.setRegBank(Reg: False.getReg(Idx: 0), RegBank: *DstBank);
2772 MRI.setRegBank(Reg: DstReg, RegBank: *DstBank);
2773
2774 if (DstSize > 32) {
2775 B.buildSelect(Res: DefRegs[0], Tst: SrcReg, Op0: True, Op1: False);
2776 extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank, IsBooleanSrc: true);
2777 } else if (DstSize < 32) {
2778 auto Sel = B.buildSelect(Res: SelType, Tst: SrcReg, Op0: True, Op1: False);
2779 MRI.setRegBank(Reg: Sel.getReg(Idx: 0), RegBank: *DstBank);
2780 B.buildTrunc(Res: DstReg, Op: Sel);
2781 } else {
2782 B.buildSelect(Res: DstReg, Tst: SrcReg, Op0: True, Op1: False);
2783 }
2784
2785 MI.eraseFromParent();
2786 return;
2787 }
2788
2789 break;
2790 }
2791 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2792 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0));
2793
2794 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2795
2796 Register DstReg = MI.getOperand(i: 0).getReg();
2797 Register SrcReg = MI.getOperand(i: 1).getReg();
2798
2799 const LLT S32 = LLT::scalar(SizeInBits: 32);
2800 LLT DstTy = MRI.getType(Reg: DstReg);
2801 LLT SrcTy = MRI.getType(Reg: SrcReg);
2802
2803 if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2804 return;
2805
2806 const ValueMapping &DstMapping
2807 = OpdMapper.getInstrMapping().getOperandMapping(i: 0);
2808 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2809 const RegisterBank *SrcBank =
2810 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
2811 const RegisterBank *IdxBank =
2812 OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
2813
2814 Register BaseIdxReg;
2815 unsigned ConstOffset;
2816 std::tie(args&: BaseIdxReg, args&: ConstOffset) =
2817 AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 2).getReg());
2818
2819 // See if the index is an add of a constant which will be foldable by moving
2820 // the base register of the index later if this is going to be executed in a
2821 // waterfall loop. This is essentially to reassociate the add of a constant
2822 // with the readfirstlane.
2823 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2824 ConstOffset > 0 &&
2825 ConstOffset < SrcTy.getNumElements();
2826
2827 // Move the base register. We'll re-insert the add later.
2828 if (ShouldMoveIndexIntoLoop)
2829 MI.getOperand(i: 2).setReg(BaseIdxReg);
2830
2831 // If this is a VGPR result only because the index was a VGPR result, the
2832 // actual indexing will be done on the SGPR source vector, which will
2833 // produce a scalar result. We need to copy to the VGPR result inside the
2834 // waterfall loop.
2835 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2836 SrcBank == &AMDGPU::SGPRRegBank;
2837 if (DstRegs.empty()) {
2838 applyDefaultMapping(OpdMapper);
2839
2840 executeInWaterfallLoop(B, MI, OpIndices: {2});
2841
2842 if (NeedCopyToVGPR) {
2843 // We don't want a phi for this temporary reg.
2844 Register TmpReg = MRI.createGenericVirtualRegister(Ty: DstTy);
2845 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2846 MI.getOperand(i: 0).setReg(TmpReg);
2847 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
2848
2849 // Use a v_mov_b32 here to make the exec dependency explicit.
2850 buildVCopy(B, DstReg, SrcReg: TmpReg);
2851 }
2852
2853 // Re-insert the constant offset add inside the waterfall loop.
2854 if (ShouldMoveIndexIntoLoop)
2855 reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 2, ConstOffset);
2856
2857 return;
2858 }
2859
2860 assert(DstTy.getSizeInBits() == 64);
2861
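// A 64-bit element is extracted as two 32-bit pieces: bitcast the source to
// a vector of 32-bit elements and extract elements 2 * Idx and 2 * Idx + 1.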
2862 LLT Vec32 = LLT::fixed_vector(NumElements: 2 * SrcTy.getNumElements(), ScalarSizeInBits: 32);
2863
2864 auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg);
2865 auto One = B.buildConstant(Res: S32, Val: 1);
2866
2867 MachineBasicBlock::iterator MII = MI.getIterator();
2868
2869 // Split the vector index into 32-bit pieces. Prepare to move all of the
2870 // new instructions into a waterfall loop if necessary.
2871 //
2872 // Don't put the bitcast or constant in the loop.
2873 MachineInstrSpan Span(MII, &B.getMBB());
2874
2875 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2876 auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One);
2877 auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One);
2878
2879 auto Extract0 = B.buildExtractVectorElement(Res: DstRegs[0], Val: CastSrc, Idx: IdxLo);
2880 auto Extract1 = B.buildExtractVectorElement(Res: DstRegs[1], Val: CastSrc, Idx: IdxHi);
2881
2882 MRI.setRegBank(Reg: DstReg, RegBank: *DstBank);
2883 MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank);
2884 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2885 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2886 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2887
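    // If the index is actually uniform, no waterfall loop is needed; the two
    // 32-bit extracts built above already replace the original instruction.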
2888 SmallSet<Register, 4> OpsToWaterfall;
2889 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2890 MI.eraseFromParent();
2891 return;
2892 }
2893
2894 // Remove the original instruction to avoid potentially confusing the
2895 // waterfall loop logic.
2896 B.setInstr(*Span.begin());
2897 MI.eraseFromParent();
2898 executeInWaterfallLoop(B, make_range(x: Span.begin(), y: Span.end()),
2899 OpsToWaterfall);
2900
2901 if (NeedCopyToVGPR) {
2902 MachineBasicBlock *LoopBB = Extract1->getParent();
2903 Register TmpReg0 = MRI.createGenericVirtualRegister(Ty: S32);
2904 Register TmpReg1 = MRI.createGenericVirtualRegister(Ty: S32);
2905 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2906 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2907
2908 Extract0->getOperand(i: 0).setReg(TmpReg0);
2909 Extract1->getOperand(i: 0).setReg(TmpReg1);
2910
2911 B.setInsertPt(MBB&: *LoopBB, II: ++Extract1->getIterator());
2912
2913 buildVCopy(B, DstReg: DstRegs[0], SrcReg: TmpReg0);
2914 buildVCopy(B, DstReg: DstRegs[1], SrcReg: TmpReg1);
2915 }
2916
2917 if (ShouldMoveIndexIntoLoop)
2918 reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset);
2919
2920 return;
2921 }
2922 case AMDGPU::G_INSERT_VECTOR_ELT: {
2923 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2));
2924
2925 Register DstReg = MI.getOperand(i: 0).getReg();
2926 LLT VecTy = MRI.getType(Reg: DstReg);
2927
2928 assert(OpdMapper.getVRegs(0).empty());
2929 assert(OpdMapper.getVRegs(3).empty());
2930
2931 if (substituteSimpleCopyRegs(OpdMapper, OpIdx: 1))
2932 MRI.setType(VReg: MI.getOperand(i: 1).getReg(), Ty: VecTy);
2933
2934 if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2935 return;
2936
2937 const RegisterBank *IdxBank =
2938 OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank;
2939
2940 Register SrcReg = MI.getOperand(i: 1).getReg();
2941 Register InsReg = MI.getOperand(i: 2).getReg();
2942 LLT InsTy = MRI.getType(Reg: InsReg);
2943 (void)InsTy;
2944
2945 Register BaseIdxReg;
2946 unsigned ConstOffset;
2947 std::tie(args&: BaseIdxReg, args&: ConstOffset) =
2948 AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 3).getReg());
2949
    // See if the index is an add of a constant. If so, only the base register
    // needs to go through the waterfall loop; the constant add is re-inserted
    // inside the loop after the readfirstlane, which effectively reassociates
    // the add of the constant with the readfirstlane.
2954 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2955 ConstOffset > 0 &&
2956 ConstOffset < VecTy.getNumElements();
2957
2958 // Move the base register. We'll re-insert the add later.
2959 if (ShouldMoveIndexIntoLoop)
2960 MI.getOperand(i: 3).setReg(BaseIdxReg);
2961
2963 if (InsRegs.empty()) {
2964 executeInWaterfallLoop(B, MI, OpIndices: {3});
2965
2966 // Re-insert the constant offset add inside the waterfall loop.
2967 if (ShouldMoveIndexIntoLoop) {
2968 reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 3, ConstOffset);
2969 }
2970
2971 return;
2972 }
2973
2974 assert(InsTy.getSizeInBits() == 64);
2975
2976 const LLT S32 = LLT::scalar(SizeInBits: 32);
2977 LLT Vec32 = LLT::fixed_vector(NumElements: 2 * VecTy.getNumElements(), ScalarSizeInBits: 32);
2978
2979 auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg);
2980 auto One = B.buildConstant(Res: S32, Val: 1);
2981
    // Split the 64-bit element insert into two 32-bit inserts on the bitcast
    // vector. Prepare to move all of the new instructions into a waterfall
    // loop if necessary.
2984 //
2985 // Don't put the bitcast or constant in the loop.
2986 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2987
2988 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2989 auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One);
2990 auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One);
2991
2992 auto InsLo = B.buildInsertVectorElement(Res: Vec32, Val: CastSrc, Elt: InsRegs[0], Idx: IdxLo);
2993 auto InsHi = B.buildInsertVectorElement(Res: Vec32, Val: InsLo, Elt: InsRegs[1], Idx: IdxHi);
2994
2995 const RegisterBank *DstBank =
2996 OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank;
2997 const RegisterBank *SrcBank =
2998 OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank;
2999 const RegisterBank *InsSrcBank =
3000 OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank;
3001
3002 MRI.setRegBank(Reg: InsReg, RegBank: *InsSrcBank);
3003 MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank);
3004 MRI.setRegBank(Reg: InsLo.getReg(Idx: 0), RegBank: *DstBank);
3005 MRI.setRegBank(Reg: InsHi.getReg(Idx: 0), RegBank: *DstBank);
3006 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
3007 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
3008 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
3010
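    // If the index is actually uniform, no waterfall loop is needed; just
    // bitcast the 32-bit insert result back to the original vector type.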
3011 SmallSet<Register, 4> OpsToWaterfall;
3012 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
3013 B.setInsertPt(MBB&: B.getMBB(), II: MI);
3014 B.buildBitcast(Dst: DstReg, Src: InsHi);
3015 MI.eraseFromParent();
3016 return;
3017 }
3018
3019 B.setInstr(*Span.begin());
3020 MI.eraseFromParent();
3021
3022 // Figure out the point after the waterfall loop before mangling the control
3023 // flow.
3024 executeInWaterfallLoop(B, make_range(x: Span.begin(), y: Span.end()),
3025 OpsToWaterfall);
3026
3027 // The insertion point is now right after the original instruction.
3028 //
    // Keep the bitcast to the original vector type out of the loop. Doing this
    // saves an extra phi we don't need inside the loop.
3031 B.buildBitcast(Dst: DstReg, Src: InsHi);
3032
3033 // Re-insert the constant offset add inside the waterfall loop.
3034 if (ShouldMoveIndexIntoLoop)
3035 reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset);
3036
3037 return;
3038 }
3039 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3040 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3041 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3042 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3043 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3044 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3045 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
3046 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3047 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3048 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3049 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3050 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3051 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3052 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3053 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
3054 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3055 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
3056 applyDefaultMapping(OpdMapper);
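    // The resource descriptor (operand 1) and the scalar offset (operand 4)
    // must be uniform; waterfall over them if they were assigned to VGPRs.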
3057 executeInWaterfallLoop(B, MI, OpIndices: {1, 4});
3058 return;
3059 }
3060 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3061 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3062 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3063 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3064 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3065 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3066 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3067 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3068 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3069 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3070 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3071 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
3072 applyDefaultMapping(OpdMapper);
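    // For the buffer atomics the resource descriptor is operand 2 and the
    // scalar offset is operand 5; both must be uniform.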
3073 executeInWaterfallLoop(B, MI, OpIndices: {2, 5});
3074 return;
3075 }
3076 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3077 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
3078 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3079 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3080 applyDefaultMapping(OpdMapper);
3081 executeInWaterfallLoop(B, MI, OpIndices: {2, 5});
3082 return;
3083 }
3084 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3085 applyDefaultMapping(OpdMapper);
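    // The compare-and-swap has an extra data operand, so the resource
    // descriptor is operand 3 and the scalar offset is operand 6.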
3086 executeInWaterfallLoop(B, MI, OpIndices: {3, 6});
3087 return;
3088 }
3089 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
3090 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
3091 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
3092 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
3093 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
3094 applyMappingSBufferLoad(B, OpdMapper);
3095 return;
3096 }
3097 case AMDGPU::G_INTRINSIC:
3098 case AMDGPU::G_INTRINSIC_CONVERGENT: {
3099 switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) {
3100 case Intrinsic::amdgcn_readlane: {
3101 substituteSimpleCopyRegs(OpdMapper, OpIdx: 2);
3102
3103 assert(OpdMapper.getVRegs(0).empty());
3104 assert(OpdMapper.getVRegs(3).empty());
3105
3106 // Make sure the index is an SGPR. It doesn't make sense to run this in a
3107 // waterfall loop, so assume it's a uniform value.
3108 constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index
3109 return;
3110 }
3111 case Intrinsic::amdgcn_writelane: {
3112 assert(OpdMapper.getVRegs(0).empty());
3113 assert(OpdMapper.getVRegs(2).empty());
3114 assert(OpdMapper.getVRegs(3).empty());
3115
3116 substituteSimpleCopyRegs(OpdMapper, OpIdx: 4); // VGPR input val
3117 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Source value
3118 constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index
3119 return;
3120 }
3121 case Intrinsic::amdgcn_interp_p1:
3122 case Intrinsic::amdgcn_interp_p2:
3123 case Intrinsic::amdgcn_interp_mov:
3124 case Intrinsic::amdgcn_interp_p1_f16:
3125 case Intrinsic::amdgcn_interp_p2_f16:
3126 case Intrinsic::amdgcn_lds_param_load: {
3127 applyDefaultMapping(OpdMapper);
3128
      // Readfirstlane for the m0 value, which is always the last operand.
3130 // FIXME: Should this be a waterfall loop instead?
3131 constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index
3132 return;
3133 }
3134 case Intrinsic::amdgcn_interp_inreg_p10:
3135 case Intrinsic::amdgcn_interp_inreg_p2:
3136 case Intrinsic::amdgcn_interp_inreg_p10_f16:
3137 case Intrinsic::amdgcn_interp_inreg_p2_f16:
3138 case Intrinsic::amdgcn_interp_p10_rtz_f16:
3139 case Intrinsic::amdgcn_interp_p2_rtz_f16:
3140 applyDefaultMapping(OpdMapper);
3141 return;
3142 case Intrinsic::amdgcn_permlane16:
3143 case Intrinsic::amdgcn_permlanex16: {
3144 // Doing a waterfall loop over these wouldn't make any sense.
3145 substituteSimpleCopyRegs(OpdMapper, OpIdx: 2);
3146 substituteSimpleCopyRegs(OpdMapper, OpIdx: 3);
3147 constrainOpWithReadfirstlane(B, MI, OpIdx: 4);
3148 constrainOpWithReadfirstlane(B, MI, OpIdx: 5);
3149 return;
3150 }
3151 case Intrinsic::amdgcn_sbfe:
3152 applyMappingBFE(B, OpdMapper, Signed: true);
3153 return;
3154 case Intrinsic::amdgcn_ubfe:
3155 applyMappingBFE(B, OpdMapper, Signed: false);
3156 return;
3157 case Intrinsic::amdgcn_inverse_ballot:
3158 case Intrinsic::amdgcn_s_bitreplicate:
3159 case Intrinsic::amdgcn_s_quadmask:
3160 case Intrinsic::amdgcn_s_wqm:
3161 applyDefaultMapping(OpdMapper);
3162 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Mask
3163 return;
3164 case Intrinsic::amdgcn_ballot:
3165 // Use default handling and insert copy to vcc source.
3166 break;
3167 }
3168 break;
3169 }
3170 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3171 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3172 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3173 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3174 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3175 AMDGPU::lookupRsrcIntrinsic(Intr: AMDGPU::getIntrinsicID(I: MI));
3176 assert(RSrcIntrin && RSrcIntrin->IsImage);
3177 // Non-images can have complications from operands that allow both SGPR
3178 // and VGPR. For now it's too complicated to figure out the final opcode
3179 // to derive the register bank from the MCInstrDesc.
3180 applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg);
3181 return;
3182 }
3183 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3184 unsigned N = MI.getNumExplicitOperands() - 2;
3185 applyDefaultMapping(OpdMapper);
3186 executeInWaterfallLoop(B, MI, OpIndices: {N});
3187 return;
3188 }
3189 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3190 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3191 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
3192 switch (IntrID) {
3193 case Intrinsic::amdgcn_ds_ordered_add:
3194 case Intrinsic::amdgcn_ds_ordered_swap: {
3195 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3196 assert(OpdMapper.getVRegs(0).empty());
3197 substituteSimpleCopyRegs(OpdMapper, OpIdx: 3);
3198 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3199 return;
3200 }
3201 case Intrinsic::amdgcn_ds_gws_init:
3202 case Intrinsic::amdgcn_ds_gws_barrier:
3203 case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane executes, so readfirstlane is safe.
3205 substituteSimpleCopyRegs(OpdMapper, OpIdx: 1);
3206 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3207 return;
3208 }
3209 case Intrinsic::amdgcn_ds_gws_sema_v:
3210 case Intrinsic::amdgcn_ds_gws_sema_p:
3211 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane executes, so readfirstlane is safe.
3213 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0
3214 return;
3215 }
3216 case Intrinsic::amdgcn_ds_append:
3217 case Intrinsic::amdgcn_ds_consume: {
3218 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3219 return;
3220 }
3221 case Intrinsic::amdgcn_s_sendmsg:
3222 case Intrinsic::amdgcn_s_sendmsghalt: {
3223 // FIXME: Should this use a waterfall loop?
3224 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3225 return;
3226 }
3227 case Intrinsic::amdgcn_s_setreg: {
3228 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3229 return;
3230 }
3231 case Intrinsic::amdgcn_s_ttracedata:
3232 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0
3233 return;
3234 case Intrinsic::amdgcn_raw_buffer_load_lds:
3235 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3236 applyDefaultMapping(OpdMapper);
3237 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc
3238 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3239 constrainOpWithReadfirstlane(B, MI, OpIdx: 5); // soffset
3240 return;
3241 }
3242 case Intrinsic::amdgcn_struct_buffer_load_lds:
3243 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3244 applyDefaultMapping(OpdMapper);
3245 constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc
3246 constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0
3247 constrainOpWithReadfirstlane(B, MI, OpIdx: 6); // soffset
3248 return;
3249 }
3250 case Intrinsic::amdgcn_global_load_lds: {
3251 applyDefaultMapping(OpdMapper);
3252 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3253 return;
3254 }
3255 case Intrinsic::amdgcn_lds_direct_load: {
3256 applyDefaultMapping(OpdMapper);
      // Readfirstlane for the m0 value, which is always the last operand.
3258 constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index
3259 return;
3260 }
3261 case Intrinsic::amdgcn_exp_row:
3262 applyDefaultMapping(OpdMapper);
3263 constrainOpWithReadfirstlane(B, MI, OpIdx: 8); // M0
3264 return;
3265 case Intrinsic::amdgcn_s_sleep_var:
3266 assert(OpdMapper.getVRegs(1).empty());
3267 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3268 return;
3269 case Intrinsic::amdgcn_s_barrier_signal_var:
3270 case Intrinsic::amdgcn_s_barrier_join:
3271 case Intrinsic::amdgcn_s_wakeup_barrier:
3272 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3273 return;
3274 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
3275 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3276 return;
3277 case Intrinsic::amdgcn_s_barrier_init:
3278 constrainOpWithReadfirstlane(B, MI, OpIdx: 1);
3279 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3280 return;
3281 case Intrinsic::amdgcn_s_get_barrier_state: {
3282 constrainOpWithReadfirstlane(B, MI, OpIdx: 2);
3283 return;
3284 }
3285 default: {
3286 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3287 AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
3288 // Non-images can have complications from operands that allow both SGPR
3289 // and VGPR. For now it's too complicated to figure out the final opcode
3290 // to derive the register bank from the MCInstrDesc.
3291 if (RSrcIntrin->IsImage) {
3292 applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg);
3293 return;
3294 }
3295 }
3296
3297 break;
3298 }
3299 }
3300 break;
3301 }
3302 case AMDGPU::G_SI_CALL: {
3303 // Use a set to avoid extra readfirstlanes in the case where multiple
3304 // operands are the same register.
3305 SmallSet<Register, 4> SGPROperandRegs;
3306
3307 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3308 break;
3309
    // Move all copies to physical SGPRs that are used by the call instruction
    // into the loop block. Search backwards from the call for these copies,
    // stopping at the ADJCALLSTACKUP.
3313 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3314 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3315
3316 // Move all non-copies before the copies, so that a complete range can be
3317 // moved into the waterfall loop.
3318 SmallVector<MachineInstr *, 4> NonCopyInstrs;
    // Number of NonCopyInstrs entries recorded when LastCopy was last updated.
3320 unsigned NonCopyInstrsLen = 0;
3321 MachineBasicBlock::iterator Start(&MI);
3322 MachineBasicBlock::iterator LastCopy = Start;
3323 MachineBasicBlock *MBB = MI.getParent();
3324 const SIMachineFunctionInfo *Info =
3325 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3326 while (Start->getOpcode() != FrameSetupOpcode) {
3327 --Start;
3328 bool IsCopy = false;
3329 if (Start->getOpcode() == AMDGPU::COPY) {
3330 auto &Dst = Start->getOperand(i: 0);
3331 if (Dst.isReg()) {
3332 Register Reg = Dst.getReg();
3333 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3334 IsCopy = true;
3335 } else {
3336 // Also move the copy from the scratch rsrc descriptor into the loop
3337 // to allow it to be optimized away.
3338 auto &Src = Start->getOperand(i: 1);
3339 if (Src.isReg()) {
3340 Reg = Src.getReg();
3341 IsCopy = Info->getScratchRSrcReg() == Reg;
3342 }
3343 }
3344 }
3345 }
3346
3347 if (IsCopy) {
3348 LastCopy = Start;
3349 NonCopyInstrsLen = NonCopyInstrs.size();
3350 } else {
3351 NonCopyInstrs.push_back(Elt: &*Start);
3352 }
3353 }
3354 NonCopyInstrs.resize(N: NonCopyInstrsLen);
3355
3356 for (auto *NonCopy : reverse(C&: NonCopyInstrs)) {
3357 MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator());
3358 }
3359 Start = LastCopy;
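    // Start now points at the earliest copy into a physical register used by
    // the call; it will become the beginning of the range rewritten into the
    // waterfall loop.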
3360
    // Do the same for the copies following the call.
3362 NonCopyInstrs.clear();
3363 NonCopyInstrsLen = 0;
3364 MachineBasicBlock::iterator End(&MI);
3365 LastCopy = End;
3366 while (End->getOpcode() != FrameDestroyOpcode) {
3367 ++End;
3368 bool IsCopy = false;
3369 if (End->getOpcode() == AMDGPU::COPY) {
3370 auto &Src = End->getOperand(i: 1);
3371 if (Src.isReg()) {
3372 Register Reg = Src.getReg();
3373 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3374 }
3375 }
3376
3377 if (IsCopy) {
3378 LastCopy = End;
3379 NonCopyInstrsLen = NonCopyInstrs.size();
3380 } else {
3381 NonCopyInstrs.push_back(Elt: &*End);
3382 }
3383 }
3384 NonCopyInstrs.resize(N: NonCopyInstrsLen);
3385
3386 End = LastCopy;
3387 ++LastCopy;
3388 for (auto *NonCopy : reverse(C&: NonCopyInstrs)) {
3389 MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator());
3390 }
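    // End currently points at the last copy out of a physical register defined
    // by the call; it is advanced below so that copy is included in the range.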
3391
3392 ++End;
3393 B.setInsertPt(MBB&: B.getMBB(), II: Start);
3394 executeInWaterfallLoop(B, make_range(x: Start, y: End), SGPROperandRegs);
3395 break;
3396 }
3397 case AMDGPU::G_LOAD:
3398 case AMDGPU::G_ZEXTLOAD:
3399 case AMDGPU::G_SEXTLOAD: {
3400 if (applyMappingLoad(B, OpdMapper, MI))
3401 return;
3402 break;
3403 }
3404 case AMDGPU::G_DYN_STACKALLOC:
3405 applyMappingDynStackAlloc(B, OpdMapper, MI);
3406 return;
3407 case AMDGPU::G_STACKRESTORE: {
3408 applyDefaultMapping(OpdMapper);
3409 constrainOpWithReadfirstlane(B, MI, OpIdx: 0);
3410 return;
3411 }
3412 case AMDGPU::G_SBFX:
3413 applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3414 return;
3415 case AMDGPU::G_UBFX:
3416 applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3417 return;
3418 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3419 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3420 applyMappingMAD_64_32(B, OpdMapper);
3421 return;
3422 case AMDGPU::G_PREFETCH: {
3423 if (!Subtarget.hasPrefetch()) {
3424 MI.eraseFromParent();
3425 return;
3426 }
3427 Register PtrReg = MI.getOperand(i: 0).getReg();
3428 unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
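    // Prefetches of divergent (VGPR) addresses are dropped; only uniform
    // pointers into flat, global or constant address spaces are kept.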
3429 if (PtrBank == AMDGPU::VGPRRegBankID) {
3430 MI.eraseFromParent();
3431 return;
3432 }
3433 unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace();
3434 if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
3435 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3436 MI.eraseFromParent();
3437 return;
3438 }
3439 applyDefaultMapping(OpdMapper);
3440 return;
3441 }
3442 default:
3443 break;
3444 }
3445
3446 return applyDefaultMapping(OpdMapper);
3447}
3448
// sgpr, sgpr -> sgpr
// vgpr, sgpr -> vgpr
// vgpr, agpr -> vgpr
// agpr, agpr -> agpr
// agpr, sgpr -> vgpr
3453static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3454 if (RB0 == AMDGPU::InvalidRegBankID)
3455 return RB1;
3456 if (RB1 == AMDGPU::InvalidRegBankID)
3457 return RB0;
3458
3459 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3460 return AMDGPU::SGPRRegBankID;
3461
3462 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3463 return AMDGPU::AGPRRegBankID;
3464
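  // Any other combination involves a VGPR, or mixes AGPR and SGPR, and must
  // be mapped to VGPR.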
3465 return AMDGPU::VGPRRegBankID;
3466}
3467
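// Variant of regBankUnion for boolean values: if either input is already in
// the VCC bank, the result must also be VCC; otherwise fall back to the
// ordinary union.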
3468static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3469 if (RB0 == AMDGPU::InvalidRegBankID)
3470 return RB1;
3471 if (RB1 == AMDGPU::InvalidRegBankID)
3472 return RB0;
3473
3474 // vcc, vcc -> vcc
3475 // vcc, sgpr -> vcc
3476 // vcc, vgpr -> vcc
3477 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3478 return AMDGPU::VCCRegBankID;
3479
  // Any other combination is handled by the ordinary register bank union.
3481 return regBankUnion(RB0, RB1);
3482}
3483
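/// Compute the union of the register banks of all register operands of \p MI,
/// stopping early once a VGPR operand forces a VGPR result.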
3484unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3485 const MachineInstr &MI) const {
3486 unsigned RegBank = AMDGPU::InvalidRegBankID;
3487
3488 for (const MachineOperand &MO : MI.operands()) {
3489 if (!MO.isReg())
3490 continue;
3491 Register Reg = MO.getReg();
3492 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3493 RegBank = regBankUnion(RB0: RegBank, RB1: Bank->getID());
3494 if (RegBank == AMDGPU::VGPRRegBankID)
3495 break;
3496 }
3497 }
3498
3499 return RegBank;
3500}
3501
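/// Return true if no register operand of \p MI has been assigned a bank other
/// than SGPR, i.e. the instruction can use a scalar (SALU) mapping.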
3502bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3503 const MachineFunction &MF = *MI.getParent()->getParent();
3504 const MachineRegisterInfo &MRI = MF.getRegInfo();
3505 for (const MachineOperand &MO : MI.operands()) {
3506 if (!MO.isReg())
3507 continue;
3508 Register Reg = MO.getReg();
3509 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3510 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3511 return false;
3512 }
3513 }
3514 return true;
3515}
3516
3517const RegisterBankInfo::InstructionMapping &
3518AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3519 const MachineFunction &MF = *MI.getParent()->getParent();
3520 const MachineRegisterInfo &MRI = MF.getRegInfo();
3521 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3522
3523 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3524 const MachineOperand &SrcOp = MI.getOperand(i);
3525 if (!SrcOp.isReg())
3526 continue;
3527
3528 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3529 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3530 }
3531 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3532 NumOperands: MI.getNumOperands());
3533}
3534
3535const RegisterBankInfo::InstructionMapping &
3536AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3537 const MachineFunction &MF = *MI.getParent()->getParent();
3538 const MachineRegisterInfo &MRI = MF.getRegInfo();
3539 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3540
3541 // Even though we technically could use SGPRs, this would require knowledge of
3542 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3543 //
3544 // TODO: Unary ops are trivially OK, so accept SGPRs?
3545 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3546 const MachineOperand &Src = MI.getOperand(i);
3547 if (!Src.isReg())
3548 continue;
3549
3550 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3551 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3552 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3553 }
3554
3555 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3556 NumOperands: MI.getNumOperands());
3557}
3558
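/// Map every register operand to the VGPR bank, regardless of the bank it
/// currently has.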
3559const RegisterBankInfo::InstructionMapping &
3560AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3561 const MachineFunction &MF = *MI.getParent()->getParent();
3562 const MachineRegisterInfo &MRI = MF.getRegInfo();
3563 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3564
3565 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3566 const MachineOperand &Op = MI.getOperand(i: I);
3567 if (!Op.isReg())
3568 continue;
3569
3570 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3571 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3572 }
3573
3574 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping),
3575 NumOperands: MI.getNumOperands());
3576}
3577
3578const RegisterBankInfo::InstructionMapping &
3579AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3580 const MachineInstr &MI,
3581 int RsrcIdx) const {
3582 // The reported argument index is relative to the IR intrinsic call arguments,
3583 // so we need to shift by the number of defs and the intrinsic ID.
3584 RsrcIdx += MI.getNumExplicitDefs() + 1;
3585
3586 const int NumOps = MI.getNumOperands();
3587 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3588
3589 // TODO: Should packed/unpacked D16 difference be reported here as part of
3590 // the value mapping?
3591 for (int I = 0; I != NumOps; ++I) {
3592 if (!MI.getOperand(i: I).isReg())
3593 continue;
3594
3595 Register OpReg = MI.getOperand(i: I).getReg();
3596 // We replace some dead address operands with $noreg
3597 if (!OpReg)
3598 continue;
3599
3600 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3601
3602 // FIXME: Probably need a new intrinsic register bank searchable table to
3603 // handle arbitrary intrinsics easily.
3604 //
3605 // If this has a sampler, it immediately follows rsrc.
3606 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3607
3608 if (MustBeSGPR) {
      // This operand must ultimately be an SGPR, but we have to report
      // whatever bank it currently has as legal.
3610 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3611 OpdsMapping[I] = AMDGPU::getValueMapping(BankID: NewBank, Size);
3612 } else {
3613 // Some operands must be VGPR, and these are easy to copy to.
3614 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3615 }
3616 }
3617
3618 return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: NumOps);
3619}
3620
3621/// Return the mapping for a pointer argument.
3622const RegisterBankInfo::ValueMapping *
3623AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3624 Register PtrReg) const {
3625 LLT PtrTy = MRI.getType(Reg: PtrReg);
3626 unsigned Size = PtrTy.getSizeInBits();
3627 if (Subtarget.useFlatForGlobal() ||
3628 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3629 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3630
3631 // If we're using MUBUF instructions for global memory, an SGPR base register
3632 // is possible. Otherwise this needs to be a VGPR.
3633 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3634 return AMDGPU::getValueMapping(BankID: PtrBank->getID(), Size);
3635}
3636
3637const RegisterBankInfo::InstructionMapping &
3638AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3639
3640 const MachineFunction &MF = *MI.getParent()->getParent();
3641 const MachineRegisterInfo &MRI = MF.getRegInfo();
3642 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3643 unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI);
3644 Register PtrReg = MI.getOperand(i: 1).getReg();
3645 LLT PtrTy = MRI.getType(Reg: PtrReg);
3646 unsigned AS = PtrTy.getAddressSpace();
3647 unsigned PtrSize = PtrTy.getSizeInBits();
3648
3649 const ValueMapping *ValMapping;
3650 const ValueMapping *PtrMapping;
3651
3652 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3653
3654 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3655 if (isScalarLoadLegal(MI)) {
3656 // We have a uniform instruction so we want to use an SMRD load
3657 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3658 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3659 } else {
3660 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3661
3662 // If we're using MUBUF instructions for global memory, an SGPR base
3663 // register is possible. Otherwise this needs to be a VGPR.
3664 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3665 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3666
3667 PtrMapping = AMDGPU::getValueMapping(BankID: PtrBankID, Size: PtrSize);
3668 }
3669 } else {
3670 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3671 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3672 }
3673
3674 OpdsMapping[0] = ValMapping;
3675 OpdsMapping[1] = PtrMapping;
3676 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3677 ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: MI.getNumOperands());
3678 return Mapping;
3679
3680 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3681 // handle that during instruction selection?
3682}
3683
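/// Return the ID of the bank currently assigned to \p Reg, or \p Default if
/// no bank has been assigned yet.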
3684unsigned
3685AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3686 const MachineRegisterInfo &MRI,
3687 unsigned Default) const {
3688 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3689 return Bank ? Bank->getID() : Default;
3690}
3691
3692const RegisterBankInfo::ValueMapping *
3693AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3694 const MachineRegisterInfo &MRI,
3695 const TargetRegisterInfo &TRI) const {
  // Lie and claim anything is legal, even though this needs to be an SGPR;
  // applyMapping will have to deal with it as a waterfall loop.
3698 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3699 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3700 return AMDGPU::getValueMapping(BankID: Bank, Size);
3701}
3702
3703const RegisterBankInfo::ValueMapping *
3704AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3705 const MachineRegisterInfo &MRI,
3706 const TargetRegisterInfo &TRI) const {
3707 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3708 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3709}
3710
3711const RegisterBankInfo::ValueMapping *
3712AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3713 const MachineRegisterInfo &MRI,
3714 const TargetRegisterInfo &TRI) const {
3715 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3716 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3717}
3718
///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a copy from
/// VGPR to SGPR to be generated is illegal.
///
/// Operands that must be SGPRs must accept potentially divergent VGPRs as
/// legal. These will be dealt with in applyMappingImpl.
///
3728const RegisterBankInfo::InstructionMapping &
3729AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3730 const MachineFunction &MF = *MI.getParent()->getParent();
3731 const MachineRegisterInfo &MRI = MF.getRegInfo();
3732
3733 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3734 // The default logic bothers to analyze impossible alternative mappings. We
3735 // want the most straightforward mapping, so just directly handle this.
3736 const RegisterBank *DstBank = getRegBank(MI.getOperand(i: 0).getReg(), MRI,
3737 *TRI);
3738 const RegisterBank *SrcBank = getRegBank(MI.getOperand(i: 1).getReg(), MRI,
3739 *TRI);
3740 assert(SrcBank && "src bank should have been assigned already");
3741 if (!DstBank)
3742 DstBank = SrcBank;
3743
3744 unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI);
3745 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3746 cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
3747 return getInvalidInstructionMapping();
3748
3749 const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: *DstBank);
3750 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3751 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3752 OpdsMapping[0] = &ValMap;
3753 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3754 OpdsMapping[1] = &ValMap;
3755
3756 return getInstructionMapping(
3757 ID: 1, /*Cost*/ 1,
3758 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), NumOperands: OpdsMappingSize);
3759 }
3760
3761 if (MI.isRegSequence()) {
3762 // If any input is a VGPR, the result must be a VGPR. The default handling
3763 // assumes any copy between banks is legal.
3764 unsigned BankID = AMDGPU::SGPRRegBankID;
3765
3766 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3767 auto OpBank = getRegBankID(Reg: MI.getOperand(i: I).getReg(), MRI);
3768 // It doesn't make sense to use vcc or scc banks here, so just ignore
3769 // them.
3770 if (OpBank != AMDGPU::SGPRRegBankID) {
3771 BankID = AMDGPU::VGPRRegBankID;
3772 break;
3773 }
3774 }
3775 unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI);
3776
3777 const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: BankID));
3778 return getInstructionMapping(
3779 ID: 1, /*Cost*/ 1,
3780 /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1);
3781 }
3782
  // The default handling is broken and doesn't handle illegal VGPR->SGPR
  // copies properly.
3785 //
3786 // TODO: There are additional exec masking dependencies to analyze.
3787 if (auto *PHI = dyn_cast<GPhi>(Val: &MI)) {
3788 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3789 Register DstReg = PHI->getReg(Idx: 0);
3790
3791 // Sometimes the result may have already been assigned a bank.
3792 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3793 ResultBank = DstBank->getID();
3794
3795 for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
3796 Register Reg = PHI->getIncomingValue(I);
3797 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3798
3799 // FIXME: Assuming VGPR for any undetermined inputs.
3800 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3801 ResultBank = AMDGPU::VGPRRegBankID;
3802 break;
3803 }
3804
3805 // FIXME: Need to promote SGPR case to s32
3806 unsigned OpBank = Bank->getID();
3807 ResultBank = regBankBoolUnion(RB0: ResultBank, RB1: OpBank);
3808 }
3809
3810 assert(ResultBank != AMDGPU::InvalidRegBankID);
3811
3812 unsigned Size = MRI.getType(Reg: DstReg).getSizeInBits();
3813
3814 const ValueMapping &ValMap =
3815 getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: ResultBank));
3816 return getInstructionMapping(
3817 ID: 1, /*Cost*/ 1,
3818 /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1);
3819 }
3820
3821 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3822 if (Mapping.isValid())
3823 return Mapping;
3824
3825 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3826
3827 switch (MI.getOpcode()) {
3828 default:
3829 return getInvalidInstructionMapping();
3830
3831 case AMDGPU::G_AND:
3832 case AMDGPU::G_OR:
3833 case AMDGPU::G_XOR:
3834 case AMDGPU::G_MUL: {
3835 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
3836 if (Size == 1) {
3837 const RegisterBank *DstBank
3838 = getRegBank(MI.getOperand(i: 0).getReg(), MRI, *TRI);
3839
3840 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3841 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3842 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3843 if (DstBank) {
3844 TargetBankID = DstBank->getID();
3845 if (DstBank == &AMDGPU::VCCRegBank) {
3846 TargetBankID = AMDGPU::VCCRegBankID;
3847 BankLHS = AMDGPU::VCCRegBankID;
3848 BankRHS = AMDGPU::VCCRegBankID;
3849 } else {
3850 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3851 AMDGPU::SGPRRegBankID);
3852 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3853 AMDGPU::SGPRRegBankID);
3854 }
3855 } else {
3856 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3857 AMDGPU::VCCRegBankID);
3858 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3859 AMDGPU::VCCRegBankID);
3860
3861 // Both inputs should be true booleans to produce a boolean result.
3862 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3863 TargetBankID = AMDGPU::VGPRRegBankID;
3864 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3865 TargetBankID = AMDGPU::VCCRegBankID;
3866 BankLHS = AMDGPU::VCCRegBankID;
3867 BankRHS = AMDGPU::VCCRegBankID;
3868 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3869 TargetBankID = AMDGPU::SGPRRegBankID;
3870 }
3871 }
3872
3873 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: TargetBankID, Size);
3874 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: BankLHS, Size);
3875 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: BankRHS, Size);
3876 break;
3877 }
3878
3879 if (Size == 64) {
3880
3881 if (isSALUMapping(MI)) {
3882 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3883 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3884 } else {
3885 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3886 unsigned Bank1 = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI /*, DefaultBankID*/);
3887 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank1, Size);
3888
3889 unsigned Bank2 = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI /*, DefaultBankID*/);
3890 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank2, Size);
3891 }
3892
3893 break;
3894 }
3895
3896 [[fallthrough]];
3897 }
3898 case AMDGPU::G_PTR_ADD:
3899 case AMDGPU::G_PTRMASK:
3900 case AMDGPU::G_ADD:
3901 case AMDGPU::G_SUB:
3902 case AMDGPU::G_SHL:
3903 case AMDGPU::G_LSHR:
3904 case AMDGPU::G_ASHR:
3905 case AMDGPU::G_UADDO:
3906 case AMDGPU::G_USUBO:
3907 case AMDGPU::G_UADDE:
3908 case AMDGPU::G_SADDE:
3909 case AMDGPU::G_USUBE:
3910 case AMDGPU::G_SSUBE:
3911 case AMDGPU::G_SMIN:
3912 case AMDGPU::G_SMAX:
3913 case AMDGPU::G_UMIN:
3914 case AMDGPU::G_UMAX:
3915 case AMDGPU::G_ABS:
3916 case AMDGPU::G_SHUFFLE_VECTOR:
3917 case AMDGPU::G_SBFX:
3918 case AMDGPU::G_UBFX:
3919 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
3920 case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
3921 if (isSALUMapping(MI))
3922 return getDefaultMappingSOP(MI);
3923 return getDefaultMappingVOP(MI);
3924 case AMDGPU::G_FADD:
3925 case AMDGPU::G_FSUB:
3926 case AMDGPU::G_FMUL:
3927 case AMDGPU::G_FMA:
3928 case AMDGPU::G_FFLOOR:
3929 case AMDGPU::G_FCEIL:
3930 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
3931 case AMDGPU::G_FMINNUM:
3932 case AMDGPU::G_FMAXNUM:
3933 case AMDGPU::G_FMINIMUM:
3934 case AMDGPU::G_FMAXIMUM:
3935 case AMDGPU::G_INTRINSIC_TRUNC:
3936 case AMDGPU::G_STRICT_FADD:
3937 case AMDGPU::G_STRICT_FSUB:
3938 case AMDGPU::G_STRICT_FMUL:
3939 case AMDGPU::G_STRICT_FMA: {
3940 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3941 unsigned Size = Ty.getSizeInBits();
3942 if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
3943 (Size == 32 || Size == 16) && isSALUMapping(MI))
3944 return getDefaultMappingSOP(MI);
3945 return getDefaultMappingVOP(MI);
3946 }
3947 case AMDGPU::G_FPTOSI:
3948 case AMDGPU::G_FPTOUI:
3949 case AMDGPU::G_SITOFP:
3950 case AMDGPU::G_UITOFP: {
3951 unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
3952 unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
3953 if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
3954 isSALUMapping(MI))
3955 return getDefaultMappingSOP(MI);
3956 return getDefaultMappingVOP(MI);
3957 }
3958 case AMDGPU::G_FPTRUNC:
3959 case AMDGPU::G_FPEXT: {
3960 unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
3961 unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
3962 if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
3963 isSALUMapping(MI))
3964 return getDefaultMappingSOP(MI);
3965 return getDefaultMappingVOP(MI);
3966 }
3967 case AMDGPU::G_FSQRT:
3968 case AMDGPU::G_FEXP2:
3969 case AMDGPU::G_FLOG2: {
3970 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
3971 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
3972 isSALUMapping(MI))
3973 return getDefaultMappingSOP(MI);
3974 return getDefaultMappingVOP(MI);
3975 }
3976 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3977 case AMDGPU::G_SSUBSAT:
3978 case AMDGPU::G_UADDSAT:
3979 case AMDGPU::G_USUBSAT:
3980 case AMDGPU::G_FMAD:
3981 case AMDGPU::G_FLDEXP:
3982 case AMDGPU::G_FMINNUM_IEEE:
3983 case AMDGPU::G_FMAXNUM_IEEE:
3984 case AMDGPU::G_FCANONICALIZE:
3985 case AMDGPU::G_STRICT_FLDEXP:
3986 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3987 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3988 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3989 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3990 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3991 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3992 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3993 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3994 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3995 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3996 case AMDGPU::G_AMDGPU_SMED3:
3997 case AMDGPU::G_AMDGPU_FMED3:
3998 return getDefaultMappingVOP(MI);
3999 case AMDGPU::G_UMULH:
4000 case AMDGPU::G_SMULH: {
4001 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
4002 return getDefaultMappingSOP(MI);
4003 return getDefaultMappingVOP(MI);
4004 }
4005 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4006 case AMDGPU::G_AMDGPU_MAD_I64_I32: {
4007 // Three possible mappings:
4008 //
4009 // - Default SOP
4010 // - Default VOP
4011 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
4012 //
4013 // This allows instruction selection to keep the multiplication part of the
4014 // instruction on the SALU.
4015 bool AllSalu = true;
4016 bool MulSalu = true;
4017 for (unsigned i = 0; i < 5; ++i) {
4018 Register Reg = MI.getOperand(i).getReg();
4019 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
4020 if (Bank->getID() != AMDGPU::SGPRRegBankID) {
4021 AllSalu = false;
4022 if (i == 2 || i == 3) {
4023 MulSalu = false;
4024 break;
4025 }
4026 }
4027 }
4028 }
4029
4030 if (AllSalu)
4031 return getDefaultMappingSOP(MI);
4032
4033 // If the multiply-add is full-rate in VALU, use that even if the
4034 // multiplication part is scalar. Accumulating separately on the VALU would
4035 // take two instructions.
4036 if (!MulSalu || Subtarget.hasFullRate64Ops())
4037 return getDefaultMappingVOP(MI);
4038
4039 // Keep the multiplication on the SALU, then accumulate on the VALU.
4040 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4041 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4042 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4043 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4044 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4045 break;
4046 }
4047 case AMDGPU::G_IMPLICIT_DEF: {
4048 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4049 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4050 break;
4051 }
4052 case AMDGPU::G_FCONSTANT:
4053 case AMDGPU::G_CONSTANT:
4054 case AMDGPU::G_GLOBAL_VALUE:
4055 case AMDGPU::G_BLOCK_ADDR:
4056 case AMDGPU::G_READSTEADYCOUNTER:
4057 case AMDGPU::G_READCYCLECOUNTER: {
4058 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4059 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4060 break;
4061 }
4062 case AMDGPU::G_FRAME_INDEX: {
4063 // TODO: This should be the same as other constants, but eliminateFrameIndex
4064 // currently assumes VALU uses.
4065 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4066 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4067 break;
4068 }
4069 case AMDGPU::G_DYN_STACKALLOC: {
4070 // Result is always uniform, and a wave reduction is needed for the source.
4071 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4072 unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4073 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: 32);
4074 break;
4075 }
4076 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
4077 // This case is weird because we expect a physical register in the source,
4078 // but need to set a bank anyway.
4079 //
4080 // TODO: We could select the result to SGPR or VGPR
4081 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4082 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4083 break;
4084 }
4085 case AMDGPU::G_INSERT: {
4086 unsigned BankID = getMappingType(MRI, MI);
4087 unsigned DstSize = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI);
4088 unsigned SrcSize = getSizeInBits(MI.getOperand(i: 1).getReg(), MRI, *TRI);
4089 unsigned EltSize = getSizeInBits(MI.getOperand(i: 2).getReg(), MRI, *TRI);
4090 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize);
4091 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize);
4092 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, Size: EltSize);
4093 OpdsMapping[3] = nullptr;
4094 break;
4095 }
4096 case AMDGPU::G_EXTRACT: {
4097 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4098 unsigned DstSize = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI);
4099 unsigned SrcSize = getSizeInBits(MI.getOperand(i: 1).getReg(), MRI, *TRI);
4100 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: DstSize);
4101 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size: SrcSize);
4102 OpdsMapping[2] = nullptr;
4103 break;
4104 }
4105 case AMDGPU::G_BUILD_VECTOR:
4106 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
4107 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4108 if (DstTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) {
4109 unsigned DstSize = DstTy.getSizeInBits();
4110 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4111 unsigned Src0BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4112 unsigned Src1BankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4113 unsigned DstBankID = regBankUnion(RB0: Src0BankID, RB1: Src1BankID);
4114
4115 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBankID, Size: DstSize);
4116 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Src0BankID, Size: SrcSize);
4117 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Src1BankID, Size: SrcSize);
4118 break;
4119 }
4120
4121 [[fallthrough]];
4122 }
4123 case AMDGPU::G_MERGE_VALUES:
4124 case AMDGPU::G_CONCAT_VECTORS: {
4125 unsigned Bank = getMappingType(MRI, MI);
4126 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4127 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4128
4129 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize);
4130 // Op1 and Dst should use the same register bank.
4131 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
4132 OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize);
4133 break;
4134 }
4135 case AMDGPU::G_BITREVERSE:
4136 case AMDGPU::G_BITCAST:
4137 case AMDGPU::G_INTTOPTR:
4138 case AMDGPU::G_PTRTOINT:
4139 case AMDGPU::G_FABS:
4140 case AMDGPU::G_FNEG: {
4141 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4142 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4143 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4144 break;
4145 }
4146 case AMDGPU::G_AMDGPU_FFBH_U32:
4147 case AMDGPU::G_AMDGPU_FFBL_B32:
4148 case AMDGPU::G_CTLZ_ZERO_UNDEF:
4149 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
4150 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4151 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4152 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32);
4153 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
4154 break;
4155 }
4156 case AMDGPU::G_CTPOP: {
4157 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4158 unsigned BankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4159 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, Size: 32);
4160
4161 // This should really be getValueMappingSGPR64Only, but allowing the generic
4162 // code to handle the register split just makes using LegalizerHelper more
4163 // difficult.
4164 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4165 break;
4166 }
4167 case AMDGPU::G_TRUNC: {
4168 Register Dst = MI.getOperand(i: 0).getReg();
4169 Register Src = MI.getOperand(i: 1).getReg();
4170 unsigned Bank = getRegBankID(Reg: Src, MRI);
4171 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4172 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4173 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: Bank, Size: DstSize);
4174 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank, Size: SrcSize);
4175 break;
4176 }
4177 case AMDGPU::G_ZEXT:
4178 case AMDGPU::G_SEXT:
4179 case AMDGPU::G_ANYEXT:
4180 case AMDGPU::G_SEXT_INREG: {
4181 Register Dst = MI.getOperand(i: 0).getReg();
4182 Register Src = MI.getOperand(i: 1).getReg();
4183 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4184 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4185
4186 unsigned DstBank;
4187 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
4188 assert(SrcBank);
4189 switch (SrcBank->getID()) {
4190 case AMDGPU::SGPRRegBankID:
4191 DstBank = AMDGPU::SGPRRegBankID;
4192 break;
4193 default:
4194 DstBank = AMDGPU::VGPRRegBankID;
4195 break;
4196 }
4197
4198 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
4199 // 32-bits, and then to 64.
4200 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: DstBank, Size: DstSize);
4201 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID: SrcBank->getID(),
4202 Size: SrcSize);
4203 break;
4204 }
4205 case AMDGPU::G_IS_FPCLASS: {
4206 Register SrcReg = MI.getOperand(i: 1).getReg();
4207 unsigned SrcSize = MRI.getType(Reg: SrcReg).getSizeInBits();
4208 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4209 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4210 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4211 break;
4212 }
4213 case AMDGPU::G_STORE: {
4214 assert(MI.getOperand(0).isReg());
4215 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4216
4217 // FIXME: We need to specify a different reg bank once scalar stores are
4218 // supported.
4219 const ValueMapping *ValMapping =
4220 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4221 OpdsMapping[0] = ValMapping;
4222 OpdsMapping[1] = getValueMappingForPtr(MRI, PtrReg: MI.getOperand(i: 1).getReg());
4223 break;
4224 }
4225 case AMDGPU::G_ICMP:
4226 case AMDGPU::G_FCMP: {
4227 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4228
4229 // See if the result register has already been constrained to vcc, which may
4230 // happen due to control flow intrinsic lowering.
4231 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4232 AMDGPU::SGPRRegBankID);
4233 unsigned Op2Bank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4234 unsigned Op3Bank = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI);
4235
4236 auto canUseSCCICMP = [&]() {
4237 auto Pred =
4238 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
4239 return Size == 32 ||
4240 (Size == 64 &&
4241 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4242 Subtarget.hasScalarCompareEq64());
4243 };
4244 auto canUseSCCFCMP = [&]() {
4245 return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4246 };
4247
4248 bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4249 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4250 Op2Bank == AMDGPU::SGPRRegBankID &&
4251 Op3Bank == AMDGPU::SGPRRegBankID &&
4252 (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4253
4254 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4255 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4256
4257 // TODO: Use 32-bit for scalar output size.
4258 // SCC results will need to be copied to a 32-bit SGPR virtual register.
4259 const unsigned ResultSize = 1;
4260
4261 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: DstBank, Size: ResultSize);
4262 OpdsMapping[1] = nullptr; // Predicate Operand.
4263 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: SrcBank, Size);
4264 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: SrcBank, Size);
4265 break;
4266 }
4267 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    // A VGPR index can be used for a waterfall loop when indexing an SGPR
    // vector.
4269 unsigned SrcBankID = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI);
4270 unsigned DstSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4271 unsigned SrcSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
4272 unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4273 unsigned IdxBank = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4274 unsigned OutputBankID = regBankUnion(RB0: SrcBankID, RB1: IdxBank);
4275
4276 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(BankID: OutputBankID, Size: DstSize);
4277 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: SrcBankID, Size: SrcSize);
4278
    // The index can be in either bank if the source vector is a VGPR.
4280 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: IdxBank, Size: IdxSize);
4281 break;
4282 }
4283 case AMDGPU::G_INSERT_VECTOR_ELT: {
4284 unsigned OutputBankID = isSALUMapping(MI) ?
4285 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4286
4287 unsigned VecSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
4288 unsigned InsertSize = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits();
4289 unsigned IdxSize = MRI.getType(Reg: MI.getOperand(i: 3).getReg()).getSizeInBits();
4290 unsigned InsertEltBankID = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI);
4291 unsigned IdxBankID = getRegBankID(Reg: MI.getOperand(i: 3).getReg(), MRI);
4292
4293 OpdsMapping[0] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize);
4294 OpdsMapping[1] = AMDGPU::getValueMapping(BankID: OutputBankID, Size: VecSize);
4295
4296 // This is a weird case, because we need to break down the mapping based on
4297 // the register bank of a different operand.
4298 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4299 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(BankID: InsertEltBankID,
4300 Size: InsertSize);
4301 } else {
4302 assert(InsertSize == 32 || InsertSize == 64);
4303 OpdsMapping[2] = AMDGPU::getValueMapping(BankID: InsertEltBankID, Size: InsertSize);
4304 }
4305
    // The index can be in either bank if the source vector is a VGPR.
4307 OpdsMapping[3] = AMDGPU::getValueMapping(BankID: IdxBankID, Size: IdxSize);
4308 break;
4309 }
4310 case AMDGPU::G_UNMERGE_VALUES: {
4311 unsigned Bank = getMappingType(MRI, MI);
4312
4313 // Op1 and Dst should use the same register bank.
4314 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4315 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4316 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4317 OpdsMapping[i] = AMDGPU::getValueMapping(BankID: Bank, Size);
4318 }
4319 break;
4320 }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // cmp
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // We need to convert this to a MUBUF if either the resource or the offset
    // is a VGPR.
    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
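    // The load result can stay uniform (SGPR) only if both the resource and
    // the offset are uniform.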

    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
    break;
  }
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fma_legacy:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_mul_u24:
    case Intrinsic::amdgcn_mul_i24:
    case Intrinsic::amdgcn_mulhi_u24:
    case Intrinsic::amdgcn_mulhi_i24:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_perm:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
    case Intrinsic::amdgcn_fdot2_bf16_bf16:
    case Intrinsic::amdgcn_fdot2_f16_f16:
    case Intrinsic::amdgcn_fdot2_f32_bf16:
    case Intrinsic::amdgcn_sudot4:
    case Intrinsic::amdgcn_sudot8:
    case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
    case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
    case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
    case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
    case Intrinsic::amdgcn_cvt_f32_fp8:
    case Intrinsic::amdgcn_cvt_f32_bf8:
    case Intrinsic::amdgcn_cvt_pk_f32_fp8:
    case Intrinsic::amdgcn_cvt_pk_f32_bf8:
    case Intrinsic::amdgcn_cvt_pk_fp8_f32:
    case Intrinsic::amdgcn_cvt_pk_bf8_f32:
    case Intrinsic::amdgcn_cvt_sr_fp8_f32:
    case Intrinsic::amdgcn_cvt_sr_bf8_f32:
    case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
    case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
    case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
    case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
    case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
    case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
    case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
    case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_sqrt: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
          isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    }
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_ubfe:
      if (isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_mov_dpp:
    case Intrinsic::amdgcn_strict_wwm:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_strict_wqm:
    case Intrinsic::amdgcn_wqm:
    case Intrinsic::amdgcn_softwqm:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_permlane64:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_cvt_pkrtz:
      if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize:
    case Intrinsic::amdgcn_reloc_constant:
    case Intrinsic::returnaddress: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ps_live: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
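      // The compare itself still executes on the VALU across all lanes, so
      // the source operands are VGPRs even though the mask result is uniform.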
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
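      // The destination and source value operands are mapped by falling
      // through to the readfirstlane handling below.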
      [[fallthrough]];
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These two must be SGPRs, but accept VGPRs. Readfirstlanes will be
      // inserted to legalize them.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_permlane16_var:
    case Intrinsic::amdgcn_permlanex16_var: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
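      //
      // On subtargets where the MFMA result may have to live in AGPRs (as
      // tracked by SIMachineFunctionInfo::mayNeedAGPRs), vdst and srcC are
      // mapped to the AGPR bank; otherwise VGPRs are used to avoid cross-bank
      // copies around the MFMA.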
      const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
      OpdsMapping[0] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
      // vdst, srcA, srcB, srcC, idx
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_lds_param_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // M0 must be an SGPR, but take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_interp_inreg_p10:
    case Intrinsic::amdgcn_interp_inreg_p2:
    case Intrinsic::amdgcn_interp_inreg_p10_f16:
    case Intrinsic::amdgcn_interp_inreg_p2_f16:
    case Intrinsic::amdgcn_interp_p10_rtz_f16:
    case Intrinsic::amdgcn_interp_p2_rtz_f16: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_inverse_ballot: {
      // This must be an SGPR, but accept a VGPR.
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
      break;
    }
    case Intrinsic::amdgcn_s_quadmask:
    case Intrinsic::amdgcn_s_wqm: {
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
      break;
    }
    case Intrinsic::amdgcn_wave_reduce_umin:
    case Intrinsic::amdgcn_wave_reduce_umax: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      auto regBankID =
          isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
      OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
      break;
    }
    case Intrinsic::amdgcn_s_bitreplicate:
      Register MaskReg = MI.getOperand(2).getReg();
      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    auto IntrID = AMDGPU::getIntrinsicID(MI);
    const AMDGPU::RsrcIntrinsic *RSrcIntrin =
        AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
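    // Operand N is the trailing resource descriptor and must be uniform; the
    // ray operands in [2, N) are VGPRs.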
    if (N == 3) {
      // Sequential form: all operands combined into VGPR256/VGPR512
      unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      if (Size > 256)
        Size = 512;
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    } else {
      // NSA form
      for (unsigned I = 2; I < N; ++I) {
        unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
    auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
    case Intrinsic::amdgcn_s_sendmsg_rtn: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num:
    case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
    case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
    case Intrinsic::amdgcn_atomic_cond_sub_u32:
    case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
    case Intrinsic::amdgcn_global_load_tr_b64:
    case Intrinsic::amdgcn_global_load_tr_b128:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap:
    case Intrinsic::amdgcn_ds_fadd_v2bf16: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp_row:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_ttracedata: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank =
          getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_ptr_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds:
    case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_ptr_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store:
    case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_ptr_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds:
    case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_ptr_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store:
    case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // M0 must be an SGPR, but take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
      OpdsMapping[0] =
          getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
      OpdsMapping[1] =
          getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
      OpdsMapping[3] =
          getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
      OpdsMapping[4] =
          getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
      OpdsMapping[5] =
          getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
      break;
    }
    case Intrinsic::amdgcn_s_sleep_var:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_wakeup_barrier:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_init:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_leave: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      break;
    }
    case Intrinsic::amdgcn_s_get_barrier_state: {
      OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
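    // Only an all-SGPR select with a scalar (SGPR) condition stays on the
    // SALU; a VCC condition or any VGPR value operand forces a VGPR result.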

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
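    // A divergent (VGPR) callee is later legalized by looping over the unique
    // callee values with a waterfall loop around the call.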

    // Allow anything for implicit arguments
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMICRMW_UINC_WRAP:
  case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
  case AMDGPU::G_PREFETCH:
    OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    break;
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}