1 | //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// This file implements the targeting of the RegisterBankInfo class for |
10 | /// AMDGPU. |
11 | /// |
12 | /// \par |
13 | /// |
14 | /// AMDGPU has unique register bank constraints that require special high level |
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally the VCC register bank is a
17 | /// sort of pseudo-register bank needed to represent SGPRs used in a vector |
18 | /// boolean context. There is also the AGPR bank, which is a special purpose |
19 | /// physical register bank present on some subtargets. |
20 | /// |
21 | /// Copying from VGPR to SGPR is generally illegal, unless the value is known to |
22 | /// be uniform. It is generally not valid to legalize operands by inserting |
23 | /// copies as on other targets. Operations which require uniform, SGPR operands |
24 | /// generally require scalarization by repeatedly executing the instruction, |
25 | /// activating each set of lanes using a unique set of input values. This is |
26 | /// referred to as a waterfall loop. |
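///
/// As a rough illustration (not the exact generated sequence), a wave64
/// waterfall loop around a single use of an SGPR operand held in v0 looks
/// like:
///
///   s_mov_b64 s[2:3], exec            ; save exec
/// loop:
///   v_readfirstlane_b32 s0, v0        ; value from the first active lane
///   v_cmp_eq_u32 vcc, s0, v0          ; lanes holding the same value
///   s_and_saveexec_b64 s[4:5], vcc    ; limit exec to those lanes
///   <use s0 as the uniform operand>
///   s_xor_b64 exec, exec, s[4:5]      ; clear the lanes just executed
///   s_cbranch_execnz loop             ; repeat until all lanes are done
///   s_mov_b64 exec, s[2:3]            ; restore exec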
27 | /// |
28 | /// \par Booleans |
29 | /// |
/// Booleans (s1 values) require special consideration. A vector compare result
31 | /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit |
32 | /// register. These are represented with the VCC bank. During selection, we need |
33 | /// to be able to unambiguously go back from a register class to a register |
34 | /// bank. To distinguish whether an SGPR should use the SGPR or VCC register |
35 | /// bank, we need to know the use context type. An SGPR s1 value always means a |
36 | /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets |
37 | /// SCC, which is a 1-bit unaddressable register. This will need to be copied to |
38 | /// a 32-bit virtual register. Taken together, this means we need to adjust the |
39 | /// type of boolean operations to be regbank legal. All SALU booleans need to be |
40 | /// widened to 32-bits, and all VALU booleans need to be s1 values. |
41 | /// |
42 | /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact |
43 | /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc |
44 | /// bank. A non-boolean source (such as a truncate from a 1-bit load from |
45 | /// memory) will require a copy to the VCC bank which will require clearing the |
46 | /// high bits and inserting a compare. |
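///
/// Illustrative MIR after regbank legalization (register numbers and banks
/// are made up for the example): a divergent boolean lives in the VCC bank
/// as an s1,
///
///   %cond:vcc(s1) = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32)
///   %sel:vgpr(s32) = G_SELECT %cond:vcc(s1), %x:vgpr(s32), %y:vgpr(s32)
///
/// while a uniform boolean is widened to a 32-bit SGPR value:
///
///   %cond:sgpr(s32) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32)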
47 | /// |
48 | /// \par Constant bus restriction |
49 | /// |
50 | /// VALU instructions have a limitation known as the constant bus |
51 | /// restriction. Most VALU instructions can use SGPR operands, but may read at |
/// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
55 | /// operands should be legal as an SGPR, but this is contextually dependent on |
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
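///
/// For example (illustrative), "v_add_f32_e64 v0, s0, s0" reads only one
/// unique SGPR and is valid everywhere, while "v_add_f32_e64 v0, s0, s1"
/// reads two and violates the constant bus restriction before gfx10.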
58 | /// |
59 | /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_* |
60 | /// operation should have its source operands all mapped to VGPRs (except for |
/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 | /// mapping. Anything beyond the simplest 1:1 instruction selection would be too |
63 | /// complicated to solve here. Every optimization pattern or instruction |
64 | /// selected to multiple outputs would have to enforce this rule, and there |
65 | /// would be additional complexity in tracking this rule for every G_* |
66 | /// operation. By forcing all inputs to VGPRs, it also simplifies the task of |
67 | /// picking the optimal operand combination from a post-isel optimization pass. |
68 | /// |
69 | //===----------------------------------------------------------------------===// |
70 | |
71 | #include "AMDGPURegisterBankInfo.h" |
72 | |
73 | #include "AMDGPU.h" |
74 | #include "AMDGPUGlobalISelUtils.h" |
75 | #include "AMDGPUInstrInfo.h" |
76 | #include "GCNSubtarget.h" |
77 | #include "SIMachineFunctionInfo.h" |
78 | #include "SIRegisterInfo.h" |
79 | #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" |
80 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" |
81 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" |
82 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" |
83 | #include "llvm/CodeGen/RegisterBank.h" |
84 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
85 | |
86 | #define GET_TARGET_REGBANK_IMPL |
87 | #include "AMDGPUGenRegisterBank.inc" |
88 | |
89 | // This file will be TableGen'ed at some point. |
90 | #include "AMDGPUGenRegisterBankInfo.def" |
91 | |
92 | using namespace llvm; |
93 | using namespace MIPatternMatch; |
94 | |
95 | namespace { |
96 | |
// Observer to apply a register bank to new registers created by the
// LegalizerHelper.
98 | class ApplyRegBankMapping final : public GISelChangeObserver { |
99 | private: |
100 | MachineIRBuilder &B; |
101 | const AMDGPURegisterBankInfo &RBI; |
102 | MachineRegisterInfo &MRI; |
103 | const RegisterBank *NewBank; |
104 | SmallVector<MachineInstr *, 4> NewInsts; |
105 | |
106 | public: |
107 | ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_, |
108 | MachineRegisterInfo &MRI_, const RegisterBank *RB) |
109 | : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) { |
110 | assert(!B.isObservingChanges()); |
111 | B.setChangeObserver(*this); |
112 | } |
113 | |
114 | ~ApplyRegBankMapping() { |
115 | for (MachineInstr *MI : NewInsts) |
      applyBank(*MI);
117 | |
118 | B.stopObservingChanges(); |
119 | } |
120 | |
121 | /// Set any registers that don't have a set register class or bank to SALU. |
122 | void applyBank(MachineInstr &MI) { |
123 | const unsigned Opc = MI.getOpcode(); |
124 | if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT || |
125 | Opc == AMDGPU::G_SEXT) { |
126 | // LegalizerHelper wants to use the basic legalization artifacts when |
127 | // widening etc. We don't handle selection with vcc in artifact sources, |
128 | // so we need to use a select instead to handle these properly. |
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        B.setInsertPt(*MI.getParent(), MI);

        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
152 | return; |
153 | } |
154 | |
155 | #ifndef NDEBUG |
156 | if (Opc == AMDGPU::G_TRUNC) { |
      Register DstReg = MI.getOperand(0).getReg();
158 | const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI); |
159 | assert(DstBank != &AMDGPU::VCCRegBank); |
160 | } |
161 | #endif |
162 | |
163 | for (MachineOperand &Op : MI.operands()) { |
164 | if (!Op.isReg()) |
165 | continue; |
166 | |
167 | // We may see physical registers if building a real MI |
168 | Register Reg = Op.getReg(); |
169 | if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg)) |
170 | continue; |
171 | |
172 | const RegisterBank *RB = NewBank; |
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
183 | } |
184 | } |
185 | |
186 | void erasingInstr(MachineInstr &MI) override {} |
187 | |
188 | void createdInstr(MachineInstr &MI) override { |
189 | // At this point, the instruction was just inserted and has no operands. |
    NewInsts.push_back(&MI);
191 | } |
192 | |
193 | void changingInstr(MachineInstr &MI) override {} |
194 | void changedInstr(MachineInstr &MI) override { |
195 | // FIXME: In principle we should probably add the instruction to NewInsts, |
196 | // but the way the LegalizerHelper uses the observer, we will always see the |
197 | // registers we need to set the regbank on also referenced in a new |
198 | // instruction. |
199 | } |
200 | }; |
201 | |
202 | } |
203 | |
204 | AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) |
205 | : Subtarget(ST), TRI(Subtarget.getRegisterInfo()), |
206 | TII(Subtarget.getInstrInfo()) { |
207 | |
208 | // HACK: Until this is fully tablegen'd. |
209 | static llvm::once_flag InitializeRegisterBankFlag; |
210 | |
211 | static auto InitializeRegisterBankOnce = [this]() { |
212 | assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && |
213 | &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && |
214 | &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank); |
215 | (void)this; |
216 | }; |
217 | |
  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
219 | } |
220 | |
221 | static bool isVectorRegisterBank(const RegisterBank &Bank) { |
222 | unsigned BankID = Bank.getID(); |
223 | return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; |
224 | } |
225 | |
226 | bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const { |
227 | return RB != &AMDGPU::SGPRRegBank; |
228 | } |
229 | |
230 | unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, |
231 | const RegisterBank &Src, |
232 | TypeSize Size) const { |
233 | // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? |
234 | if (Dst.getID() == AMDGPU::SGPRRegBankID && |
235 | (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) { |
236 | return std::numeric_limits<unsigned>::max(); |
237 | } |
238 | |
239 | // Bool values are tricky, because the meaning is based on context. The SCC |
240 | // and VCC banks are for the natural scalar and vector conditions produced by |
241 | // a compare. |
242 | // |
243 | // Legalization doesn't know about the necessary context, so an s1 use may |
244 | // have been a truncate from an arbitrary value, in which case a copy (lowered |
245 | // as a compare with 0) needs to be inserted. |
246 | if (Size == 1 && |
247 | (Dst.getID() == AMDGPU::SGPRRegBankID) && |
248 | (isVectorRegisterBank(Src) || |
249 | Src.getID() == AMDGPU::SGPRRegBankID || |
250 | Src.getID() == AMDGPU::VCCRegBankID)) |
251 | return std::numeric_limits<unsigned>::max(); |
252 | |
253 | // There is no direct copy between AGPRs. |
254 | if (Dst.getID() == AMDGPU::AGPRRegBankID && |
255 | Src.getID() == AMDGPU::AGPRRegBankID) |
256 | return 4; |
257 | |
  return RegisterBankInfo::copyCost(Dst, Src, Size);
259 | } |
260 | |
261 | unsigned AMDGPURegisterBankInfo::getBreakDownCost( |
262 | const ValueMapping &ValMapping, |
263 | const RegisterBank *CurBank) const { |
264 | // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to |
265 | // VGPR. |
266 | // FIXME: Is there a better way to do this? |
267 | if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) |
268 | return 10; // This is expensive. |
269 | |
270 | assert(ValMapping.NumBreakDowns == 2 && |
271 | ValMapping.BreakDown[0].Length == 32 && |
272 | ValMapping.BreakDown[0].StartIdx == 0 && |
273 | ValMapping.BreakDown[1].Length == 32 && |
274 | ValMapping.BreakDown[1].StartIdx == 32 && |
275 | ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank); |
276 | |
277 | // 32-bit extract of a 64-bit value is just access of a subregister, so free. |
278 | // TODO: Cost of 0 hits assert, though it's not clear it's what we really |
279 | // want. |
280 | |
281 | // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR |
282 | // alignment restrictions, but this probably isn't important. |
283 | return 1; |
284 | } |
285 | |
286 | const RegisterBank & |
287 | AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, |
288 | LLT Ty) const { |
289 | if (&RC == &AMDGPU::SReg_1RegClass) |
290 | return AMDGPU::VCCRegBank; |
291 | |
292 | // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a |
293 | // VCC-like use. |
  if (TRI->isSGPRClass(&RC)) {
295 | // FIXME: This probably came from a copy from a physical register, which |
296 | // should be inferable from the copied to-type. We don't have many boolean |
297 | // physical register constraints so just assume a normal SGPR for now. |
298 | if (!Ty.isValid()) |
299 | return AMDGPU::SGPRRegBank; |
300 | |
301 | return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; |
302 | } |
303 | |
304 | return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank; |
305 | } |
306 | |
307 | template <unsigned NumOps> |
308 | RegisterBankInfo::InstructionMappings |
309 | AMDGPURegisterBankInfo::addMappingFromTable( |
310 | const MachineInstr &MI, const MachineRegisterInfo &MRI, |
311 | const std::array<unsigned, NumOps> RegSrcOpIdx, |
312 | ArrayRef<OpRegBankEntry<NumOps>> Table) const { |
313 | |
314 | InstructionMappings AltMappings; |
315 | |
316 | SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); |
317 | |
318 | unsigned Sizes[NumOps]; |
319 | for (unsigned I = 0; I < NumOps; ++I) { |
320 | Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); |
321 | Sizes[I] = getSizeInBits(Reg, MRI, *TRI); |
322 | } |
323 | |
324 | for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { |
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
326 | Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); |
327 | } |
328 | |
329 | // getInstrMapping's default mapping uses ID 1, so start at 2. |
330 | unsigned MappingID = 2; |
331 | for (const auto &Entry : Table) { |
332 | for (unsigned I = 0; I < NumOps; ++I) { |
333 | int OpIdx = RegSrcOpIdx[I]; |
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
335 | } |
336 | |
    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
340 | } |
341 | |
342 | return AltMappings; |
343 | } |
344 | |
345 | RegisterBankInfo::InstructionMappings |
346 | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( |
347 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { |
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
349 | case Intrinsic::amdgcn_readlane: { |
350 | static const OpRegBankEntry<3> Table[2] = { |
351 | // Perfectly legal. |
352 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, |
353 | |
354 | // Need a readfirstlane for the index. |
355 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } |
356 | }; |
357 | |
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
359 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table); |
360 | } |
361 | case Intrinsic::amdgcn_writelane: { |
362 | static const OpRegBankEntry<4> Table[4] = { |
363 | // Perfectly legal. |
364 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, |
365 | |
366 | // Need readfirstlane of first op |
367 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, |
368 | |
369 | // Need readfirstlane of second op |
370 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, |
371 | |
372 | // Need readfirstlane of both ops |
373 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } |
374 | }; |
375 | |
    // dst, value, lane select, previous value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
378 | return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table); |
379 | } |
380 | default: |
381 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
382 | } |
383 | } |
384 | |
385 | RegisterBankInfo::InstructionMappings |
386 | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( |
387 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { |
388 | |
389 | switch (cast<GIntrinsic>(Val: MI).getIntrinsicID()) { |
390 | case Intrinsic::amdgcn_s_buffer_load: { |
391 | static const OpRegBankEntry<2> Table[4] = { |
392 | // Perfectly legal. |
393 | { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, |
394 | |
395 | // Only need 1 register in loop |
396 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }, |
397 | |
398 | // Have to waterfall the resource. |
399 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, |
400 | |
401 | // Have to waterfall the resource, and the offset. |
402 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 } |
403 | }; |
404 | |
405 | // rsrc, offset |
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
407 | return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table); |
408 | } |
409 | case Intrinsic::amdgcn_ds_ordered_add: |
410 | case Intrinsic::amdgcn_ds_ordered_swap: { |
411 | // VGPR = M0, VGPR |
412 | static const OpRegBankEntry<3> Table[2] = { |
413 | // Perfectly legal. |
414 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, |
415 | |
416 | // Need a readfirstlane for m0 |
417 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } |
418 | }; |
419 | |
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
421 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table); |
422 | } |
423 | case Intrinsic::amdgcn_s_sendmsg: |
424 | case Intrinsic::amdgcn_s_sendmsghalt: { |
425 | // FIXME: Should have no register for immediate |
426 | static const OpRegBankEntry<1> Table[2] = { |
427 | // Perfectly legal. |
428 | { { AMDGPU::SGPRRegBankID }, 1 }, |
429 | |
430 | // Need readlane |
431 | { { AMDGPU::VGPRRegBankID }, 3 } |
432 | }; |
433 | |
    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
435 | return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table); |
436 | } |
437 | default: |
438 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
439 | } |
440 | } |
441 | |
442 | // FIXME: Returns uniform if there's no source value information. This is |
443 | // probably wrong. |
444 | bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const { |
445 | if (!MI.hasOneMemOperand()) |
446 | return false; |
447 | |
448 | const MachineMemOperand *MMO = *MI.memoperands_begin(); |
449 | const unsigned AS = MMO->getAddrSpace(); |
450 | const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || |
451 | AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; |
452 | const unsigned MemSize = 8 * MMO->getSize().getValue(); |
453 | |
454 | // Require 4-byte alignment. |
455 | return (MMO->getAlign() >= Align(4) || |
456 | (Subtarget.hasScalarSubwordLoads() && |
457 | ((MemSize == 16 && MMO->getAlign() >= Align(2)) || |
458 | (MemSize == 8 && MMO->getAlign() >= Align(1))))) && |
459 | // Can't do a scalar atomic load. |
460 | !MMO->isAtomic() && |
461 | // Don't use scalar loads for volatile accesses to non-constant address |
462 | // spaces. |
463 | (IsConst || !MMO->isVolatile()) && |
464 | // Memory must be known constant, or not written before this load. |
465 | (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) && |
466 | AMDGPUInstrInfo::isUniformMMO(MMO); |
467 | } |
468 | |
469 | RegisterBankInfo::InstructionMappings |
470 | AMDGPURegisterBankInfo::getInstrAlternativeMappings( |
471 | const MachineInstr &MI) const { |
472 | |
473 | const MachineFunction &MF = *MI.getParent()->getParent(); |
474 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
475 | |
476 | |
477 | InstructionMappings AltMappings; |
478 | switch (MI.getOpcode()) { |
479 | case TargetOpcode::G_CONSTANT: |
480 | case TargetOpcode::G_IMPLICIT_DEF: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
482 | if (Size == 1) { |
483 | static const OpRegBankEntry<1> Table[3] = { |
484 | { { AMDGPU::VGPRRegBankID }, 1 }, |
485 | { { AMDGPU::SGPRRegBankID }, 1 }, |
486 | { { AMDGPU::VCCRegBankID }, 1 } |
487 | }; |
488 | |
      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
490 | } |
491 | |
492 | [[fallthrough]]; |
493 | } |
494 | case TargetOpcode::G_FCONSTANT: |
495 | case TargetOpcode::G_FRAME_INDEX: |
496 | case TargetOpcode::G_GLOBAL_VALUE: { |
497 | static const OpRegBankEntry<1> Table[2] = { |
498 | { { AMDGPU::VGPRRegBankID }, 1 }, |
499 | { { AMDGPU::SGPRRegBankID }, 1 } |
500 | }; |
501 | |
    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
503 | } |
504 | case TargetOpcode::G_AND: |
505 | case TargetOpcode::G_OR: |
506 | case TargetOpcode::G_XOR: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
508 | |
509 | if (Size == 1) { |
510 | // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. |
511 | const InstructionMapping &SCCMapping = getInstructionMapping( |
512 | 1, 1, getOperandsMapping( |
513 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), |
514 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), |
515 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}), |
516 | 3); // Num Operands |
      AltMappings.push_back(&SCCMapping);
518 | |
519 | const InstructionMapping &VCCMapping0 = getInstructionMapping( |
520 | 2, 1, getOperandsMapping( |
521 | {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), |
522 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), |
523 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}), |
524 | 3); // Num Operands |
      AltMappings.push_back(&VCCMapping0);
526 | return AltMappings; |
527 | } |
528 | |
529 | if (Size != 64) |
530 | break; |
531 | |
532 | const InstructionMapping &SSMapping = getInstructionMapping( |
533 | 1, 1, getOperandsMapping( |
534 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
535 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
536 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
537 | 3); // Num Operands |
    AltMappings.push_back(&SSMapping);
539 | |
540 | const InstructionMapping &VVMapping = getInstructionMapping( |
541 | 2, 2, getOperandsMapping( |
542 | {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
543 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
544 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), |
545 | 3); // Num Operands |
    AltMappings.push_back(&VVMapping);
547 | break; |
548 | } |
549 | case TargetOpcode::G_LOAD: |
550 | case TargetOpcode::G_ZEXTLOAD: |
551 | case TargetOpcode::G_SEXTLOAD: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
554 | unsigned PtrSize = PtrTy.getSizeInBits(); |
555 | unsigned AS = PtrTy.getAddressSpace(); |
556 | |
557 | if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && |
558 | AS != AMDGPUAS::PRIVATE_ADDRESS) && |
559 | isScalarLoadLegal(MI)) { |
560 | const InstructionMapping &SSMapping = getInstructionMapping( |
561 | 1, 1, getOperandsMapping( |
562 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
563 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}), |
564 | 2); // Num Operands |
      AltMappings.push_back(&SSMapping);
566 | } |
567 | |
568 | const InstructionMapping &VVMapping = getInstructionMapping( |
569 | 2, 1, |
570 | getOperandsMapping( |
571 | {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
572 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), |
573 | 2); // Num Operands |
    AltMappings.push_back(&VVMapping);
575 | |
576 | // It may be possible to have a vgpr = load sgpr mapping here, because |
577 | // the mubuf instructions support this kind of load, but probably for only |
578 | // gfx7 and older. However, the addressing mode matching in the instruction |
579 | // selector should be able to do a better job of detecting and selecting |
580 | // these kinds of loads from the vgpr = load vgpr mapping. |
581 | |
    return AltMappings;
  }
585 | case TargetOpcode::G_SELECT: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
587 | const InstructionMapping &SSMapping = getInstructionMapping(1, 1, |
588 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
589 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), |
590 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
591 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
592 | 4); // Num Operands |
    AltMappings.push_back(&SSMapping);
594 | |
595 | const InstructionMapping &VVMapping = getInstructionMapping(2, 1, |
596 | getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
597 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), |
598 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
599 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), |
600 | 4); // Num Operands |
    AltMappings.push_back(&VVMapping);
602 | |
603 | return AltMappings; |
604 | } |
605 | case TargetOpcode::G_UADDE: |
606 | case TargetOpcode::G_USUBE: |
607 | case TargetOpcode::G_SADDE: |
608 | case TargetOpcode::G_SSUBE: { |
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
610 | const InstructionMapping &SSMapping = getInstructionMapping(1, 1, |
611 | getOperandsMapping( |
612 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
613 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), |
614 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
615 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
616 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}), |
617 | 5); // Num Operands |
    AltMappings.push_back(&SSMapping);
619 | |
620 | const InstructionMapping &VVMapping = getInstructionMapping(2, 1, |
621 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
622 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), |
623 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
624 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
625 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}), |
626 | 5); // Num Operands |
    AltMappings.push_back(&VVMapping);
628 | return AltMappings; |
629 | } |
630 | case AMDGPU::G_BRCOND: { |
631 | assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); |
632 | |
633 | // TODO: Change type to 32 for scalar |
634 | const InstructionMapping &SMapping = getInstructionMapping( |
635 | 1, 1, getOperandsMapping( |
636 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}), |
637 | 2); // Num Operands |
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
                  {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
        2); // Num Operands
    AltMappings.push_back(&VMapping);
645 | return AltMappings; |
646 | } |
647 | case AMDGPU::G_INTRINSIC: |
648 | case AMDGPU::G_INTRINSIC_CONVERGENT: |
649 | return getInstrAlternativeMappingsIntrinsic(MI, MRI); |
650 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
651 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: |
652 | return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); |
653 | default: |
654 | break; |
655 | } |
656 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
657 | } |
658 | |
659 | void AMDGPURegisterBankInfo::split64BitValueForMapping( |
660 | MachineIRBuilder &B, |
661 | SmallVector<Register, 2> &Regs, |
662 | LLT HalfTy, |
663 | Register Reg) const { |
664 | assert(HalfTy.getSizeInBits() == 32); |
665 | MachineRegisterInfo *MRI = B.getMRI(); |
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
679 | } |
680 | |
681 | /// Replace the current type each register in \p Regs has with \p NewTy |
682 | static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, |
683 | LLT NewTy) { |
684 | for (Register Reg : Regs) { |
685 | assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()); |
    MRI.setType(Reg, NewTy);
687 | } |
688 | } |
689 | |
690 | static LLT getHalfSizedType(LLT Ty) { |
691 | if (Ty.isVector()) { |
692 | assert(Ty.getElementCount().isKnownMultipleOf(2)); |
    return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
                               Ty.getElementType());
695 | } |
696 | |
697 | assert(Ty.getScalarSizeInBits() % 2 == 0); |
  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
699 | } |
700 | |
701 | // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector |
702 | // source value into a scalar register. |
703 | Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B, |
704 | MachineRegisterInfo &MRI, |
705 | Register Src) const { |
  LLT Ty = MRI.getType(Src);
707 | const RegisterBank *Bank = getRegBank(Src, MRI, *TRI); |
708 | |
709 | if (Bank == &AMDGPU::SGPRRegBank) |
710 | return Src; |
711 | |
712 | unsigned Bits = Ty.getSizeInBits(); |
713 | assert(Bits % 32 == 0); |
714 | |
715 | if (Bank != &AMDGPU::VGPRRegBank) { |
716 | // We need to copy from AGPR to VGPR |
    Src = B.buildCopy(Ty, Src).getReg(0);
718 | MRI.setRegBank(Src, AMDGPU::VGPRRegBank); |
719 | } |
720 | |
  LLT S32 = LLT::scalar(32);
722 | unsigned NumParts = Bits / 32; |
723 | SmallVector<Register, 8> SrcParts; |
724 | SmallVector<Register, 8> DstParts; |
725 | |
726 | if (Bits == 32) { |
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
732 | } |
733 | |
734 | for (unsigned i = 0; i < NumParts; ++i) { |
735 | Register SrcPart = SrcParts[i]; |
736 | Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
738 | |
739 | const TargetRegisterClass *Constrained = |
740 | constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI); |
741 | (void)Constrained; |
    assert(Constrained && "Failed to constrain readfirstlane src reg");
743 | |
744 | B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart}); |
745 | |
    DstParts.push_back(DstPart);
747 | } |
748 | |
749 | if (Bits == 32) |
750 | return DstParts[0]; |
751 | |
  Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
753 | MRI.setRegBank(Dst, AMDGPU::SGPRRegBank); |
754 | return Dst; |
755 | } |
756 | |
757 | /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If |
758 | /// any of the required SGPR operands are VGPRs, perform a waterfall loop to |
759 | /// execute the instruction for each unique combination of values in all lanes |
760 | /// in the wave. The block will be split such that rest of the instructions are |
761 | /// moved to a new block. |
762 | /// |
763 | /// Essentially performs this loop: |
764 | // |
765 | /// Save Execution Mask |
766 | /// For (Lane : Wavefront) { |
767 | /// Enable Lane, Disable all other lanes |
768 | /// SGPR = read SGPR value for current lane from VGPR |
769 | /// VGPRResult[Lane] = use_op SGPR |
770 | /// } |
771 | /// Restore Execution Mask |
772 | /// |
/// There is additional complexity in comparing the operand values to identify
/// the unique values used.
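///
/// The resulting control flow, matching the block construction below, is
/// roughly:
///
///   MBB (save exec)
///     -> LoopBB (readfirstlane, compare, s_and_saveexec)
///     -> BodyBB (the rewritten instruction(s); s_xor exec)
///          -> LoopBB (if any lanes remain) or RestoreExecBB
///   RestoreExecBB (restore exec) -> RemainderBB (rest of the original block)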
775 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( |
776 | MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range, |
777 | SmallSet<Register, 4> &SGPROperandRegs) const { |
778 | // Track use registers which have already been expanded with a readfirstlane |
779 | // sequence. This may have multiple uses if moving a sequence. |
780 | DenseMap<Register, Register> WaterfalledRegMap; |
781 | |
782 | MachineBasicBlock &MBB = B.getMBB(); |
783 | MachineFunction *MF = &B.getMF(); |
784 | |
785 | const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); |
786 | const unsigned MovExecOpc = |
787 | Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
788 | const unsigned MovExecTermOpc = |
789 | Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; |
790 | |
791 | const unsigned XorTermOpc = Subtarget.isWave32() ? |
792 | AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; |
793 | const unsigned AndSaveExecOpc = Subtarget.isWave32() ? |
794 | AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; |
795 | const unsigned ExecReg = Subtarget.isWave32() ? |
796 | AMDGPU::EXEC_LO : AMDGPU::EXEC; |
797 | |
798 | #ifndef NDEBUG |
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
800 | #endif |
801 | |
802 | MachineRegisterInfo &MRI = *B.getMRI(); |
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);
812 | |
813 | // To insert the loop we need to split the block. Move everything before this |
814 | // point to a new block, and insert a new empty block before this instruction. |
815 | MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); |
816 | MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock(); |
817 | MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); |
818 | MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); |
819 | MachineFunction::iterator MBBI(MBB); |
820 | ++MBBI; |
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, BodyBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);
829 | |
830 | // Move the rest of the block into a new block. |
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());
838 | |
  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(BodyBB);
845 | |
846 | const DebugLoc &DL = B.getDL(); |
847 | |
848 | MachineInstr &FirstInst = *Range.begin(); |
849 | |
850 | // Move the instruction into the loop body. Note we moved everything after |
851 | // Range.end() already into a new block, so Range.end() is no longer valid. |
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
853 | |
854 | // Figure out the iterator range after splicing the instructions. |
855 | MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); |
856 | auto NewEnd = BodyBB->end(); |
857 | |
858 | B.setMBB(*LoopBB); |
859 | |
  LLT S1 = LLT::scalar(1);
861 | Register CondReg; |
862 | |
863 | assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); |
864 | |
  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
869 | continue; |
870 | |
871 | // See if we already processed this register in another instruction in the |
872 | // sequence. |
      auto OldVal = WaterfalledRegMap.find(OldReg);
874 | if (OldVal != WaterfalledRegMap.end()) { |
875 | Op.setReg(OldVal->second); |
876 | continue; |
877 | } |
878 | |
879 | Register OpReg = Op.getReg(); |
      LLT OpTy = MRI.getType(OpReg);
881 | |
882 | const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI); |
883 | if (OpBank != &AMDGPU::VGPRRegBank) { |
884 | // Insert copy from AGPR to VGPR before the loop. |
885 | B.setMBB(MBB); |
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
887 | MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank); |
888 | B.setMBB(*LoopBB); |
889 | } |
890 | |
      Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
892 | |
893 | // Build the comparison(s). |
894 | unsigned OpSize = OpTy.getSizeInBits(); |
895 | bool Is64 = OpSize % 64 == 0; |
896 | unsigned PartSize = Is64 ? 64 : 32; |
      LLT PartTy = LLT::scalar(PartSize);
898 | unsigned NumParts = OpSize / PartSize; |
899 | SmallVector<Register, 8> OpParts; |
900 | SmallVector<Register, 8> CurrentLaneParts; |
901 | |
902 | if (NumParts == 1) { |
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
        auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
911 | MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank); |
912 | MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank); |
913 | } |
914 | } |
915 | |
916 | for (unsigned i = 0; i < NumParts; ++i) { |
        auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
                                  OpParts[i]).getReg(0);
919 | MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank); |
920 | |
921 | if (!CondReg) { |
922 | CondReg = CmpReg; |
923 | } else { |
          CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
925 | MRI.setRegBank(CondReg, AMDGPU::VCCRegBank); |
926 | } |
927 | } |
928 | |
929 | Op.setReg(CurrentLaneReg); |
930 | |
931 | // Make sure we don't re-process this register again. |
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
933 | } |
934 | } |
935 | |
936 | // The ballot becomes a no-op during instruction selection. |
937 | CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot, |
938 | {LLT::scalar(Subtarget.isWave32() ? 32 : 64)}) |
939 | .addReg(CondReg) |
940 | .getReg(0); |
  MRI.setRegClass(CondReg, WaveRC);
942 | |
943 | // Update EXEC, save the original EXEC value to VCC. |
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);
949 | |
  B.setInsertPt(*BodyBB, BodyBB->end());
951 | |
952 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. |
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);
957 | |
958 | // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use |
959 | // s_cbranch_scc0? |
960 | |
961 | // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. |
962 | B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); |
963 | |
964 | // Save the EXEC mask before the loop. |
965 | BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) |
966 | .addReg(ExecReg); |
967 | |
968 | // Restore the EXEC mask after the loop. |
969 | B.setMBB(*RestoreExecBB); |
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);
973 | |
974 | // Set the insert point after the original instruction, so any new |
975 | // instructions will be in the remainder. |
  B.setInsertPt(*RemainderBB, RemainderBB->begin());
977 | |
978 | return true; |
979 | } |
980 | |
981 | // Return any unique registers used by \p MI at \p OpIndices that need to be |
982 | // handled in a waterfall loop. Returns these registers in \p |
983 | // SGPROperandRegs. Returns true if there are any operands to handle and a |
984 | // waterfall loop is necessary. |
985 | bool AMDGPURegisterBankInfo::collectWaterfallOperands( |
986 | SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, |
987 | MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { |
988 | for (unsigned Op : OpIndices) { |
989 | assert(MI.getOperand(Op).isUse()); |
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
994 | } |
995 | |
996 | // No operands need to be replaced, so no need to loop. |
997 | return !SGPROperandRegs.empty(); |
998 | } |
999 | |
1000 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( |
1001 | MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const { |
1002 | // Use a set to avoid extra readfirstlanes in the case where multiple operands |
1003 | // are the same register. |
1004 | SmallSet<Register, 4> SGPROperandRegs; |
1005 | |
1006 | if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices)) |
1007 | return false; |
1008 | |
1009 | MachineBasicBlock::iterator I = MI.getIterator(); |
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs);
1012 | } |
1013 | |
1014 | // Legalize an operand that must be an SGPR by inserting a readfirstlane. |
1015 | void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( |
1016 | MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const { |
  Register Reg = MI.getOperand(OpIdx).getReg();
1018 | MachineRegisterInfo &MRI = *B.getMRI(); |
1019 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); |
1020 | if (Bank == &AMDGPU::SGPRRegBank) |
1021 | return; |
1022 | |
  Reg = buildReadFirstLane(B, MRI, Reg);
  MI.getOperand(OpIdx).setReg(Reg);
1025 | } |
1026 | |
1027 | /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the |
1028 | /// rest will be in the remainder. |
1029 | static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { |
1030 | unsigned TotalSize = Ty.getSizeInBits(); |
1031 | if (!Ty.isVector()) |
    return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1033 | |
1034 | LLT EltTy = Ty.getElementType(); |
1035 | unsigned EltSize = EltTy.getSizeInBits(); |
1036 | assert(FirstSize % EltSize == 0); |
1037 | |
1038 | unsigned FirstPartNumElts = FirstSize / EltSize; |
1039 | unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; |
1040 | |
  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
          LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1043 | } |
1044 | |
1045 | static LLT widen96To128(LLT Ty) { |
1046 | if (!Ty.isVector()) |
    return LLT::scalar(128);
1048 | |
1049 | LLT EltTy = Ty.getElementType(); |
1050 | assert(128 % EltTy.getSizeInBits() == 0); |
  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1052 | } |
1053 | |
1054 | bool AMDGPURegisterBankInfo::applyMappingLoad( |
1055 | MachineIRBuilder &B, |
1056 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
1057 | MachineInstr &MI) const { |
1058 | MachineRegisterInfo &MRI = *B.getMRI(); |
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
1061 | unsigned LoadSize = LoadTy.getSizeInBits(); |
1062 | const unsigned MaxNonSmrdLoadSize = 128; |
1063 | |
1064 | const RegisterBank *DstBank = |
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1066 | if (DstBank == &AMDGPU::SGPRRegBank) { |
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads; otherwise we have nothing to do.
1069 | if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads())) |
1070 | return false; |
1071 | |
1072 | MachineMemOperand *MMO = *MI.memoperands_begin(); |
1073 | const unsigned MemSize = 8 * MMO->getSize().getValue(); |
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access; 8 or 16 bit
    // scalar loads should have a load size of 32 but a memory access size of
    // less than 32.
1078 | if (LoadSize == 32 && |
1079 | (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI))) |
1080 | return false; |
1081 | |
1082 | if (LoadSize == 32 && |
1083 | ((MemSize == 8 && MMO->getAlign() >= Align(1)) || |
1084 | (MemSize == 16 && MMO->getAlign() >= Align(2))) && |
1085 | isScalarLoadLegal(MI) && |
1086 | Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12) |
1087 | return false; |
1088 | |
    Register PtrReg = MI.getOperand(1).getReg();
1090 | |
1091 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
1092 | |
1093 | if (LoadSize == 32) { |
1094 | // This is an extending load from a sub-dword size. Widen the memory |
1095 | // access size to 4 bytes and clear the extra high bits appropriately |
      const LLT S32 = LLT::scalar(32);
1097 | if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) { |
1098 | // Must extend the sign bit into higher bits for a G_SEXTLOAD |
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1108 | } else { |
      // 96-bit loads are only available for vector loads. We need to split
      // this into a 64-bit part and a 32-bit part (unless we can widen to a
      // 128-bit load).
1111 | if (MMO->getAlign() < Align(16)) { |
1112 | LegalizerHelper Helper(B.getMF(), ApplyBank, B); |
1113 | LLT Part64, Part32; |
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1116 | LegalizerHelper::Legalized) |
1117 | return false; |
1118 | return true; |
1119 | } else { |
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        if (WiderTy.isScalar())
          B.buildTrunc(MI.getOperand(0), WideLoad);
        else {
          B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                              WideLoad);
1127 | } |
1128 | } |
1129 | } |
1130 | |
1131 | MI.eraseFromParent(); |
1132 | return true; |
1133 | } |
1134 | |
1135 | // 128-bit loads are supported for all instruction types. |
1136 | if (LoadSize <= MaxNonSmrdLoadSize) |
1137 | return false; |
1138 | |
  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());
1144 | |
1145 | assert(LoadSize % MaxNonSmrdLoadSize == 0); |
1146 | |
1147 | // RegBankSelect only emits scalar types, so we need to reset the pointer |
1148 | // operand to a pointer type. |
1149 | Register BasePtrReg = SrcRegs[0]; |
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);
1152 | |
1153 | unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; |
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1155 | ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); |
1156 | LegalizerHelper Helper(B.getMF(), O, B); |
1157 | |
1158 | if (LoadTy.isVector()) { |
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
        LegalizerHelper::Legalized)
1160 | return false; |
1161 | } else { |
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1163 | return false; |
1164 | } |
1165 | |
1166 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
1167 | return true; |
1168 | } |
1169 | |
1170 | bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( |
1171 | MachineIRBuilder &B, |
1172 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
1173 | MachineInstr &MI) const { |
1174 | MachineRegisterInfo &MRI = *B.getMRI(); |
1175 | const MachineFunction &MF = B.getMF(); |
1176 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1177 | const auto &TFI = *ST.getFrameLowering(); |
1178 | |
1179 | // Guard in case the stack growth direction ever changes with scratch |
1180 | // instructions. |
1181 | if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown) |
1182 | return false; |
1183 | |
  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1187 | |
1188 | const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI); |
1189 | |
1190 | // TODO: Need to emit a wave reduction to get the maximum size. |
1191 | if (SizeBank != &AMDGPU::SGPRRegBank) |
1192 | return false; |
1193 | |
  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1196 | |
1197 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
1198 | Register SPReg = Info->getStackPtrOffsetReg(); |
1199 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); |
1200 | |
  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1211 | } |
1212 | |
1213 | MI.eraseFromParent(); |
1214 | return true; |
1215 | } |
1216 | |
1217 | bool AMDGPURegisterBankInfo::applyMappingImage( |
1218 | MachineIRBuilder &B, MachineInstr &MI, |
1219 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
1220 | int RsrcIdx) const { |
1221 | const int NumDefs = MI.getNumExplicitDefs(); |
1222 | |
1223 | // The reported argument index is relative to the IR intrinsic call arguments, |
1224 | // so we need to shift by the number of defs and the intrinsic ID. |
1225 | RsrcIdx += NumDefs + 1; |
1226 | |
1227 | // Insert copies to VGPR arguments. |
1228 | applyDefaultMapping(OpdMapper); |
1229 | |
1230 | // Fixup any SGPR arguments. |
1231 | SmallVector<unsigned, 4> SGPRIndexes; |
1232 | for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { |
    if (!MI.getOperand(I).isReg())
1234 | continue; |
1235 | |
1236 | // If this intrinsic has a sampler, it immediately follows rsrc. |
1237 | if (I == RsrcIdx || I == RsrcIdx + 1) |
      SGPRIndexes.push_back(I);
1239 | } |
1240 | |
  executeInWaterfallLoop(B, MI, SGPRIndexes);
1242 | return true; |
1243 | } |
1244 | |
// Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and
// store the three offsets (voffset, soffset and instoffset).
1247 | unsigned AMDGPURegisterBankInfo::setBufferOffsets( |
1248 | MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg, |
1249 | Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const { |
  const LLT S32 = LLT::scalar(32);
1251 | MachineRegisterInfo *MRI = B.getMRI(); |
1252 | |
  if (std::optional<int64_t> Imm =
          getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1259 | InstOffsetVal = ImmOffset; |
1260 | |
1261 | B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); |
1262 | B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); |
1263 | return SOffset + ImmOffset; |
1264 | } |
1265 | } |
1266 | |
1267 | Register Base; |
1268 | unsigned Offset; |
1269 | |
  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1272 | |
1273 | uint32_t SOffset, ImmOffset; |
1274 | if ((int)Offset > 0 && |
      TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
1276 | if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) { |
1277 | VOffsetReg = Base; |
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1279 | B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); |
1280 | InstOffsetVal = ImmOffset; |
1281 | return 0; // XXX - Why is this 0? |
1282 | } |
1283 | |
1284 | // If we have SGPR base, we can use it for soffset. |
1285 | if (SOffset == 0) { |
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1287 | B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); |
1288 | SOffsetReg = Base; |
1289 | InstOffsetVal = ImmOffset; |
1290 | return 0; // XXX - Why is this 0? |
1291 | } |
1292 | } |
1293 | |
1294 | // Handle the variable sgpr + vgpr case. |
1295 | MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI); |
1296 | if (Add && (int)Offset >= 0) { |
    Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
    Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
1299 | |
1300 | const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI); |
1301 | const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI); |
1302 | |
1303 | if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { |
1304 | VOffsetReg = Src0; |
1305 | SOffsetReg = Src1; |
1306 | return 0; |
1307 | } |
1308 | |
1309 | if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) { |
1310 | VOffsetReg = Src1; |
1311 | SOffsetReg = Src0; |
1312 | return 0; |
1313 | } |
1314 | } |
1315 | |
1316 | // Ensure we have a VGPR for the combined offset. This could be an issue if we |
1317 | // have an SGPR offset and a VGPR resource. |
1318 | if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) { |
1319 | VOffsetReg = CombinedOffset; |
1320 | } else { |
1321 | VOffsetReg = B.buildCopy(Res: S32, Op: CombinedOffset).getReg(Idx: 0); |
1322 | B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); |
1323 | } |
1324 | |
1325 | SOffsetReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
1326 | B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); |
1327 | return 0; |
1328 | } |
1329 | |
1330 | bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( |
1331 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
1332 | MachineInstr &MI = OpdMapper.getMI(); |
1333 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
1334 | |
1335 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1336 | Register Dst = MI.getOperand(i: 0).getReg(); |
1337 | LLT Ty = MRI.getType(Reg: Dst); |
1338 | |
1339 | const RegisterBank *RSrcBank = |
1340 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
1341 | const RegisterBank *OffsetBank = |
1342 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
1343 | if (RSrcBank == &AMDGPU::SGPRRegBank && |
1344 | OffsetBank == &AMDGPU::SGPRRegBank) |
1345 | return true; // Legal mapping |
1346 | |
1347 | // FIXME: 96-bit case was widened during legalize. We need to narrow it back |
1348 | // here but don't have an MMO. |
1349 | |
1350 | unsigned LoadSize = Ty.getSizeInBits(); |
1351 | int NumLoads = 1; |
1352 | if (LoadSize == 256 || LoadSize == 512) { |
1353 | NumLoads = LoadSize / 128; |
1354 | Ty = Ty.divide(Factor: NumLoads); |
1355 | } |
1356 | |
1357 | // Use the alignment to ensure that the required offsets will fit into the |
1358 | // immediate offsets. |
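  //
  // For example (a sketch): a 512-bit result is split into four 128-bit loads
  // at byte offsets +0, +16, +32 and +48, so the combined offset is aligned to
  // Align(64) to keep every split offset encodable.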
1359 | const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1); |
1360 | |
1361 | MachineFunction &MF = B.getMF(); |
1362 | |
1363 | Register SOffset; |
1364 | Register VOffset; |
1365 | int64_t ImmOffset = 0; |
1366 | |
1367 | unsigned MMOOffset = setBufferOffsets(B, CombinedOffset: MI.getOperand(i: 2).getReg(), VOffsetReg&: VOffset, |
1368 | SOffsetReg&: SOffset, InstOffsetVal&: ImmOffset, Alignment); |
1369 | |
1370 | // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we |
1371 | // can, but we need to track an MMO for that. |
1372 | const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; |
1373 | const Align MemAlign(4); // FIXME: ABI type alignment? |
1374 | MachineMemOperand *BaseMMO = MF.getMachineMemOperand( |
1375 | PtrInfo: MachinePointerInfo(), |
1376 | F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | |
1377 | MachineMemOperand::MOInvariant, |
1378 | Size: MemSize, BaseAlignment: MemAlign); |
1379 | if (MMOOffset != 0) |
1380 | BaseMMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset, Size: MemSize); |
1381 | |
1382 | // If only the offset is divergent, emit a MUBUF buffer load instead. We can |
1383 | // assume that the buffer is unswizzled. |
1384 | |
1385 | Register RSrc = MI.getOperand(i: 1).getReg(); |
1386 | Register VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
1387 | B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank); |
1388 | |
1389 | SmallVector<Register, 4> LoadParts(NumLoads); |
1390 | |
1391 | MachineBasicBlock::iterator MII = MI.getIterator(); |
1392 | MachineInstrSpan Span(MII, &B.getMBB()); |
1393 | |
1394 | for (int i = 0; i < NumLoads; ++i) { |
1395 | if (NumLoads == 1) { |
1396 | LoadParts[i] = Dst; |
1397 | } else { |
1398 | LoadParts[i] = MRI.createGenericVirtualRegister(Ty); |
1399 | MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank); |
1400 | } |
1401 | |
1402 | MachineMemOperand *MMO = BaseMMO; |
1403 | if (i != 0) |
      MMO = MF.getMachineMemOperand(MMO: BaseMMO, Offset: MMOOffset + 16 * i, Size: MemSize);
1405 | |
1406 | B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD) |
1407 | .addDef(LoadParts[i]) // vdata |
1408 | .addUse(RSrc) // rsrc |
1409 | .addUse(VIndex) // vindex |
1410 | .addUse(VOffset) // voffset |
1411 | .addUse(SOffset) // soffset |
1412 | .addImm(ImmOffset + 16 * i) // offset(imm) |
1413 | .addImm(0) // cachepolicy, swizzled buffer(imm) |
1414 | .addImm(0) // idxen(imm) |
1415 | .addMemOperand(MMO); |
1416 | } |
1417 | |
1418 | // TODO: If only the resource is a VGPR, it may be better to execute the |
1419 | // scalar load in the waterfall loop if the resource is expected to frequently |
1420 | // be dynamically uniform. |
1421 | if (RSrcBank != &AMDGPU::SGPRRegBank) { |
1422 | // Remove the original instruction to avoid potentially confusing the |
1423 | // waterfall loop logic. |
1424 | B.setInstr(*Span.begin()); |
1425 | MI.eraseFromParent(); |
1426 | |
1427 | SmallSet<Register, 4> OpsToWaterfall; |
1428 | |
1429 | OpsToWaterfall.insert(V: RSrc); |
1430 | executeInWaterfallLoop(B, make_range(x: Span.begin(), y: Span.end()), |
1431 | OpsToWaterfall); |
1432 | } |
1433 | |
1434 | if (NumLoads != 1) { |
1435 | if (Ty.isVector()) |
1436 | B.buildConcatVectors(Res: Dst, Ops: LoadParts); |
1437 | else |
1438 | B.buildMergeLikeInstr(Res: Dst, Ops: LoadParts); |
1439 | } |
1440 | |
1441 | // We removed the instruction earlier with a waterfall loop. |
1442 | if (RSrcBank == &AMDGPU::SGPRRegBank) |
1443 | MI.eraseFromParent(); |
1444 | |
1445 | return true; |
1446 | } |
1447 | |
1448 | bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B, |
1449 | const OperandsMapper &OpdMapper, |
1450 | bool Signed) const { |
1451 | MachineInstr &MI = OpdMapper.getMI(); |
1452 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
1453 | |
1454 | // Insert basic copies |
1455 | applyDefaultMapping(OpdMapper); |
1456 | |
1457 | Register DstReg = MI.getOperand(i: 0).getReg(); |
1458 | LLT Ty = MRI.getType(Reg: DstReg); |
1459 | |
1460 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1461 | |
1462 | unsigned FirstOpnd = isa<GIntrinsic>(Val: MI) ? 2 : 1; |
1463 | Register SrcReg = MI.getOperand(i: FirstOpnd).getReg(); |
1464 | Register OffsetReg = MI.getOperand(i: FirstOpnd + 1).getReg(); |
1465 | Register WidthReg = MI.getOperand(i: FirstOpnd + 2).getReg(); |
1466 | |
1467 | const RegisterBank *DstBank = |
1468 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
1469 | if (DstBank == &AMDGPU::VGPRRegBank) { |
1470 | if (Ty == S32) |
1471 | return true; |
1472 | |
    // There is no 64-bit vgpr bitfield extract instruction, so the operation
    // is expanded to a sequence of instructions that implement it.
1475 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); |
1476 | |
1477 | const LLT S64 = LLT::scalar(SizeInBits: 64); |
1478 | // Shift the source operand so that extracted bits start at bit 0. |
1479 | auto ShiftOffset = Signed ? B.buildAShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg) |
1480 | : B.buildLShr(Dst: S64, Src0: SrcReg, Src1: OffsetReg); |
1481 | auto UnmergeSOffset = B.buildUnmerge(Res: {S32, S32}, Op: ShiftOffset); |
1482 | |
1483 | // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions |
1484 | // if the width is a constant. |
1485 | if (auto ConstWidth = getIConstantVRegValWithLookThrough(VReg: WidthReg, MRI)) { |
1486 | // Use the 32-bit bitfield extract instruction if the width is a constant. |
1487 | // Depending on the width size, use either the low or high 32-bits. |
1488 | auto Zero = B.buildConstant(Res: S32, Val: 0); |
1489 | auto WidthImm = ConstWidth->Value.getZExtValue(); |
1490 | if (WidthImm <= 32) { |
1491 | // Use bitfield extract on the lower 32-bit source, and then sign-extend |
1492 | // or clear the upper 32-bits. |
        auto Extract =
            Signed ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg)
                   : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 0), LSB: Zero, Width: WidthReg);
1496 | auto Extend = |
1497 | Signed ? B.buildAShr(Dst: S32, Src0: Extract, Src1: B.buildConstant(Res: S32, Val: 31)) : Zero; |
1498 | B.buildMergeLikeInstr(Res: DstReg, Ops: {Extract, Extend}); |
1499 | } else { |
1500 | // Use bitfield extract on upper 32-bit source, and combine with lower |
1501 | // 32-bit source. |
1502 | auto UpperWidth = B.buildConstant(Res: S32, Val: WidthImm - 32); |
        auto Extract =
            Signed
                ? B.buildSbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth)
                : B.buildUbfx(Dst: S32, Src: UnmergeSOffset.getReg(Idx: 1), LSB: Zero, Width: UpperWidth);
1507 | B.buildMergeLikeInstr(Res: DstReg, Ops: {UnmergeSOffset.getReg(Idx: 0), Extract}); |
1508 | } |
1509 | MI.eraseFromParent(); |
1510 | return true; |
1511 | } |
1512 | |
1513 | // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit |
1514 | // operations. |
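    //
    // For example (a sketch): a signed extract with Offset = 4 and Width = 8
    // becomes ((Src >> 4) << 56) a>> 56, leaving the 8-bit field sign-extended
    // across the 64-bit result.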
1515 | auto ExtShift = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 64), Src1: WidthReg); |
1516 | auto SignBit = B.buildShl(Dst: S64, Src0: ShiftOffset, Src1: ExtShift); |
1517 | if (Signed) |
1518 | B.buildAShr(Dst: S64, Src0: SignBit, Src1: ExtShift); |
1519 | else |
1520 | B.buildLShr(Dst: S64, Src0: SignBit, Src1: ExtShift); |
1521 | MI.eraseFromParent(); |
1522 | return true; |
1523 | } |
1524 | |
1525 | // The scalar form packs the offset and width in a single operand. |
1526 | |
1527 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); |
1528 | |
1529 | // Ensure the high bits are clear to insert the offset. |
1530 | auto OffsetMask = B.buildConstant(Res: S32, Val: maskTrailingOnes<unsigned>(N: 6)); |
1531 | auto ClampOffset = B.buildAnd(Dst: S32, Src0: OffsetReg, Src1: OffsetMask); |
1532 | |
1533 | // Zeros out the low bits, so don't bother clamping the input value. |
1534 | auto ShiftWidth = B.buildShl(Dst: S32, Src0: WidthReg, Src1: B.buildConstant(Res: S32, Val: 16)); |
1535 | |
1536 | // Transformation function, pack the offset and width of a BFE into |
1537 | // the format expected by the S_BFE_I32 / S_BFE_U32. In the second |
1538 | // source, bits [5:0] contain the offset and bits [22:16] the width. |
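  //
  // For example (a sketch): Offset = 5 and Width = 10 pack to
  // (10 << 16) | 5 == 0x000A0005.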
1539 | auto MergedInputs = B.buildOr(Dst: S32, Src0: ClampOffset, Src1: ShiftWidth); |
1540 | |
1541 | // TODO: It might be worth using a pseudo here to avoid scc clobber and |
1542 | // register class constraints. |
1543 | unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : |
1544 | (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); |
1545 | |
1546 | auto MIB = B.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {SrcReg, MergedInputs}); |
1547 | if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) |
1548 | llvm_unreachable("failed to constrain BFE" ); |
1549 | |
1550 | MI.eraseFromParent(); |
1551 | return true; |
1552 | } |
1553 | |
1554 | bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( |
1555 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
1556 | MachineInstr &MI = OpdMapper.getMI(); |
1557 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
1558 | |
1559 | // Insert basic copies. |
1560 | applyDefaultMapping(OpdMapper); |
1561 | |
1562 | Register Dst0 = MI.getOperand(i: 0).getReg(); |
1563 | Register Dst1 = MI.getOperand(i: 1).getReg(); |
1564 | Register Src0 = MI.getOperand(i: 2).getReg(); |
1565 | Register Src1 = MI.getOperand(i: 3).getReg(); |
1566 | Register Src2 = MI.getOperand(i: 4).getReg(); |
1567 | |
1568 | if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank) |
1569 | return true; |
1570 | |
1571 | bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; |
1572 | LLT S1 = LLT::scalar(SizeInBits: 1); |
1573 | LLT S32 = LLT::scalar(SizeInBits: 32); |
1574 | |
1575 | bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank; |
1576 | bool Accumulate = true; |
1577 | |
1578 | if (!DstOnValu) { |
1579 | if (mi_match(R: Src2, MRI, P: m_ZeroInt())) |
1580 | Accumulate = false; |
1581 | } |
1582 | |
1583 | // Keep the multiplication on the SALU. |
1584 | Register DstHi; |
1585 | Register DstLo = B.buildMul(Dst: S32, Src0, Src1).getReg(Idx: 0); |
1586 | bool MulHiInVgpr = false; |
1587 | |
1588 | MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank); |
1589 | |
1590 | if (Subtarget.hasSMulHi()) { |
1591 | DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0, Src1).getReg(Idx: 0) |
1592 | : B.buildSMulH(Dst: S32, Src0, Src1).getReg(Idx: 0); |
1593 | MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank); |
1594 | } else { |
1595 | Register VSrc0 = B.buildCopy(Res: S32, Op: Src0).getReg(Idx: 0); |
1596 | Register VSrc1 = B.buildCopy(Res: S32, Op: Src1).getReg(Idx: 0); |
1597 | |
1598 | MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank); |
1599 | MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank); |
1600 | |
1601 | DstHi = IsUnsigned ? B.buildUMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0) |
1602 | : B.buildSMulH(Dst: S32, Src0: VSrc0, Src1: VSrc1).getReg(Idx: 0); |
1603 | MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); |
1604 | |
1605 | if (!DstOnValu) { |
1606 | DstHi = buildReadFirstLane(B, MRI, Src: DstHi); |
1607 | } else { |
1608 | MulHiInVgpr = true; |
1609 | } |
1610 | } |
1611 | |
1612 | // Accumulate and produce the "carry-out" bit. |
1613 | // |
1614 | // The "carry-out" is defined as bit 64 of the result when computed as a |
1615 | // big integer. For unsigned multiply-add, this matches the usual definition |
1616 | // of carry-out. For signed multiply-add, bit 64 is the sign bit of the |
1617 | // result, which is determined as: |
1618 | // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add |
1619 | LLT CarryType = DstOnValu ? S1 : S32; |
1620 | const RegisterBank &CarryBank = |
1621 | DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; |
1622 | const RegisterBank &DstBank = |
1623 | DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; |
1624 | Register Carry; |
1625 | Register Zero; |
1626 | |
1627 | if (!IsUnsigned) { |
1628 | Zero = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
1629 | MRI.setRegBank(Zero, |
1630 | MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); |
1631 | |
1632 | Carry = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: MulHiInVgpr ? S1 : S32, Op0: DstHi, Op1: Zero) |
1633 | .getReg(Idx: 0); |
1634 | MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank |
1635 | : AMDGPU::SGPRRegBank); |
1636 | |
1637 | if (DstOnValu && !MulHiInVgpr) { |
1638 | Carry = B.buildTrunc(Res: S1, Op: Carry).getReg(Idx: 0); |
1639 | MRI.setRegBank(Carry, AMDGPU::VCCRegBank); |
1640 | } |
1641 | } |
1642 | |
1643 | if (Accumulate) { |
1644 | if (DstOnValu) { |
1645 | DstLo = B.buildCopy(Res: S32, Op: DstLo).getReg(Idx: 0); |
1646 | DstHi = B.buildCopy(Res: S32, Op: DstHi).getReg(Idx: 0); |
1647 | MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank); |
1648 | MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); |
1649 | } |
1650 | |
1651 | auto Unmerge = B.buildUnmerge(Res: S32, Op: Src2); |
1652 | Register Src2Lo = Unmerge.getReg(Idx: 0); |
1653 | Register Src2Hi = Unmerge.getReg(Idx: 1); |
1654 | MRI.setRegBank(Reg: Src2Lo, RegBank: DstBank); |
1655 | MRI.setRegBank(Reg: Src2Hi, RegBank: DstBank); |
1656 | |
1657 | if (!IsUnsigned) { |
1658 | auto Src2Sign = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: CarryType, Op0: Src2Hi, Op1: Zero); |
1659 | MRI.setRegBank(Reg: Src2Sign.getReg(Idx: 0), RegBank: CarryBank); |
1660 | |
1661 | Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: Src2Sign).getReg(Idx: 0); |
1662 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
1663 | } |
1664 | |
1665 | auto AddLo = B.buildUAddo(Res: S32, CarryOut: CarryType, Op0: DstLo, Op1: Src2Lo); |
1666 | DstLo = AddLo.getReg(Idx: 0); |
1667 | Register CarryLo = AddLo.getReg(Idx: 1); |
1668 | MRI.setRegBank(Reg: DstLo, RegBank: DstBank); |
1669 | MRI.setRegBank(Reg: CarryLo, RegBank: CarryBank); |
1670 | |
1671 | auto AddHi = B.buildUAdde(Res: S32, CarryOut: CarryType, Op0: DstHi, Op1: Src2Hi, CarryIn: CarryLo); |
1672 | DstHi = AddHi.getReg(Idx: 0); |
1673 | MRI.setRegBank(Reg: DstHi, RegBank: DstBank); |
1674 | |
1675 | Register CarryHi = AddHi.getReg(Idx: 1); |
1676 | MRI.setRegBank(Reg: CarryHi, RegBank: CarryBank); |
1677 | |
1678 | if (IsUnsigned) { |
1679 | Carry = CarryHi; |
1680 | } else { |
1681 | Carry = B.buildXor(Dst: CarryType, Src0: Carry, Src1: CarryHi).getReg(Idx: 0); |
1682 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
1683 | } |
1684 | } else { |
1685 | if (IsUnsigned) { |
1686 | Carry = B.buildConstant(Res: CarryType, Val: 0).getReg(Idx: 0); |
1687 | MRI.setRegBank(Reg: Carry, RegBank: CarryBank); |
1688 | } |
1689 | } |
1690 | |
1691 | B.buildMergeLikeInstr(Res: Dst0, Ops: {DstLo, DstHi}); |
1692 | |
1693 | if (DstOnValu) { |
1694 | B.buildCopy(Res: Dst1, Op: Carry); |
1695 | } else { |
1696 | B.buildTrunc(Res: Dst1, Op: Carry); |
1697 | } |
1698 | |
1699 | MI.eraseFromParent(); |
1700 | return true; |
1701 | } |
1702 | |
1703 | // Return a suitable opcode for extending the operands of Opc when widening. |
1704 | static unsigned getExtendOp(unsigned Opc) { |
1705 | switch (Opc) { |
1706 | case TargetOpcode::G_ASHR: |
1707 | case TargetOpcode::G_SMIN: |
1708 | case TargetOpcode::G_SMAX: |
1709 | return TargetOpcode::G_SEXT; |
1710 | case TargetOpcode::G_LSHR: |
1711 | case TargetOpcode::G_UMIN: |
1712 | case TargetOpcode::G_UMAX: |
1713 | return TargetOpcode::G_ZEXT; |
1714 | default: |
1715 | return TargetOpcode::G_ANYEXT; |
1716 | } |
1717 | } |
1718 | |
1719 | // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding |
1720 | // any illegal vector extend or unmerge operations. |
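//
// For example (a sketch of the zero-extending case), unpacking
// %v:vgpr(<2 x s16>) emits:
//   %b:_(s32) = G_BITCAST %v
//   %lo:_(s32) = G_AND %b, 0xffff
//   %hi:_(s32) = G_LSHR %b, 16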
1721 | static std::pair<Register, Register> |
1722 | unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { |
1723 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1724 | auto Bitcast = B.buildBitcast(Dst: S32, Src); |
1725 | |
1726 | if (ExtOpcode == TargetOpcode::G_SEXT) { |
1727 | auto ExtLo = B.buildSExtInReg(Res: S32, Op: Bitcast, ImmOp: 16); |
1728 | auto ShiftHi = B.buildAShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16)); |
1729 | return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
1730 | } |
1731 | |
1732 | auto ShiftHi = B.buildLShr(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 16)); |
1733 | if (ExtOpcode == TargetOpcode::G_ZEXT) { |
1734 | auto ExtLo = B.buildAnd(Dst: S32, Src0: Bitcast, Src1: B.buildConstant(Res: S32, Val: 0xffff)); |
1735 | return std::pair(ExtLo.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
1736 | } |
1737 | |
1738 | assert(ExtOpcode == TargetOpcode::G_ANYEXT); |
1739 | return std::pair(Bitcast.getReg(Idx: 0), ShiftHi.getReg(Idx: 0)); |
1740 | } |
1741 | |
// For cases where only a single copy is inserted for matching register banks,
// replace the register in the instruction operand.
1744 | static bool substituteSimpleCopyRegs( |
1745 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { |
1746 | SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); |
1747 | if (!SrcReg.empty()) { |
1748 | assert(SrcReg.size() == 1); |
1749 | OpdMapper.getMI().getOperand(i: OpIdx).setReg(SrcReg[0]); |
1750 | return true; |
1751 | } |
1752 | |
1753 | return false; |
1754 | } |
1755 | |
1756 | /// Handle register layout difference for f16 images for some subtargets. |
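///
/// On subtargets with unpacked d16 VMEM, every 16-bit component occupies a
/// full 32-bit register. For example (a sketch), a <4 x s16> store value is
/// unmerged into four s16 pieces and remerged as a <4 x s32>.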
1757 | Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, |
1758 | MachineRegisterInfo &MRI, |
1759 | Register Reg) const { |
1760 | if (!Subtarget.hasUnpackedD16VMem()) |
1761 | return Reg; |
1762 | |
1763 | const LLT S16 = LLT::scalar(SizeInBits: 16); |
1764 | LLT StoreVT = MRI.getType(Reg); |
1765 | if (!StoreVT.isVector() || StoreVT.getElementType() != S16) |
1766 | return Reg; |
1767 | |
  auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);

1771 | SmallVector<Register, 4> WideRegs; |
1772 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) |
1773 | WideRegs.push_back(Elt: Unmerge.getReg(Idx: I)); |
1774 | |
1775 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1776 | int NumElts = StoreVT.getNumElements(); |
1777 | |
1778 | return B.buildMergeLikeInstr(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs) |
1779 | .getReg(Idx: 0); |
1780 | } |
1781 | |
1782 | static std::pair<Register, unsigned> |
1783 | getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { |
1784 | int64_t Const; |
1785 | if (mi_match(R: Reg, MRI, P: m_ICst(Cst&: Const))) |
1786 | return std::pair(Register(), Const); |
1787 | |
1788 | Register Base; |
1789 | if (mi_match(R: Reg, MRI, P: m_GAdd(L: m_Reg(R&: Base), R: m_ICst(Cst&: Const)))) |
1790 | return std::pair(Base, Const); |
1791 | |
1792 | // TODO: Handle G_OR used for add case |
1793 | return std::pair(Reg, 0); |
1794 | } |
1795 | |
1796 | std::pair<Register, unsigned> |
1797 | AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, |
1798 | Register OrigOffset) const { |
1799 | const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: Subtarget); |
1800 | Register BaseReg; |
1801 | unsigned ImmOffset; |
1802 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1803 | |
1804 | // TODO: Use AMDGPU::getBaseWithConstantOffset() instead. |
1805 | std::tie(args&: BaseReg, args&: ImmOffset) = getBaseWithConstantOffset(MRI&: *B.getMRI(), |
1806 | Reg: OrigOffset); |
1807 | |
1808 | unsigned C1 = 0; |
1809 | if (ImmOffset != 0) { |
1810 | // If the immediate value is too big for the immoffset field, put only bits |
1811 | // that would normally fit in the immoffset field. The remaining value that |
1812 | // is copied/added for the voffset field is a large power of 2, and it |
1813 | // stands more chance of being CSEd with the copy/add for another similar |
1814 | // load/store. |
1815 | // However, do not do that rounding down if that is a negative |
1816 | // number, as it appears to be illegal to have a negative offset in the |
1817 | // vgpr, even if adding the immediate offset makes it positive. |
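    //
    // For example (a sketch, assuming MaxImm == 4095): ImmOffset == 4160
    // gives Overflow == 4096 and a final ImmOffset == 64, so only the large
    // power of 2 moves into the base register.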
1818 | unsigned Overflow = ImmOffset & ~MaxImm; |
1819 | ImmOffset -= Overflow; |
1820 | if ((int32_t)Overflow < 0) { |
1821 | Overflow += ImmOffset; |
1822 | ImmOffset = 0; |
1823 | } |
1824 | |
1825 | C1 = ImmOffset; |
1826 | if (Overflow != 0) { |
1827 | if (!BaseReg) |
1828 | BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0); |
1829 | else { |
1830 | auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow); |
1831 | BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0); |
1832 | } |
1833 | } |
1834 | } |
1835 | |
1836 | if (!BaseReg) |
1837 | BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0); |
1838 | |
1839 | return {BaseReg, C1}; |
1840 | } |
1841 | |
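// Copy an SGPR value into a VGPR destination using explicit V_MOV_B32s so the
// exec-mask dependency stays visible. For example (a sketch), a 64-bit copy
// becomes two V_MOV_B32_e32 of sub0/sub1 followed by a REG_SEQUENCE.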
1842 | bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, |
1843 | Register SrcReg) const { |
1844 | MachineRegisterInfo &MRI = *B.getMRI(); |
1845 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
1846 | if (SrcTy.getSizeInBits() == 32) { |
1847 | // Use a v_mov_b32 here to make the exec dependency explicit. |
1848 | B.buildInstr(AMDGPU::V_MOV_B32_e32) |
1849 | .addDef(DstReg) |
1850 | .addUse(SrcReg); |
1851 | return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) && |
1852 | constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI); |
1853 | } |
1854 | |
1855 | Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1856 | Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
1857 | |
1858 | B.buildInstr(AMDGPU::V_MOV_B32_e32) |
1859 | .addDef(TmpReg0) |
1860 | .addUse(SrcReg, 0, AMDGPU::sub0); |
1861 | B.buildInstr(AMDGPU::V_MOV_B32_e32) |
1862 | .addDef(TmpReg1) |
1863 | .addUse(SrcReg, 0, AMDGPU::sub1); |
1864 | B.buildInstr(AMDGPU::REG_SEQUENCE) |
1865 | .addDef(DstReg) |
1866 | .addUse(TmpReg0) |
1867 | .addImm(AMDGPU::sub0) |
1868 | .addUse(TmpReg1) |
1869 | .addImm(AMDGPU::sub1); |
1870 | |
1871 | return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) && |
1872 | constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); |
1873 | } |
1874 | |
1875 | /// Utility function for pushing dynamic vector indexes with a constant offset |
1876 | /// into waterfall loops. |
1877 | static void reinsertVectorIndexAdd(MachineIRBuilder &B, |
1878 | MachineInstr &IdxUseInstr, |
1879 | unsigned OpIdx, |
1880 | unsigned ConstOffset) { |
1881 | MachineRegisterInfo &MRI = *B.getMRI(); |
1882 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
1883 | Register WaterfallIdx = IdxUseInstr.getOperand(i: OpIdx).getReg(); |
1884 | B.setInsertPt(MBB&: *IdxUseInstr.getParent(), II: IdxUseInstr.getIterator()); |
1885 | |
1886 | auto MaterializedOffset = B.buildConstant(Res: S32, Val: ConstOffset); |
1887 | |
1888 | auto Add = B.buildAdd(Dst: S32, Src0: WaterfallIdx, Src1: MaterializedOffset); |
1889 | MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); |
1890 | MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); |
1891 | IdxUseInstr.getOperand(i: OpIdx).setReg(Add.getReg(Idx: 0)); |
1892 | } |
1893 | |
1894 | /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the |
1895 | /// original 32-bit source value (to be inserted in the low part of the combined |
1896 | /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit |
1897 | /// value. |
1898 | static void extendLow32IntoHigh32(MachineIRBuilder &B, |
1899 | Register Hi32Reg, Register Lo32Reg, |
1900 | unsigned ExtOpc, |
1901 | const RegisterBank &RegBank, |
1902 | bool IsBooleanSrc = false) { |
1903 | if (ExtOpc == AMDGPU::G_ZEXT) { |
1904 | B.buildConstant(Res: Hi32Reg, Val: 0); |
1905 | } else if (ExtOpc == AMDGPU::G_SEXT) { |
1906 | if (IsBooleanSrc) { |
1907 | // If we know the original source was an s1, the high half is the same as |
1908 | // the low. |
1909 | B.buildCopy(Res: Hi32Reg, Op: Lo32Reg); |
1910 | } else { |
1911 | // Replicate sign bit from 32-bit extended part. |
1912 | auto ShiftAmt = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 31); |
1913 | B.getMRI()->setRegBank(Reg: ShiftAmt.getReg(Idx: 0), RegBank); |
1914 | B.buildAShr(Dst: Hi32Reg, Src0: Lo32Reg, Src1: ShiftAmt); |
1915 | } |
1916 | } else { |
    assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1918 | B.buildUndef(Res: Hi32Reg); |
1919 | } |
1920 | } |
1921 | |
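// Fold a dynamic G_EXTRACT_VECTOR_ELT into a chain of compares and selects
// when SITargetLowering::shouldExpandVectorDynExt deems it profitable. For
// example (a sketch), extracting %idx from a <2 x s32> %vec becomes:
//   %c = G_ICMP eq, %idx, 1
//   %res = G_SELECT %c, %vec.elt1, %vec.elt0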
1922 | bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( |
1923 | MachineIRBuilder &B, MachineInstr &MI, |
1924 | const OperandsMapper &OpdMapper) const { |
1925 | MachineRegisterInfo &MRI = *B.getMRI(); |
1926 | |
1927 | Register VecReg = MI.getOperand(i: 1).getReg(); |
1928 | Register Idx = MI.getOperand(i: 2).getReg(); |
1929 | |
1930 | const RegisterBank &IdxBank = |
1931 | *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
1932 | |
1933 | bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; |
1934 | |
1935 | LLT VecTy = MRI.getType(Reg: VecReg); |
1936 | unsigned EltSize = VecTy.getScalarSizeInBits(); |
1937 | unsigned NumElem = VecTy.getNumElements(); |
1938 | |
1939 | if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, |
1940 | IsDivergentIdx, Subtarget: &Subtarget)) |
1941 | return false; |
1942 | |
1943 | LLT S32 = LLT::scalar(SizeInBits: 32); |
1944 | |
1945 | const RegisterBank &DstBank = |
1946 | *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
1947 | const RegisterBank &SrcBank = |
1948 | *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
1949 | |
1950 | const RegisterBank &CCBank = |
1951 | (DstBank == AMDGPU::SGPRRegBank && |
1952 | SrcBank == AMDGPU::SGPRRegBank && |
1953 | IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank |
1954 | : AMDGPU::VCCRegBank; |
1955 | LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); |
1956 | |
1957 | if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { |
1958 | Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg(); |
1959 | MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); |
1960 | } |
1961 | |
1962 | LLT EltTy = VecTy.getScalarType(); |
1963 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
1964 | unsigned NumLanes = DstRegs.size(); |
1965 | if (!NumLanes) |
1966 | NumLanes = 1; |
1967 | else |
1968 | EltTy = MRI.getType(Reg: DstRegs[0]); |
1969 | |
1970 | auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg); |
1971 | SmallVector<Register, 2> Res(NumLanes); |
1972 | for (unsigned L = 0; L < NumLanes; ++L) |
1973 | Res[L] = UnmergeToEltTy.getReg(Idx: L); |
1974 | |
1975 | for (unsigned I = 1; I < NumElem; ++I) { |
1976 | auto IC = B.buildConstant(Res: S32, Val: I); |
1977 | MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); |
1978 | auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC); |
1979 | MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank); |
1980 | |
1981 | for (unsigned L = 0; L < NumLanes; ++L) { |
1982 | auto S = B.buildSelect(Res: EltTy, Tst: Cmp, |
1983 | Op0: UnmergeToEltTy.getReg(Idx: I * NumLanes + L), Op1: Res[L]); |
1984 | |
1985 | for (unsigned N : { 0, 2, 3 }) |
1986 | MRI.setRegBank(Reg: S->getOperand(i: N).getReg(), RegBank: DstBank); |
1987 | |
1988 | Res[L] = S->getOperand(i: 0).getReg(); |
1989 | } |
1990 | } |
1991 | |
1992 | for (unsigned L = 0; L < NumLanes; ++L) { |
1993 | Register DstReg = (NumLanes == 1) ? MI.getOperand(i: 0).getReg() : DstRegs[L]; |
1994 | B.buildCopy(Res: DstReg, Op: Res[L]); |
1995 | MRI.setRegBank(Reg: DstReg, RegBank: DstBank); |
1996 | } |
1997 | |
1998 | MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank); |
1999 | MI.eraseFromParent(); |
2000 | |
2001 | return true; |
2002 | } |
2003 | |
2004 | // Insert a cross regbank copy for a register if it already has a bank that |
2005 | // differs from the one we want to set. |
2006 | static Register constrainRegToBank(MachineRegisterInfo &MRI, |
2007 | MachineIRBuilder &B, Register &Reg, |
2008 | const RegisterBank &Bank) { |
2009 | const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); |
2010 | if (CurrBank && *CurrBank != Bank) { |
2011 | Register Copy = B.buildCopy(Res: MRI.getType(Reg), Op: Reg).getReg(Idx: 0); |
2012 | MRI.setRegBank(Reg: Copy, RegBank: Bank); |
2013 | return Copy; |
2014 | } |
2015 | |
2016 | MRI.setRegBank(Reg, RegBank: Bank); |
2017 | return Reg; |
2018 | } |
2019 | |
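// Fold a dynamic G_INSERT_VECTOR_ELT the same way: each result element is a
// select between the inserted value and the original element, keyed on
// whether the index equals that element's position (analogous to the extract
// sketch above).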
2020 | bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( |
2021 | MachineIRBuilder &B, MachineInstr &MI, |
2022 | const OperandsMapper &OpdMapper) const { |
2023 | |
2024 | MachineRegisterInfo &MRI = *B.getMRI(); |
2025 | Register VecReg = MI.getOperand(i: 1).getReg(); |
2026 | Register Idx = MI.getOperand(i: 3).getReg(); |
2027 | |
2028 | const RegisterBank &IdxBank = |
2029 | *OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank; |
2030 | |
2031 | bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; |
2032 | |
2033 | LLT VecTy = MRI.getType(Reg: VecReg); |
2034 | unsigned EltSize = VecTy.getScalarSizeInBits(); |
2035 | unsigned NumElem = VecTy.getNumElements(); |
2036 | |
2037 | if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, |
2038 | IsDivergentIdx, Subtarget: &Subtarget)) |
2039 | return false; |
2040 | |
2041 | LLT S32 = LLT::scalar(SizeInBits: 32); |
2042 | |
2043 | const RegisterBank &DstBank = |
2044 | *OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2045 | const RegisterBank &SrcBank = |
2046 | *OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
2047 | const RegisterBank &InsBank = |
2048 | *OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
2049 | |
2050 | const RegisterBank &CCBank = |
2051 | (DstBank == AMDGPU::SGPRRegBank && |
2052 | SrcBank == AMDGPU::SGPRRegBank && |
2053 | InsBank == AMDGPU::SGPRRegBank && |
2054 | IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank |
2055 | : AMDGPU::VCCRegBank; |
2056 | LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); |
2057 | |
2058 | if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { |
2059 | Idx = B.buildCopy(Res: S32, Op: Idx)->getOperand(i: 0).getReg(); |
2060 | MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); |
2061 | } |
2062 | |
2063 | LLT EltTy = VecTy.getScalarType(); |
2064 | SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2)); |
2065 | unsigned NumLanes = InsRegs.size(); |
2066 | if (!NumLanes) { |
2067 | NumLanes = 1; |
2068 | InsRegs.push_back(Elt: MI.getOperand(i: 2).getReg()); |
2069 | } else { |
2070 | EltTy = MRI.getType(Reg: InsRegs[0]); |
2071 | } |
2072 | |
2073 | auto UnmergeToEltTy = B.buildUnmerge(Res: EltTy, Op: VecReg); |
2074 | SmallVector<Register, 16> Ops(NumElem * NumLanes); |
2075 | |
2076 | for (unsigned I = 0; I < NumElem; ++I) { |
2077 | auto IC = B.buildConstant(Res: S32, Val: I); |
2078 | MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); |
2079 | auto Cmp = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CCTy, Op0: Idx, Op1: IC); |
2080 | MRI.setRegBank(Reg: Cmp->getOperand(i: 0).getReg(), RegBank: CCBank); |
2081 | |
2082 | for (unsigned L = 0; L < NumLanes; ++L) { |
2083 | Register Op0 = constrainRegToBank(MRI, B, Reg&: InsRegs[L], Bank: DstBank); |
2084 | Register Op1 = UnmergeToEltTy.getReg(Idx: I * NumLanes + L); |
2085 | Op1 = constrainRegToBank(MRI, B, Reg&: Op1, Bank: DstBank); |
2086 | |
2087 | Register Select = B.buildSelect(Res: EltTy, Tst: Cmp, Op0, Op1).getReg(Idx: 0); |
2088 | MRI.setRegBank(Reg: Select, RegBank: DstBank); |
2089 | |
2090 | Ops[I * NumLanes + L] = Select; |
2091 | } |
2092 | } |
2093 | |
2094 | LLT MergeTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarTy: EltTy); |
2095 | if (MergeTy == MRI.getType(Reg: MI.getOperand(i: 0).getReg())) { |
2096 | B.buildBuildVector(Res: MI.getOperand(i: 0), Ops); |
2097 | } else { |
2098 | auto Vec = B.buildBuildVector(Res: MergeTy, Ops); |
2099 | MRI.setRegBank(Reg: Vec->getOperand(i: 0).getReg(), RegBank: DstBank); |
2100 | B.buildBitcast(Dst: MI.getOperand(i: 0).getReg(), Src: Vec); |
2101 | } |
2102 | |
2103 | MRI.setRegBank(Reg: MI.getOperand(i: 0).getReg(), RegBank: DstBank); |
2104 | MI.eraseFromParent(); |
2105 | |
2106 | return true; |
2107 | } |
2108 | |
2109 | // Break s_mul_u64 into 32-bit vector operations. |
2110 | void AMDGPURegisterBankInfo::applyMappingSMULU64( |
2111 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
2112 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2113 | SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1)); |
2114 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
2115 | |
2116 | // All inputs are SGPRs, nothing special to do. |
2117 | if (DefRegs.empty()) { |
2118 | assert(Src0Regs.empty() && Src1Regs.empty()); |
2119 | applyDefaultMapping(OpdMapper); |
2120 | return; |
2121 | } |
2122 | |
2123 | assert(DefRegs.size() == 2); |
2124 | assert(Src0Regs.size() == Src1Regs.size() && |
2125 | (Src0Regs.empty() || Src0Regs.size() == 2)); |
2126 | |
2127 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
2128 | MachineInstr &MI = OpdMapper.getMI(); |
2129 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2130 | LLT HalfTy = LLT::scalar(SizeInBits: 32); |
2131 | |
2132 | // Depending on where the source registers came from, the generic code may |
2133 | // have decided to split the inputs already or not. If not, we still need to |
2134 | // extract the values. |
2135 | |
2136 | if (Src0Regs.empty()) |
2137 | split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg()); |
2138 | else |
2139 | setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy); |
2140 | |
2141 | if (Src1Regs.empty()) |
2142 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
2143 | else |
2144 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
2145 | |
2146 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
2147 | |
2148 | // The multiplication is done as follows: |
2149 | // |
2150 | // Op1H Op1L |
2151 | // * Op0H Op0L |
2152 | // -------------------- |
2153 | // Op1H*Op0L Op1L*Op0L |
2154 | // + Op1H*Op0H Op1L*Op0H |
2155 | // ----------------------------------------- |
2156 | // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L |
2157 | // |
2158 | // We drop Op1H*Op0H because the result of the multiplication is a 64-bit |
2159 | // value and that would overflow. |
2160 | // The low 32-bit value is Op1L*Op0L. |
2161 | // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from |
2162 | // Op1L*Op0L). |
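  //
  // For example (a sketch): with Op0 = 0x00000001'00000002 and
  // Op1 = 0x00000003'00000004, the low half is 2 * 4 = 8 and the high half is
  // umulh(2, 4) + 3 * 2 + 4 * 1 = 10, giving 0x0000000A'00000008.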
2163 | |
2164 | ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); |
2165 | |
2166 | Register Hi = B.buildUMulH(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[0]).getReg(Idx: 0); |
2167 | Register MulLoHi = B.buildMul(Dst: HalfTy, Src0: Src0Regs[0], Src1: Src1Regs[1]).getReg(Idx: 0); |
2168 | Register Add = B.buildAdd(Dst: HalfTy, Src0: Hi, Src1: MulLoHi).getReg(Idx: 0); |
2169 | Register MulHiLo = B.buildMul(Dst: HalfTy, Src0: Src0Regs[1], Src1: Src1Regs[0]).getReg(Idx: 0); |
2170 | B.buildAdd(Dst: DefRegs[1], Src0: Add, Src1: MulHiLo); |
2171 | B.buildMul(Dst: DefRegs[0], Src0: Src0Regs[0], Src1: Src1Regs[0]); |
2172 | |
2173 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
2174 | MI.eraseFromParent(); |
2175 | } |
2176 | |
2177 | void AMDGPURegisterBankInfo::applyMappingImpl( |
2178 | MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { |
2179 | MachineInstr &MI = OpdMapper.getMI(); |
2180 | B.setInstrAndDebugLoc(MI); |
2181 | unsigned Opc = MI.getOpcode(); |
2182 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
2183 | switch (Opc) { |
2184 | case AMDGPU::G_CONSTANT: |
2185 | case AMDGPU::G_IMPLICIT_DEF: { |
2186 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2187 | LLT DstTy = MRI.getType(Reg: DstReg); |
2188 | if (DstTy != LLT::scalar(SizeInBits: 1)) |
2189 | break; |
2190 | |
2191 | const RegisterBank *DstBank = |
2192 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2193 | if (DstBank == &AMDGPU::VCCRegBank) |
2194 | break; |
2195 | SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2196 | if (DefRegs.empty()) |
2197 | DefRegs.push_back(Elt: DstReg); |
2198 | |
2199 | B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator()); |
2200 | |
2201 | Register NewDstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32)); |
2202 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); |
2203 | |
2204 | MI.getOperand(i: 0).setReg(NewDstReg); |
2205 | if (Opc != AMDGPU::G_IMPLICIT_DEF) { |
2206 | uint64_t ConstVal = MI.getOperand(i: 1).getCImm()->getZExtValue(); |
2207 | MI.getOperand(i: 1).setCImm( |
2208 | ConstantInt::get(Ty: IntegerType::getInt32Ty(C&: Ctx), V: ConstVal)); |
2209 | } |
2210 | |
2211 | MRI.setRegBank(Reg: NewDstReg, RegBank: *DstBank); |
2212 | B.buildTrunc(Res: DefRegs[0], Op: NewDstReg); |
2213 | return; |
2214 | } |
2215 | case AMDGPU::G_PHI: { |
2216 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2217 | LLT DstTy = MRI.getType(Reg: DstReg); |
2218 | if (DstTy != LLT::scalar(SizeInBits: 1)) |
2219 | break; |
2220 | |
2221 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2222 | const RegisterBank *DstBank = |
2223 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2224 | if (DstBank == &AMDGPU::VCCRegBank) { |
2225 | applyDefaultMapping(OpdMapper); |
2226 | // The standard handling only considers the result register bank for |
2227 | // phis. For VCC, blindly inserting a copy when the phi is lowered will |
2228 | // produce an invalid copy. We can only copy with some kind of compare to |
2229 | // get a vector boolean result. Insert a register bank copy that will be |
2230 | // correctly lowered to a compare. |
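      //
      // For example (a sketch): an incoming %src:sgpr(s1) value gets
      //   %copy:vcc(s1) = COPY %src:sgpr(s1)
      // at the end of its predecessor block, and that copy is later lowered
      // to a compare that produces a lane mask.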
2231 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
2232 | Register SrcReg = MI.getOperand(i: I).getReg(); |
2233 | const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); |
2234 | |
2235 | if (SrcBank != &AMDGPU::VCCRegBank) { |
2236 | MachineBasicBlock *SrcMBB = MI.getOperand(i: I + 1).getMBB(); |
2237 | B.setInsertPt(MBB&: *SrcMBB, II: SrcMBB->getFirstTerminator()); |
2238 | |
2239 | auto Copy = B.buildCopy(Res: LLT::scalar(SizeInBits: 1), Op: SrcReg); |
2240 | MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); |
2241 | MI.getOperand(i: I).setReg(Copy.getReg(Idx: 0)); |
2242 | } |
2243 | } |
2244 | |
2245 | return; |
2246 | } |
2247 | |
2248 | // Phi handling is strange and only considers the bank of the destination. |
2249 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 0); |
2250 | |
2251 | // Promote SGPR/VGPR booleans to s32 |
2252 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
2253 | B.setInsertPt(MBB&: B.getMBB(), II: MI); |
2254 | LegalizerHelper Helper(B.getMF(), ApplyBank, B); |
2255 | |
2256 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized) |
2257 | llvm_unreachable("widen scalar should have succeeded" ); |
2258 | |
2259 | return; |
2260 | } |
2261 | case AMDGPU::G_FCMP: |
2262 | if (!Subtarget.hasSALUFloatInsts()) |
2263 | break; |
    [[fallthrough]];
2265 | case AMDGPU::G_ICMP: |
2266 | case AMDGPU::G_UADDO: |
2267 | case AMDGPU::G_USUBO: |
2268 | case AMDGPU::G_UADDE: |
2269 | case AMDGPU::G_SADDE: |
2270 | case AMDGPU::G_USUBE: |
2271 | case AMDGPU::G_SSUBE: { |
2272 | unsigned BoolDstOp = |
2273 | (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1; |
2274 | Register DstReg = MI.getOperand(i: BoolDstOp).getReg(); |
2275 | |
2276 | const RegisterBank *DstBank = |
2277 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2278 | if (DstBank != &AMDGPU::SGPRRegBank) |
2279 | break; |
2280 | |
2281 | const bool HasCarryIn = MI.getNumOperands() == 5; |
2282 | |
2283 | // If this is a scalar compare, promote the result to s32, as the selection |
2284 | // will end up using a copy to a 32-bit vreg. |
2285 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2286 | Register NewDstReg = MRI.createGenericVirtualRegister(Ty: S32); |
2287 | MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); |
2288 | MI.getOperand(i: BoolDstOp).setReg(NewDstReg); |
2289 | |
2290 | if (HasCarryIn) { |
2291 | Register NewSrcReg = MRI.createGenericVirtualRegister(Ty: S32); |
2292 | MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); |
2293 | B.buildZExt(Res: NewSrcReg, Op: MI.getOperand(i: 4).getReg()); |
2294 | MI.getOperand(i: 4).setReg(NewSrcReg); |
2295 | } |
2296 | |
2297 | MachineBasicBlock *MBB = MI.getParent(); |
2298 | B.setInsertPt(MBB&: *MBB, II: std::next(x: MI.getIterator())); |
2299 | |
2300 | // If we had a constrained VCC result register, a copy was inserted to VCC |
2301 | // from SGPR. |
2302 | SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2303 | if (DefRegs.empty()) |
2304 | DefRegs.push_back(Elt: DstReg); |
2305 | B.buildTrunc(Res: DefRegs[0], Op: NewDstReg); |
2306 | return; |
2307 | } |
2308 | case AMDGPU::G_SELECT: { |
2309 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2310 | LLT DstTy = MRI.getType(Reg: DstReg); |
2311 | |
2312 | SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(OpIdx: 1)); |
2313 | if (CondRegs.empty()) |
2314 | CondRegs.push_back(Elt: MI.getOperand(i: 1).getReg()); |
2315 | else { |
2316 | assert(CondRegs.size() == 1); |
2317 | } |
2318 | |
2319 | const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); |
2320 | if (CondBank == &AMDGPU::SGPRRegBank) { |
2321 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2322 | Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32); |
2323 | MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); |
2324 | |
2325 | MI.getOperand(i: 1).setReg(NewCondReg); |
2326 | B.buildZExt(Res: NewCondReg, Op: CondRegs[0]); |
2327 | } |
2328 | |
2329 | if (DstTy.getSizeInBits() != 64) |
2330 | break; |
2331 | |
2332 | LLT HalfTy = getHalfSizedType(Ty: DstTy); |
2333 | |
2334 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2335 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
2336 | SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(OpIdx: 3)); |
2337 | |
2338 | // All inputs are SGPRs, nothing special to do. |
2339 | if (DefRegs.empty()) { |
2340 | assert(Src1Regs.empty() && Src2Regs.empty()); |
2341 | break; |
2342 | } |
2343 | |
2344 | if (Src1Regs.empty()) |
2345 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
2346 | else { |
2347 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
2348 | } |
2349 | |
2350 | if (Src2Regs.empty()) |
2351 | split64BitValueForMapping(B, Regs&: Src2Regs, HalfTy, Reg: MI.getOperand(i: 3).getReg()); |
2352 | else |
2353 | setRegsToType(MRI, Regs: Src2Regs, NewTy: HalfTy); |
2354 | |
2355 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
2356 | |
2357 | B.buildSelect(Res: DefRegs[0], Tst: CondRegs[0], Op0: Src1Regs[0], Op1: Src2Regs[0]); |
2358 | B.buildSelect(Res: DefRegs[1], Tst: CondRegs[0], Op0: Src1Regs[1], Op1: Src2Regs[1]); |
2359 | |
2360 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
2361 | MI.eraseFromParent(); |
2362 | return; |
2363 | } |
2364 | case AMDGPU::G_BRCOND: { |
2365 | Register CondReg = MI.getOperand(i: 0).getReg(); |
2366 | // FIXME: Should use legalizer helper, but should change bool ext type. |
2367 | const RegisterBank *CondBank = |
2368 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2369 | |
2370 | if (CondBank == &AMDGPU::SGPRRegBank) { |
2371 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2372 | Register NewCondReg = MRI.createGenericVirtualRegister(Ty: S32); |
2373 | MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); |
2374 | |
2375 | MI.getOperand(i: 0).setReg(NewCondReg); |
2376 | B.buildZExt(Res: NewCondReg, Op: CondReg); |
2377 | return; |
2378 | } |
2379 | |
2380 | break; |
2381 | } |
2382 | case AMDGPU::G_AND: |
2383 | case AMDGPU::G_OR: |
2384 | case AMDGPU::G_XOR: { |
2385 | // 64-bit and is only available on the SALU, so split into 2 32-bit ops if |
2386 | // there is a VGPR input. |
2387 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2388 | LLT DstTy = MRI.getType(Reg: DstReg); |
2389 | |
2390 | if (DstTy.getSizeInBits() == 1) { |
2391 | const RegisterBank *DstBank = |
2392 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2393 | if (DstBank == &AMDGPU::VCCRegBank) |
2394 | break; |
2395 | |
2396 | MachineFunction *MF = MI.getParent()->getParent(); |
2397 | ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); |
2398 | LegalizerHelper Helper(*MF, ApplyBank, B); |
2399 | |
2400 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: LLT::scalar(SizeInBits: 32)) != |
2401 | LegalizerHelper::Legalized) |
2402 | llvm_unreachable("widen scalar should have succeeded" ); |
2403 | return; |
2404 | } |
2405 | |
2406 | if (DstTy.getSizeInBits() != 64) |
2407 | break; |
2408 | |
2409 | LLT HalfTy = getHalfSizedType(Ty: DstTy); |
2410 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2411 | SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(OpIdx: 1)); |
2412 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(OpIdx: 2)); |
2413 | |
2414 | // All inputs are SGPRs, nothing special to do. |
2415 | if (DefRegs.empty()) { |
2416 | assert(Src0Regs.empty() && Src1Regs.empty()); |
2417 | break; |
2418 | } |
2419 | |
2420 | assert(DefRegs.size() == 2); |
2421 | assert(Src0Regs.size() == Src1Regs.size() && |
2422 | (Src0Regs.empty() || Src0Regs.size() == 2)); |
2423 | |
2424 | // Depending on where the source registers came from, the generic code may |
2425 | // have decided to split the inputs already or not. If not, we still need to |
2426 | // extract the values. |
2427 | |
2428 | if (Src0Regs.empty()) |
2429 | split64BitValueForMapping(B, Regs&: Src0Regs, HalfTy, Reg: MI.getOperand(i: 1).getReg()); |
2430 | else |
2431 | setRegsToType(MRI, Regs: Src0Regs, NewTy: HalfTy); |
2432 | |
2433 | if (Src1Regs.empty()) |
2434 | split64BitValueForMapping(B, Regs&: Src1Regs, HalfTy, Reg: MI.getOperand(i: 2).getReg()); |
2435 | else |
2436 | setRegsToType(MRI, Regs: Src1Regs, NewTy: HalfTy); |
2437 | |
2438 | setRegsToType(MRI, Regs: DefRegs, NewTy: HalfTy); |
2439 | |
2440 | B.buildInstr(Opc, DstOps: {DefRegs[0]}, SrcOps: {Src0Regs[0], Src1Regs[0]}); |
2441 | B.buildInstr(Opc, DstOps: {DefRegs[1]}, SrcOps: {Src0Regs[1], Src1Regs[1]}); |
2442 | |
2443 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
2444 | MI.eraseFromParent(); |
2445 | return; |
2446 | } |
2447 | case AMDGPU::G_ABS: { |
2448 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2449 | const RegisterBank *SrcBank = MRI.getRegBankOrNull(Reg: SrcReg); |
2450 | |
2451 | // There is no VALU abs instruction so we need to replace it with a sub and |
2452 | // max combination. |
2453 | if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { |
2454 | MachineFunction *MF = MI.getParent()->getParent(); |
2455 | ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank); |
2456 | LegalizerHelper Helper(*MF, Apply, B); |
2457 | |
2458 | if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) |
2459 | llvm_unreachable("lowerAbsToMaxNeg should have succeeded" ); |
2460 | return; |
2461 | } |
2462 | [[fallthrough]]; |
2463 | } |
2464 | case AMDGPU::G_ADD: |
2465 | case AMDGPU::G_SUB: |
2466 | case AMDGPU::G_MUL: |
2467 | case AMDGPU::G_SHL: |
2468 | case AMDGPU::G_LSHR: |
2469 | case AMDGPU::G_ASHR: |
2470 | case AMDGPU::G_SMIN: |
2471 | case AMDGPU::G_SMAX: |
2472 | case AMDGPU::G_UMIN: |
2473 | case AMDGPU::G_UMAX: { |
2474 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2475 | LLT DstTy = MRI.getType(Reg: DstReg); |
2476 | |
    // Special case for s_mul_u64. There is no vector equivalent of s_mul_u64.
    // Hence, we have to break down s_mul_u64 into 32-bit vector
    // multiplications.
2480 | if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { |
2481 | applyMappingSMULU64(B, OpdMapper); |
2482 | return; |
2483 | } |
2484 | |
2485 | // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. |
2486 | // Packed 16-bit operations need to be scalarized and promoted. |
2487 | if (DstTy != LLT::scalar(SizeInBits: 16) && DstTy != LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16)) |
2488 | break; |
2489 | |
2490 | const RegisterBank *DstBank = |
2491 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2492 | if (DstBank == &AMDGPU::VGPRRegBank) |
2493 | break; |
2494 | |
2495 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2496 | MachineBasicBlock *MBB = MI.getParent(); |
2497 | MachineFunction *MF = MBB->getParent(); |
2498 | ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank); |
2499 | |
2500 | if (DstTy.isVector() && Opc == AMDGPU::G_ABS) { |
2501 | Register WideSrcLo, WideSrcHi; |
2502 | |
2503 | std::tie(args&: WideSrcLo, args&: WideSrcHi) = |
2504 | unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: TargetOpcode::G_SEXT); |
2505 | auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo}); |
2506 | auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi}); |
2507 | B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(0), Hi.getReg(0)}); |
2508 | MI.eraseFromParent(); |
2509 | return; |
2510 | } |
2511 | |
2512 | if (DstTy.isVector()) { |
2513 | Register WideSrc0Lo, WideSrc0Hi; |
2514 | Register WideSrc1Lo, WideSrc1Hi; |
2515 | |
2516 | unsigned ExtendOp = getExtendOp(Opc: MI.getOpcode()); |
2517 | std::tie(args&: WideSrc0Lo, args&: WideSrc0Hi) |
2518 | = unpackV2S16ToS32(B, Src: MI.getOperand(i: 1).getReg(), ExtOpcode: ExtendOp); |
2519 | std::tie(args&: WideSrc1Lo, args&: WideSrc1Hi) |
2520 | = unpackV2S16ToS32(B, Src: MI.getOperand(i: 2).getReg(), ExtOpcode: ExtendOp); |
2521 | auto Lo = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Lo, WideSrc1Lo}); |
2522 | auto Hi = B.buildInstr(Opc: MI.getOpcode(), DstOps: {S32}, SrcOps: {WideSrc0Hi, WideSrc1Hi}); |
2523 | B.buildBuildVectorTrunc(Res: DstReg, Ops: {Lo.getReg(Idx: 0), Hi.getReg(Idx: 0)}); |
2524 | MI.eraseFromParent(); |
2525 | } else { |
2526 | LegalizerHelper Helper(*MF, ApplySALU, B); |
2527 | |
2528 | if (Helper.widenScalar(MI, TypeIdx: 0, WideTy: S32) != LegalizerHelper::Legalized) |
2529 | llvm_unreachable("widen scalar should have succeeded" ); |
2530 | |
2531 | // FIXME: s16 shift amounts should be legal. |
2532 | if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || |
2533 | Opc == AMDGPU::G_ASHR) { |
2534 | B.setInsertPt(MBB&: *MBB, II: MI.getIterator()); |
2535 | if (Helper.widenScalar(MI, TypeIdx: 1, WideTy: S32) != LegalizerHelper::Legalized) |
2536 | llvm_unreachable("widen scalar should have succeeded" ); |
2537 | } |
2538 | } |
2539 | |
2540 | return; |
2541 | } |
2542 | case AMDGPU::G_AMDGPU_S_MUL_I64_I32: |
2543 | case AMDGPU::G_AMDGPU_S_MUL_U64_U32: { |
2544 | // This is a special case for s_mul_u64. We use |
2545 | // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation |
2546 | // where the 33 higher bits are sign-extended and |
2547 | // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation |
2548 | // where the 32 higher bits are zero-extended. In case scalar registers are |
2549 | // selected, both opcodes are lowered as s_mul_u64. If the vector registers |
2550 | // are selected, then G_AMDGPU_S_MUL_I64_I32 and |
2551 | // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction. |
2552 | |
2553 | // Insert basic copies. |
2554 | applyDefaultMapping(OpdMapper); |
2555 | |
2556 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2557 | Register SrcReg0 = MI.getOperand(i: 1).getReg(); |
2558 | Register SrcReg1 = MI.getOperand(i: 2).getReg(); |
2559 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2560 | const LLT S64 = LLT::scalar(SizeInBits: 64); |
    assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
                                         "that handles only 64-bit operands.");
2563 | const RegisterBank *DstBank = |
2564 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2565 | |
2566 | // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 |
2567 | // with s_mul_u64 operation. |
2568 | if (DstBank == &AMDGPU::SGPRRegBank) { |
2569 | MI.setDesc(TII->get(AMDGPU::S_MUL_U64)); |
2570 | MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass); |
2571 | MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass); |
2572 | MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass); |
2573 | return; |
2574 | } |
2575 | |
2576 | // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 |
2577 | // with a vector mad. |
2578 | assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank && |
2579 | "The destination operand should be in vector registers." ); |
2580 | |
2581 | DebugLoc DL = MI.getDebugLoc(); |
2582 | |
2583 | // Extract the lower subregister from the first operand. |
2584 | Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
2585 | MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass); |
2586 | MRI.setType(VReg: Op0L, Ty: S32); |
2587 | B.buildTrunc(Res: Op0L, Op: SrcReg0); |
2588 | |
2589 | // Extract the lower subregister from the second operand. |
2590 | Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
2591 | MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass); |
2592 | MRI.setType(VReg: Op1L, Ty: S32); |
2593 | B.buildTrunc(Res: Op1L, Op: SrcReg1); |
2594 | |
2595 | unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32 |
2596 | ? AMDGPU::G_AMDGPU_MAD_U64_U32 |
2597 | : AMDGPU::G_AMDGPU_MAD_I64_I32; |
2598 | |
2600 | Register Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0); |
2601 | MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass); |
2602 | Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); |
2603 | MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass); |
2604 | B.buildInstr(Opc: NewOpc, DstOps: {DstReg, CarryOut}, SrcOps: {Op0L, Op1L, Zero64}); |
2605 | MI.eraseFromParent(); |
2606 | return; |
2607 | } |
2608 | case AMDGPU::G_SEXT_INREG: { |
2609 | SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1)); |
2610 | if (SrcRegs.empty()) |
2611 | break; // Nothing to repair |
2612 | |
2613 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2614 | ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); |
2615 | |
2616 | // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs |
2617 | // we would need to further expand, and doesn't let us directly set the |
2618 | // result registers. |
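    //
    // For example (a sketch): G_SEXT_INREG %x(s64), 16 on VGPRs becomes
    //   lo = G_SEXT_INREG (G_FREEZE x.lo), 16
    //   hi = G_ASHR lo, 31
    // while an amount > 32 keeps the low half unchanged and sign-extends
    // within the high half only.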
2619 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2620 | |
2621 | int Amt = MI.getOperand(i: 2).getImm(); |
2622 | if (Amt <= 32) { |
2623 | // Downstream users have expectations for the high bit behavior, so freeze |
2624 | // incoming undefined bits. |
2625 | if (Amt == 32) { |
2626 | // The low bits are unchanged. |
2627 | B.buildFreeze(Dst: DstRegs[0], Src: SrcRegs[0]); |
2628 | } else { |
2629 | auto Freeze = B.buildFreeze(Dst: S32, Src: SrcRegs[0]); |
2630 | // Extend in the low bits and propagate the sign bit to the high half. |
2631 | B.buildSExtInReg(Res: DstRegs[0], Op: Freeze, ImmOp: Amt); |
2632 | } |
2633 | |
2634 | B.buildAShr(Dst: DstRegs[1], Src0: DstRegs[0], Src1: B.buildConstant(Res: S32, Val: 31)); |
2635 | } else { |
2636 | // The low bits are unchanged, and extend in the high bits. |
2637 | // No freeze required |
2638 | B.buildCopy(Res: DstRegs[0], Op: SrcRegs[0]); |
2639 | B.buildSExtInReg(Res: DstRegs[1], Op: DstRegs[0], ImmOp: Amt - 32); |
2640 | } |
2641 | |
2642 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2643 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
2644 | MI.eraseFromParent(); |
2645 | return; |
2646 | } |
2647 | case AMDGPU::G_CTPOP: |
2648 | case AMDGPU::G_BITREVERSE: { |
2649 | const RegisterBank *DstBank = |
2650 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2651 | if (DstBank == &AMDGPU::SGPRRegBank) |
2652 | break; |
2653 | |
2654 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2655 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2656 | LLT Ty = MRI.getType(Reg: SrcReg); |
2657 | if (Ty == S32) |
2658 | break; |
2659 | |
2660 | ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); |
2661 | |
2662 | MachineFunction &MF = B.getMF(); |
2663 | LegalizerHelper Helper(MF, ApplyVALU, B); |
2664 | |
if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
  llvm_unreachable("narrowScalar should have succeeded");
2667 | return; |
2668 | } |
2669 | case AMDGPU::G_AMDGPU_FFBH_U32: |
2670 | case AMDGPU::G_AMDGPU_FFBL_B32: |
2671 | case AMDGPU::G_CTLZ_ZERO_UNDEF: |
2672 | case AMDGPU::G_CTTZ_ZERO_UNDEF: { |
2673 | const RegisterBank *DstBank = |
2674 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2675 | if (DstBank == &AMDGPU::SGPRRegBank) |
2676 | break; |
2677 | |
2678 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2679 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2680 | LLT Ty = MRI.getType(Reg: SrcReg); |
2681 | if (Ty == S32) |
2682 | break; |
2683 | |
2684 | // We can narrow this more efficiently than Helper can by using ffbh/ffbl |
2685 | // which return -1 when the input is zero: |
2686 | // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) |
2687 | // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) |
2688 | // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) |
// (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
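// As a quick check with a made-up value: for hi:lo = 0x00000000:0x00010000,
// ffbh(hi) = ffbh(0) = -1 (0xffffffff) and ffbh(lo) = 15, so
// ctlz_zero_undef = umin(0xffffffff, 15 + 32) = 47, matching the 47 leading
// zeros of the full 64-bit value.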
2690 | ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); |
2691 | SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(OpIdx: 1)); |
2692 | unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF |
2693 | ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 |
2694 | : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF |
2695 | ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32 |
2696 | : Opc; |
2697 | unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; |
2698 | auto X = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx]}); |
2699 | auto Y = B.buildInstr(Opc: NewOpc, DstOps: {S32}, SrcOps: {SrcRegs[Idx ^ 1]}); |
2700 | unsigned AddOpc = |
2701 | Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF |
2702 | ? AMDGPU::G_ADD |
2703 | : AMDGPU::G_UADDSAT; |
2704 | Y = B.buildInstr(Opc: AddOpc, DstOps: {S32}, SrcOps: {Y, B.buildConstant(Res: S32, Val: 32)}); |
2705 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2706 | B.buildUMin(Dst: DstReg, Src0: X, Src1: Y); |
2707 | MI.eraseFromParent(); |
2708 | return; |
2709 | } |
2710 | case AMDGPU::G_SEXT: |
2711 | case AMDGPU::G_ZEXT: |
2712 | case AMDGPU::G_ANYEXT: { |
2713 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2714 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
2715 | const bool Signed = Opc == AMDGPU::G_SEXT; |
2716 | |
2717 | assert(OpdMapper.getVRegs(1).empty()); |
2718 | |
2719 | const RegisterBank *SrcBank = |
2720 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
2721 | |
2722 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2723 | LLT DstTy = MRI.getType(Reg: DstReg); |
2724 | if (DstTy.isScalar() && |
2725 | SrcBank != &AMDGPU::SGPRRegBank && |
2726 | SrcBank != &AMDGPU::VCCRegBank && |
// FIXME: Should handle any type that rounds to s64 when irregular
// breakdowns are supported.
2729 | DstTy.getSizeInBits() == 64 && |
2730 | SrcTy.getSizeInBits() <= 32) { |
2731 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2732 | |
2733 | // Extend to 32-bit, and then extend the low half. |
2734 | if (Signed) { |
2735 | // TODO: Should really be buildSExtOrCopy |
2736 | B.buildSExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
2737 | } else if (Opc == AMDGPU::G_ZEXT) { |
2738 | B.buildZExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
2739 | } else { |
2740 | B.buildAnyExtOrTrunc(Res: DefRegs[0], Op: SrcReg); |
2741 | } |
2742 | |
2743 | extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank); |
2744 | MRI.setRegBank(Reg: DstReg, RegBank: *SrcBank); |
2745 | MI.eraseFromParent(); |
2746 | return; |
2747 | } |
2748 | |
2749 | if (SrcTy != LLT::scalar(SizeInBits: 1)) |
2750 | return; |
2751 | |
// It is not legal for a legalization artifact to have a VCC source. Rather
// than introducing a copy, directly insert the select that such a copy
// would have been lowered to during instruction selection.
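// For example (illustrative MIR, register names made up):
//   %d:_(s32) = G_ZEXT %c:vcc(s1)
// becomes
//   %d:vgpr(s32) = G_SELECT %c:vcc(s1), 1, 0
// with -1 as the true value for G_SEXT instead.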
2755 | if (SrcBank == &AMDGPU::VCCRegBank) { |
2756 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2757 | |
2758 | const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; |
2759 | |
2760 | unsigned DstSize = DstTy.getSizeInBits(); |
2761 | // 64-bit select is SGPR only |
2762 | const bool UseSel64 = DstSize > 32 && |
2763 | SrcBank->getID() == AMDGPU::SGPRRegBankID; |
2764 | |
2765 | // TODO: Should s16 select be legal? |
2766 | LLT SelType = UseSel64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32); |
2767 | auto True = B.buildConstant(Res: SelType, Val: Signed ? -1 : 1); |
2768 | auto False = B.buildConstant(Res: SelType, Val: 0); |
2769 | |
2770 | MRI.setRegBank(Reg: True.getReg(Idx: 0), RegBank: *DstBank); |
2771 | MRI.setRegBank(Reg: False.getReg(Idx: 0), RegBank: *DstBank); |
2772 | MRI.setRegBank(Reg: DstReg, RegBank: *DstBank); |
2773 | |
2774 | if (DstSize > 32) { |
2775 | B.buildSelect(Res: DefRegs[0], Tst: SrcReg, Op0: True, Op1: False); |
2776 | extendLow32IntoHigh32(B, Hi32Reg: DefRegs[1], Lo32Reg: DefRegs[0], ExtOpc: Opc, RegBank: *SrcBank, IsBooleanSrc: true); |
2777 | } else if (DstSize < 32) { |
2778 | auto Sel = B.buildSelect(Res: SelType, Tst: SrcReg, Op0: True, Op1: False); |
2779 | MRI.setRegBank(Reg: Sel.getReg(Idx: 0), RegBank: *DstBank); |
2780 | B.buildTrunc(Res: DstReg, Op: Sel); |
2781 | } else { |
2782 | B.buildSelect(Res: DstReg, Tst: SrcReg, Op0: True, Op1: False); |
2783 | } |
2784 | |
2785 | MI.eraseFromParent(); |
2786 | return; |
2787 | } |
2788 | |
2789 | break; |
2790 | } |
2791 | case AMDGPU::G_EXTRACT_VECTOR_ELT: { |
2792 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(OpIdx: 0)); |
2793 | |
2794 | assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); |
2795 | |
2796 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2797 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2798 | |
2799 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2800 | LLT DstTy = MRI.getType(Reg: DstReg); |
2801 | LLT SrcTy = MRI.getType(Reg: SrcReg); |
2802 | |
2803 | if (foldExtractEltToCmpSelect(B, MI, OpdMapper)) |
2804 | return; |
2805 | |
2806 | const ValueMapping &DstMapping |
2807 | = OpdMapper.getInstrMapping().getOperandMapping(i: 0); |
2808 | const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; |
2809 | const RegisterBank *SrcBank = |
2810 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
2811 | const RegisterBank *IdxBank = |
2812 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
2813 | |
2814 | Register BaseIdxReg; |
2815 | unsigned ConstOffset; |
2816 | std::tie(args&: BaseIdxReg, args&: ConstOffset) = |
2817 | AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 2).getReg()); |
2818 | |
// See if the index is an add of a constant, which would be foldable by
// moving only the base register of the index into the waterfall loop, if
// one is needed. This essentially reassociates the add of a constant with
// the readfirstlane.
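// For example (hypothetical index), extracting element (%idx + 2) only
// requires a readfirstlane of %idx; the "+ 2" is re-added to the uniform
// value inside the loop.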
2823 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && |
2824 | ConstOffset > 0 && |
2825 | ConstOffset < SrcTy.getNumElements(); |
2826 | |
2827 | // Move the base register. We'll re-insert the add later. |
2828 | if (ShouldMoveIndexIntoLoop) |
2829 | MI.getOperand(i: 2).setReg(BaseIdxReg); |
2830 | |
2831 | // If this is a VGPR result only because the index was a VGPR result, the |
2832 | // actual indexing will be done on the SGPR source vector, which will |
2833 | // produce a scalar result. We need to copy to the VGPR result inside the |
2834 | // waterfall loop. |
2835 | const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && |
2836 | SrcBank == &AMDGPU::SGPRRegBank; |
2837 | if (DstRegs.empty()) { |
2838 | applyDefaultMapping(OpdMapper); |
2839 | |
2840 | executeInWaterfallLoop(B, MI, OpIndices: {2}); |
2841 | |
2842 | if (NeedCopyToVGPR) { |
2843 | // We don't want a phi for this temporary reg. |
2844 | Register TmpReg = MRI.createGenericVirtualRegister(Ty: DstTy); |
2845 | MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); |
2846 | MI.getOperand(i: 0).setReg(TmpReg); |
2847 | B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator()); |
2848 | |
2849 | // Use a v_mov_b32 here to make the exec dependency explicit. |
2850 | buildVCopy(B, DstReg, SrcReg: TmpReg); |
2851 | } |
2852 | |
2853 | // Re-insert the constant offset add inside the waterfall loop. |
2854 | if (ShouldMoveIndexIntoLoop) |
2855 | reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 2, ConstOffset); |
2856 | |
2857 | return; |
2858 | } |
2859 | |
2860 | assert(DstTy.getSizeInBits() == 64); |
2861 | |
2862 | LLT Vec32 = LLT::fixed_vector(NumElements: 2 * SrcTy.getNumElements(), ScalarSizeInBits: 32); |
2863 | |
2864 | auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg); |
2865 | auto One = B.buildConstant(Res: S32, Val: 1); |
2866 | |
2867 | MachineBasicBlock::iterator MII = MI.getIterator(); |
2868 | |
2869 | // Split the vector index into 32-bit pieces. Prepare to move all of the |
2870 | // new instructions into a waterfall loop if necessary. |
2871 | // |
2872 | // Don't put the bitcast or constant in the loop. |
2873 | MachineInstrSpan Span(MII, &B.getMBB()); |
2874 | |
2875 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). |
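// E.g. for element 3 of a <4 x s64> viewed as <8 x s32>, the pieces are
// 32-bit elements 6 and 7 (illustrative values).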
2876 | auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One); |
2877 | auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One); |
2878 | |
auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2881 | |
2882 | MRI.setRegBank(Reg: DstReg, RegBank: *DstBank); |
2883 | MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank); |
2884 | MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); |
2885 | MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); |
2886 | MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); |
2887 | |
2888 | SmallSet<Register, 4> OpsToWaterfall; |
2889 | if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { |
2890 | MI.eraseFromParent(); |
2891 | return; |
2892 | } |
2893 | |
2894 | // Remove the original instruction to avoid potentially confusing the |
2895 | // waterfall loop logic. |
2896 | B.setInstr(*Span.begin()); |
2897 | MI.eraseFromParent(); |
2898 | executeInWaterfallLoop(B, make_range(x: Span.begin(), y: Span.end()), |
2899 | OpsToWaterfall); |
2900 | |
2901 | if (NeedCopyToVGPR) { |
2902 | MachineBasicBlock *LoopBB = Extract1->getParent(); |
2903 | Register TmpReg0 = MRI.createGenericVirtualRegister(Ty: S32); |
2904 | Register TmpReg1 = MRI.createGenericVirtualRegister(Ty: S32); |
2905 | MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); |
2906 | MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); |
2907 | |
2908 | Extract0->getOperand(i: 0).setReg(TmpReg0); |
2909 | Extract1->getOperand(i: 0).setReg(TmpReg1); |
2910 | |
2911 | B.setInsertPt(MBB&: *LoopBB, II: ++Extract1->getIterator()); |
2912 | |
2913 | buildVCopy(B, DstReg: DstRegs[0], SrcReg: TmpReg0); |
2914 | buildVCopy(B, DstReg: DstRegs[1], SrcReg: TmpReg1); |
2915 | } |
2916 | |
2917 | if (ShouldMoveIndexIntoLoop) |
2918 | reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset); |
2919 | |
2920 | return; |
2921 | } |
2922 | case AMDGPU::G_INSERT_VECTOR_ELT: { |
2923 | SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(OpIdx: 2)); |
2924 | |
2925 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2926 | LLT VecTy = MRI.getType(Reg: DstReg); |
2927 | |
2928 | assert(OpdMapper.getVRegs(0).empty()); |
2929 | assert(OpdMapper.getVRegs(3).empty()); |
2930 | |
2931 | if (substituteSimpleCopyRegs(OpdMapper, OpIdx: 1)) |
2932 | MRI.setType(VReg: MI.getOperand(i: 1).getReg(), Ty: VecTy); |
2933 | |
2934 | if (foldInsertEltToCmpSelect(B, MI, OpdMapper)) |
2935 | return; |
2936 | |
2937 | const RegisterBank *IdxBank = |
2938 | OpdMapper.getInstrMapping().getOperandMapping(i: 3).BreakDown[0].RegBank; |
2939 | |
2940 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
2941 | Register InsReg = MI.getOperand(i: 2).getReg(); |
2942 | LLT InsTy = MRI.getType(Reg: InsReg); |
2943 | (void)InsTy; |
2944 | |
2945 | Register BaseIdxReg; |
2946 | unsigned ConstOffset; |
2947 | std::tie(args&: BaseIdxReg, args&: ConstOffset) = |
2948 | AMDGPU::getBaseWithConstantOffset(MRI, Reg: MI.getOperand(i: 3).getReg()); |
2949 | |
// See if the index is an add of a constant, which would be foldable by
// moving only the base register of the index into the waterfall loop, if
// one is needed. This essentially reassociates the add of a constant with
// the readfirstlane.
2954 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && |
2955 | ConstOffset > 0 && |
2956 | ConstOffset < VecTy.getNumElements(); |
2957 | |
2958 | // Move the base register. We'll re-insert the add later. |
2959 | if (ShouldMoveIndexIntoLoop) |
2960 | MI.getOperand(i: 3).setReg(BaseIdxReg); |
2961 | |
2963 | if (InsRegs.empty()) { |
2964 | executeInWaterfallLoop(B, MI, OpIndices: {3}); |
2965 | |
2966 | // Re-insert the constant offset add inside the waterfall loop. |
2967 | if (ShouldMoveIndexIntoLoop) { |
2968 | reinsertVectorIndexAdd(B, IdxUseInstr&: MI, OpIdx: 3, ConstOffset); |
2969 | } |
2970 | |
2971 | return; |
2972 | } |
2973 | |
2974 | assert(InsTy.getSizeInBits() == 64); |
2975 | |
2976 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2977 | LLT Vec32 = LLT::fixed_vector(NumElements: 2 * VecTy.getNumElements(), ScalarSizeInBits: 32); |
2978 | |
2979 | auto CastSrc = B.buildBitcast(Dst: Vec32, Src: SrcReg); |
2980 | auto One = B.buildConstant(Res: S32, Val: 1); |
2981 | |
2982 | // Split the vector index into 32-bit pieces. Prepare to move all of the |
2983 | // new instructions into a waterfall loop if necessary. |
2984 | // |
2985 | // Don't put the bitcast or constant in the loop. |
2986 | MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); |
2987 | |
2988 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). |
2989 | auto IdxLo = B.buildShl(Dst: S32, Src0: BaseIdxReg, Src1: One); |
2990 | auto IdxHi = B.buildAdd(Dst: S32, Src0: IdxLo, Src1: One); |
2991 | |
2992 | auto InsLo = B.buildInsertVectorElement(Res: Vec32, Val: CastSrc, Elt: InsRegs[0], Idx: IdxLo); |
2993 | auto InsHi = B.buildInsertVectorElement(Res: Vec32, Val: InsLo, Elt: InsRegs[1], Idx: IdxHi); |
2994 | |
2995 | const RegisterBank *DstBank = |
2996 | OpdMapper.getInstrMapping().getOperandMapping(i: 0).BreakDown[0].RegBank; |
2997 | const RegisterBank *SrcBank = |
2998 | OpdMapper.getInstrMapping().getOperandMapping(i: 1).BreakDown[0].RegBank; |
2999 | const RegisterBank *InsSrcBank = |
3000 | OpdMapper.getInstrMapping().getOperandMapping(i: 2).BreakDown[0].RegBank; |
3001 | |
3002 | MRI.setRegBank(Reg: InsReg, RegBank: *InsSrcBank); |
3003 | MRI.setRegBank(Reg: CastSrc.getReg(Idx: 0), RegBank: *SrcBank); |
3004 | MRI.setRegBank(Reg: InsLo.getReg(Idx: 0), RegBank: *DstBank); |
3005 | MRI.setRegBank(Reg: InsHi.getReg(Idx: 0), RegBank: *DstBank); |
3006 | MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); |
3007 | MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); |
3008 | MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); |
3009 | |
3011 | SmallSet<Register, 4> OpsToWaterfall; |
3012 | if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { |
3013 | B.setInsertPt(MBB&: B.getMBB(), II: MI); |
3014 | B.buildBitcast(Dst: DstReg, Src: InsHi); |
3015 | MI.eraseFromParent(); |
3016 | return; |
3017 | } |
3018 | |
3019 | B.setInstr(*Span.begin()); |
3020 | MI.eraseFromParent(); |
3021 | |
3022 | // Figure out the point after the waterfall loop before mangling the control |
3023 | // flow. |
3024 | executeInWaterfallLoop(B, make_range(x: Span.begin(), y: Span.end()), |
3025 | OpsToWaterfall); |
3026 | |
3027 | // The insertion point is now right after the original instruction. |
3028 | // |
// Keep the bitcast to the original vector type out of the loop. Doing this
// saves an extra phi we don't need inside the loop.
3031 | B.buildBitcast(Dst: DstReg, Src: InsHi); |
3032 | |
3033 | // Re-insert the constant offset add inside the waterfall loop. |
3034 | if (ShouldMoveIndexIntoLoop) |
3035 | reinsertVectorIndexAdd(B, IdxUseInstr&: *IdxLo, OpIdx: 1, ConstOffset); |
3036 | |
3037 | return; |
3038 | } |
3039 | case AMDGPU::G_AMDGPU_BUFFER_LOAD: |
3040 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: |
3041 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: |
3042 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: |
3043 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: |
3044 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: |
3045 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: |
3046 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: |
3047 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: |
3048 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: |
3049 | case AMDGPU::G_AMDGPU_BUFFER_STORE: |
3050 | case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: |
3051 | case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: |
3052 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: |
3053 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: |
3054 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: |
3055 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { |
3056 | applyDefaultMapping(OpdMapper); |
3057 | executeInWaterfallLoop(B, MI, OpIndices: {1, 4}); |
3058 | return; |
3059 | } |
3060 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: |
3061 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: |
3062 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: |
3063 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: |
3064 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: |
3065 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: |
3066 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: |
3067 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: |
3068 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: |
3069 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: |
3070 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: |
3071 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { |
3072 | applyDefaultMapping(OpdMapper); |
3073 | executeInWaterfallLoop(B, MI, OpIndices: {2, 5}); |
3074 | return; |
3075 | } |
3076 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: |
3077 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: |
3078 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: |
3079 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { |
3080 | applyDefaultMapping(OpdMapper); |
3081 | executeInWaterfallLoop(B, MI, OpIndices: {2, 5}); |
3082 | return; |
3083 | } |
3084 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { |
3085 | applyDefaultMapping(OpdMapper); |
3086 | executeInWaterfallLoop(B, MI, OpIndices: {3, 6}); |
3087 | return; |
3088 | } |
3089 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: |
3090 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: |
3091 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: |
3092 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: |
3093 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { |
3094 | applyMappingSBufferLoad(B, OpdMapper); |
3095 | return; |
3096 | } |
3097 | case AMDGPU::G_INTRINSIC: |
3098 | case AMDGPU::G_INTRINSIC_CONVERGENT: { |
3099 | switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) { |
3100 | case Intrinsic::amdgcn_readlane: { |
3101 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 2); |
3102 | |
3103 | assert(OpdMapper.getVRegs(0).empty()); |
3104 | assert(OpdMapper.getVRegs(3).empty()); |
3105 | |
3106 | // Make sure the index is an SGPR. It doesn't make sense to run this in a |
3107 | // waterfall loop, so assume it's a uniform value. |
3108 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index |
3109 | return; |
3110 | } |
3111 | case Intrinsic::amdgcn_writelane: { |
3112 | assert(OpdMapper.getVRegs(0).empty()); |
3113 | assert(OpdMapper.getVRegs(2).empty()); |
3114 | assert(OpdMapper.getVRegs(3).empty()); |
3115 | |
3116 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 4); // VGPR input val |
3117 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Source value |
3118 | constrainOpWithReadfirstlane(B, MI, OpIdx: 3); // Index |
3119 | return; |
3120 | } |
3121 | case Intrinsic::amdgcn_interp_p1: |
3122 | case Intrinsic::amdgcn_interp_p2: |
3123 | case Intrinsic::amdgcn_interp_mov: |
3124 | case Intrinsic::amdgcn_interp_p1_f16: |
3125 | case Intrinsic::amdgcn_interp_p2_f16: |
3126 | case Intrinsic::amdgcn_lds_param_load: { |
3127 | applyDefaultMapping(OpdMapper); |
3128 | |
// Readfirstlane for the m0 value, which is always the last operand.
3130 | // FIXME: Should this be a waterfall loop instead? |
3131 | constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index |
3132 | return; |
3133 | } |
3134 | case Intrinsic::amdgcn_interp_inreg_p10: |
3135 | case Intrinsic::amdgcn_interp_inreg_p2: |
3136 | case Intrinsic::amdgcn_interp_inreg_p10_f16: |
3137 | case Intrinsic::amdgcn_interp_inreg_p2_f16: |
3138 | case Intrinsic::amdgcn_interp_p10_rtz_f16: |
3139 | case Intrinsic::amdgcn_interp_p2_rtz_f16: |
3140 | applyDefaultMapping(OpdMapper); |
3141 | return; |
3142 | case Intrinsic::amdgcn_permlane16: |
3143 | case Intrinsic::amdgcn_permlanex16: { |
3144 | // Doing a waterfall loop over these wouldn't make any sense. |
3145 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 2); |
3146 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 3); |
3147 | constrainOpWithReadfirstlane(B, MI, OpIdx: 4); |
3148 | constrainOpWithReadfirstlane(B, MI, OpIdx: 5); |
3149 | return; |
3150 | } |
3151 | case Intrinsic::amdgcn_sbfe: |
3152 | applyMappingBFE(B, OpdMapper, Signed: true); |
3153 | return; |
3154 | case Intrinsic::amdgcn_ubfe: |
3155 | applyMappingBFE(B, OpdMapper, Signed: false); |
3156 | return; |
3157 | case Intrinsic::amdgcn_inverse_ballot: |
3158 | case Intrinsic::amdgcn_s_bitreplicate: |
3159 | case Intrinsic::amdgcn_s_quadmask: |
3160 | case Intrinsic::amdgcn_s_wqm: |
3161 | applyDefaultMapping(OpdMapper); |
3162 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // Mask |
3163 | return; |
3164 | case Intrinsic::amdgcn_ballot: |
3165 | // Use default handling and insert copy to vcc source. |
3166 | break; |
3167 | } |
3168 | break; |
3169 | } |
3170 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: |
3171 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: |
3172 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: |
3173 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { |
3174 | const AMDGPU::RsrcIntrinsic *RSrcIntrin = |
3175 | AMDGPU::lookupRsrcIntrinsic(Intr: AMDGPU::getIntrinsicID(I: MI)); |
3176 | assert(RSrcIntrin && RSrcIntrin->IsImage); |
3177 | // Non-images can have complications from operands that allow both SGPR |
3178 | // and VGPR. For now it's too complicated to figure out the final opcode |
3179 | // to derive the register bank from the MCInstrDesc. |
3180 | applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg); |
3181 | return; |
3182 | } |
3183 | case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { |
3184 | unsigned N = MI.getNumExplicitOperands() - 2; |
3185 | applyDefaultMapping(OpdMapper); |
3186 | executeInWaterfallLoop(B, MI, OpIndices: {N}); |
3187 | return; |
3188 | } |
3189 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
3190 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { |
3191 | auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID(); |
3192 | switch (IntrID) { |
3193 | case Intrinsic::amdgcn_ds_ordered_add: |
3194 | case Intrinsic::amdgcn_ds_ordered_swap: { |
3195 | // This is only allowed to execute with 1 lane, so readfirstlane is safe. |
3196 | assert(OpdMapper.getVRegs(0).empty()); |
3197 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 3); |
3198 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3199 | return; |
3200 | } |
3201 | case Intrinsic::amdgcn_ds_gws_init: |
3202 | case Intrinsic::amdgcn_ds_gws_barrier: |
3203 | case Intrinsic::amdgcn_ds_gws_sema_br: { |
// Only the first lane executes, so readfirstlane is safe.
3205 | substituteSimpleCopyRegs(OpdMapper, OpIdx: 1); |
3206 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3207 | return; |
3208 | } |
3209 | case Intrinsic::amdgcn_ds_gws_sema_v: |
3210 | case Intrinsic::amdgcn_ds_gws_sema_p: |
3211 | case Intrinsic::amdgcn_ds_gws_sema_release_all: { |
// Only the first lane executes, so readfirstlane is safe.
3213 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0 |
3214 | return; |
3215 | } |
3216 | case Intrinsic::amdgcn_ds_append: |
3217 | case Intrinsic::amdgcn_ds_consume: { |
3218 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3219 | return; |
3220 | } |
3221 | case Intrinsic::amdgcn_s_sendmsg: |
3222 | case Intrinsic::amdgcn_s_sendmsghalt: { |
3223 | // FIXME: Should this use a waterfall loop? |
3224 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3225 | return; |
3226 | } |
3227 | case Intrinsic::amdgcn_s_setreg: { |
3228 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
3229 | return; |
3230 | } |
3231 | case Intrinsic::amdgcn_s_ttracedata: |
3232 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // M0 |
3233 | return; |
3234 | case Intrinsic::amdgcn_raw_buffer_load_lds: |
3235 | case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { |
3236 | applyDefaultMapping(OpdMapper); |
3237 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc |
3238 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3239 | constrainOpWithReadfirstlane(B, MI, OpIdx: 5); // soffset |
3240 | return; |
3241 | } |
3242 | case Intrinsic::amdgcn_struct_buffer_load_lds: |
3243 | case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { |
3244 | applyDefaultMapping(OpdMapper); |
3245 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); // rsrc |
3246 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); // M0 |
3247 | constrainOpWithReadfirstlane(B, MI, OpIdx: 6); // soffset |
3248 | return; |
3249 | } |
3250 | case Intrinsic::amdgcn_global_load_lds: { |
3251 | applyDefaultMapping(OpdMapper); |
3252 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
3253 | return; |
3254 | } |
3255 | case Intrinsic::amdgcn_lds_direct_load: { |
3256 | applyDefaultMapping(OpdMapper); |
// Readfirstlane for the m0 value, which is always the last operand.
3258 | constrainOpWithReadfirstlane(B, MI, OpIdx: MI.getNumOperands() - 1); // Index |
3259 | return; |
3260 | } |
3261 | case Intrinsic::amdgcn_exp_row: |
3262 | applyDefaultMapping(OpdMapper); |
3263 | constrainOpWithReadfirstlane(B, MI, OpIdx: 8); // M0 |
3264 | return; |
3265 | case Intrinsic::amdgcn_s_sleep_var: |
3266 | assert(OpdMapper.getVRegs(1).empty()); |
3267 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
3268 | return; |
3269 | case Intrinsic::amdgcn_s_barrier_signal_var: |
3270 | case Intrinsic::amdgcn_s_barrier_join: |
3271 | case Intrinsic::amdgcn_s_wakeup_barrier: |
3272 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
3273 | return; |
3274 | case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: |
3275 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
3276 | return; |
3277 | case Intrinsic::amdgcn_s_barrier_init: |
3278 | constrainOpWithReadfirstlane(B, MI, OpIdx: 1); |
3279 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
3280 | return; |
3281 | case Intrinsic::amdgcn_s_get_barrier_state: { |
3282 | constrainOpWithReadfirstlane(B, MI, OpIdx: 2); |
3283 | return; |
3284 | } |
3285 | default: { |
3286 | if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = |
3287 | AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) { |
3288 | // Non-images can have complications from operands that allow both SGPR |
3289 | // and VGPR. For now it's too complicated to figure out the final opcode |
3290 | // to derive the register bank from the MCInstrDesc. |
3291 | if (RSrcIntrin->IsImage) { |
3292 | applyMappingImage(B, MI, OpdMapper, RsrcIdx: RSrcIntrin->RsrcArg); |
3293 | return; |
3294 | } |
3295 | } |
3296 | |
3297 | break; |
3298 | } |
3299 | } |
3300 | break; |
3301 | } |
3302 | case AMDGPU::G_SI_CALL: { |
3303 | // Use a set to avoid extra readfirstlanes in the case where multiple |
3304 | // operands are the same register. |
3305 | SmallSet<Register, 4> SGPROperandRegs; |
3306 | |
3307 | if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1})) |
3308 | break; |
3309 | |
// Move all copies to physical SGPRs that are used by the call instruction
// into the loop block. Search backwards from the call for these copies
// until the ADJCALLSTACKUP is reached.
3313 | unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP; |
3314 | unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN; |
3315 | |
3316 | // Move all non-copies before the copies, so that a complete range can be |
3317 | // moved into the waterfall loop. |
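//
// A sketch of the layout this produces (illustrative; the exact registers
// and operands depend on the call):
//   ADJCALLSTACKUP ...   ; non-copies are hoisted out from between copies
//   $sgprN = COPY %argA  ; \
//   $sgprM = COPY %argB  ; | contiguous range that, together with the call,
//   G_SI_CALL ...        ; | is wrapped in the waterfall loop
//   %ret = COPY $vgprK   ; /
//   ADJCALLSTACKDOWN ...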
3318 | SmallVector<MachineInstr *, 4> NonCopyInstrs; |
3319 | // Count of NonCopyInstrs found until the current LastCopy. |
3320 | unsigned NonCopyInstrsLen = 0; |
3321 | MachineBasicBlock::iterator Start(&MI); |
3322 | MachineBasicBlock::iterator LastCopy = Start; |
3323 | MachineBasicBlock *MBB = MI.getParent(); |
3324 | const SIMachineFunctionInfo *Info = |
3325 | MBB->getParent()->getInfo<SIMachineFunctionInfo>(); |
3326 | while (Start->getOpcode() != FrameSetupOpcode) { |
3327 | --Start; |
3328 | bool IsCopy = false; |
3329 | if (Start->getOpcode() == AMDGPU::COPY) { |
3330 | auto &Dst = Start->getOperand(i: 0); |
3331 | if (Dst.isReg()) { |
3332 | Register Reg = Dst.getReg(); |
3333 | if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) { |
3334 | IsCopy = true; |
3335 | } else { |
3336 | // Also move the copy from the scratch rsrc descriptor into the loop |
3337 | // to allow it to be optimized away. |
3338 | auto &Src = Start->getOperand(i: 1); |
3339 | if (Src.isReg()) { |
3340 | Reg = Src.getReg(); |
3341 | IsCopy = Info->getScratchRSrcReg() == Reg; |
3342 | } |
3343 | } |
3344 | } |
3345 | } |
3346 | |
3347 | if (IsCopy) { |
3348 | LastCopy = Start; |
3349 | NonCopyInstrsLen = NonCopyInstrs.size(); |
3350 | } else { |
3351 | NonCopyInstrs.push_back(Elt: &*Start); |
3352 | } |
3353 | } |
3354 | NonCopyInstrs.resize(N: NonCopyInstrsLen); |
3355 | |
3356 | for (auto *NonCopy : reverse(C&: NonCopyInstrs)) { |
3357 | MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator()); |
3358 | } |
3359 | Start = LastCopy; |
3360 | |
3361 | // Do the same for copies after the loop |
3362 | NonCopyInstrs.clear(); |
3363 | NonCopyInstrsLen = 0; |
3364 | MachineBasicBlock::iterator End(&MI); |
3365 | LastCopy = End; |
3366 | while (End->getOpcode() != FrameDestroyOpcode) { |
3367 | ++End; |
3368 | bool IsCopy = false; |
3369 | if (End->getOpcode() == AMDGPU::COPY) { |
3370 | auto &Src = End->getOperand(i: 1); |
3371 | if (Src.isReg()) { |
3372 | Register Reg = Src.getReg(); |
3373 | IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI); |
3374 | } |
3375 | } |
3376 | |
3377 | if (IsCopy) { |
3378 | LastCopy = End; |
3379 | NonCopyInstrsLen = NonCopyInstrs.size(); |
3380 | } else { |
3381 | NonCopyInstrs.push_back(Elt: &*End); |
3382 | } |
3383 | } |
3384 | NonCopyInstrs.resize(N: NonCopyInstrsLen); |
3385 | |
3386 | End = LastCopy; |
3387 | ++LastCopy; |
3388 | for (auto *NonCopy : reverse(C&: NonCopyInstrs)) { |
3389 | MBB->splice(Where: LastCopy, Other: MBB, From: NonCopy->getIterator()); |
3390 | } |
3391 | |
3392 | ++End; |
3393 | B.setInsertPt(MBB&: B.getMBB(), II: Start); |
3394 | executeInWaterfallLoop(B, make_range(x: Start, y: End), SGPROperandRegs); |
3395 | break; |
3396 | } |
3397 | case AMDGPU::G_LOAD: |
3398 | case AMDGPU::G_ZEXTLOAD: |
3399 | case AMDGPU::G_SEXTLOAD: { |
3400 | if (applyMappingLoad(B, OpdMapper, MI)) |
3401 | return; |
3402 | break; |
3403 | } |
3404 | case AMDGPU::G_DYN_STACKALLOC: |
3405 | applyMappingDynStackAlloc(B, OpdMapper, MI); |
3406 | return; |
3407 | case AMDGPU::G_STACKRESTORE: { |
3408 | applyDefaultMapping(OpdMapper); |
3409 | constrainOpWithReadfirstlane(B, MI, OpIdx: 0); |
3410 | return; |
3411 | } |
3412 | case AMDGPU::G_SBFX: |
3413 | applyMappingBFE(B, OpdMapper, /*Signed*/ true); |
3414 | return; |
3415 | case AMDGPU::G_UBFX: |
3416 | applyMappingBFE(B, OpdMapper, /*Signed*/ false); |
3417 | return; |
3418 | case AMDGPU::G_AMDGPU_MAD_U64_U32: |
3419 | case AMDGPU::G_AMDGPU_MAD_I64_I32: |
3420 | applyMappingMAD_64_32(B, OpdMapper); |
3421 | return; |
3422 | case AMDGPU::G_PREFETCH: { |
3423 | if (!Subtarget.hasPrefetch()) { |
3424 | MI.eraseFromParent(); |
3425 | return; |
3426 | } |
3427 | Register PtrReg = MI.getOperand(i: 0).getReg(); |
3428 | unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); |
3429 | if (PtrBank == AMDGPU::VGPRRegBankID) { |
3430 | MI.eraseFromParent(); |
3431 | return; |
3432 | } |
3433 | unsigned AS = MRI.getType(Reg: PtrReg).getAddressSpace(); |
3434 | if (!AMDGPU::isFlatGlobalAddrSpace(AS) && |
3435 | AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { |
3436 | MI.eraseFromParent(); |
3437 | return; |
3438 | } |
3439 | applyDefaultMapping(OpdMapper); |
3440 | return; |
3441 | } |
3442 | default: |
3443 | break; |
3444 | } |
3445 | |
3446 | return applyDefaultMapping(OpdMapper); |
3447 | } |
3448 | |
3449 | // vgpr, sgpr -> vgpr |
3450 | // vgpr, agpr -> vgpr |
3451 | // agpr, agpr -> agpr |
3452 | // agpr, sgpr -> vgpr |
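// For example, regBankUnion(AMDGPU::AGPRRegBankID, AMDGPU::SGPRRegBankID)
// returns AMDGPU::VGPRRegBankID, matching the agpr, sgpr -> vgpr row above.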
3453 | static unsigned regBankUnion(unsigned RB0, unsigned RB1) { |
3454 | if (RB0 == AMDGPU::InvalidRegBankID) |
3455 | return RB1; |
3456 | if (RB1 == AMDGPU::InvalidRegBankID) |
3457 | return RB0; |
3458 | |
3459 | if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) |
3460 | return AMDGPU::SGPRRegBankID; |
3461 | |
3462 | if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) |
3463 | return AMDGPU::AGPRRegBankID; |
3464 | |
3465 | return AMDGPU::VGPRRegBankID; |
3466 | } |
3467 | |
3468 | static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { |
3469 | if (RB0 == AMDGPU::InvalidRegBankID) |
3470 | return RB1; |
3471 | if (RB1 == AMDGPU::InvalidRegBankID) |
3472 | return RB0; |
3473 | |
3474 | // vcc, vcc -> vcc |
3475 | // vcc, sgpr -> vcc |
3476 | // vcc, vgpr -> vcc |
3477 | if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) |
3478 | return AMDGPU::VCCRegBankID; |
3479 | |
// Neither operand is vcc here, so fall back to the plain bank union.
return regBankUnion(RB0, RB1);
3482 | } |
3483 | |
3484 | unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, |
3485 | const MachineInstr &MI) const { |
3486 | unsigned RegBank = AMDGPU::InvalidRegBankID; |
3487 | |
3488 | for (const MachineOperand &MO : MI.operands()) { |
3489 | if (!MO.isReg()) |
3490 | continue; |
3491 | Register Reg = MO.getReg(); |
3492 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { |
3493 | RegBank = regBankUnion(RB0: RegBank, RB1: Bank->getID()); |
3494 | if (RegBank == AMDGPU::VGPRRegBankID) |
3495 | break; |
3496 | } |
3497 | } |
3498 | |
3499 | return RegBank; |
3500 | } |
3501 | |
3502 | bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { |
3503 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3504 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3505 | for (const MachineOperand &MO : MI.operands()) { |
3506 | if (!MO.isReg()) |
3507 | continue; |
3508 | Register Reg = MO.getReg(); |
3509 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { |
3510 | if (Bank->getID() != AMDGPU::SGPRRegBankID) |
3511 | return false; |
3512 | } |
3513 | } |
3514 | return true; |
3515 | } |
3516 | |
3517 | const RegisterBankInfo::InstructionMapping & |
3518 | AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { |
3519 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3520 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3521 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
3522 | |
3523 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
3524 | const MachineOperand &SrcOp = MI.getOperand(i); |
3525 | if (!SrcOp.isReg()) |
3526 | continue; |
3527 | |
3528 | unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); |
3529 | OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
3530 | } |
3531 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
3532 | NumOperands: MI.getNumOperands()); |
3533 | } |
3534 | |
3535 | const RegisterBankInfo::InstructionMapping & |
3536 | AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { |
3537 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3538 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3539 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
3540 | |
3541 | // Even though we technically could use SGPRs, this would require knowledge of |
3542 | // the constant bus restriction. Force all sources to VGPR (except for VCC). |
3543 | // |
3544 | // TODO: Unary ops are trivially OK, so accept SGPRs? |
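//
// As an illustrative result (made-up registers): for a 32-bit
// %d = G_ADD %x, %y this reports vgpr for all three operands, while an s1
// operand (e.g. the carry-out of G_UADDO) would be reported as vcc.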
3545 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
3546 | const MachineOperand &Src = MI.getOperand(i); |
3547 | if (!Src.isReg()) |
3548 | continue; |
3549 | |
3550 | unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); |
3551 | unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; |
3552 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); |
3553 | } |
3554 | |
3555 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
3556 | NumOperands: MI.getNumOperands()); |
3557 | } |
3558 | |
3559 | const RegisterBankInfo::InstructionMapping & |
3560 | AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { |
3561 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3562 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3563 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
3564 | |
3565 | for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { |
3566 | const MachineOperand &Op = MI.getOperand(i: I); |
3567 | if (!Op.isReg()) |
3568 | continue; |
3569 | |
3570 | unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); |
3571 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3572 | } |
3573 | |
3574 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), |
3575 | NumOperands: MI.getNumOperands()); |
3576 | } |
3577 | |
3578 | const RegisterBankInfo::InstructionMapping & |
3579 | AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, |
3580 | const MachineInstr &MI, |
3581 | int RsrcIdx) const { |
// The reported argument index is relative to the IR intrinsic call
// arguments, so we need to shift by the number of defs plus one for the
// intrinsic ID operand.
RsrcIdx += MI.getNumExplicitDefs() + 1;
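// For example (hypothetical operand layout): with one def and the intrinsic
// ID in operand 1, an IR argument index of 2 lands at machine operand
// 1 + 1 + 2 = 4.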
3585 | |
3586 | const int NumOps = MI.getNumOperands(); |
3587 | SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); |
3588 | |
3589 | // TODO: Should packed/unpacked D16 difference be reported here as part of |
3590 | // the value mapping? |
3591 | for (int I = 0; I != NumOps; ++I) { |
3592 | if (!MI.getOperand(i: I).isReg()) |
3593 | continue; |
3594 | |
3595 | Register OpReg = MI.getOperand(i: I).getReg(); |
3596 | // We replace some dead address operands with $noreg |
3597 | if (!OpReg) |
3598 | continue; |
3599 | |
3600 | unsigned Size = getSizeInBits(OpReg, MRI, *TRI); |
3601 | |
3602 | // FIXME: Probably need a new intrinsic register bank searchable table to |
3603 | // handle arbitrary intrinsics easily. |
3604 | // |
3605 | // If this has a sampler, it immediately follows rsrc. |
3606 | const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; |
3607 | |
if (MustBeSGPR) {
  // This must be an SGPR, but we have to report whatever bank it currently
  // has as legal; a divergent value is repaired later in applyMappingImpl.
  unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
  OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3612 | } else { |
3613 | // Some operands must be VGPR, and these are easy to copy to. |
3614 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3615 | } |
3616 | } |
3617 | |
3618 | return getInstructionMapping(ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: NumOps); |
3619 | } |
3620 | |
3621 | /// Return the mapping for a pointer argument. |
3622 | const RegisterBankInfo::ValueMapping * |
3623 | AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, |
3624 | Register PtrReg) const { |
3625 | LLT PtrTy = MRI.getType(Reg: PtrReg); |
3626 | unsigned Size = PtrTy.getSizeInBits(); |
3627 | if (Subtarget.useFlatForGlobal() || |
3628 | !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) |
3629 | return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3630 | |
3631 | // If we're using MUBUF instructions for global memory, an SGPR base register |
3632 | // is possible. Otherwise this needs to be a VGPR. |
3633 | const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); |
3634 | return AMDGPU::getValueMapping(BankID: PtrBank->getID(), Size); |
3635 | } |
3636 | |
3637 | const RegisterBankInfo::InstructionMapping & |
3638 | AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { |
3639 | |
3640 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3641 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3642 | SmallVector<const ValueMapping*, 2> OpdsMapping(2); |
3643 | unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI); |
3644 | Register PtrReg = MI.getOperand(i: 1).getReg(); |
3645 | LLT PtrTy = MRI.getType(Reg: PtrReg); |
3646 | unsigned AS = PtrTy.getAddressSpace(); |
3647 | unsigned PtrSize = PtrTy.getSizeInBits(); |
3648 | |
3649 | const ValueMapping *ValMapping; |
3650 | const ValueMapping *PtrMapping; |
3651 | |
3652 | const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); |
3653 | |
3654 | if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { |
3655 | if (isScalarLoadLegal(MI)) { |
// We have a uniform instruction, so we want to use an SMRD load.
3657 | ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
3658 | PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); |
3659 | } else { |
3660 | ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3661 | |
3662 | // If we're using MUBUF instructions for global memory, an SGPR base |
3663 | // register is possible. Otherwise this needs to be a VGPR. |
3664 | unsigned PtrBankID = Subtarget.useFlatForGlobal() ? |
3665 | AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; |
3666 | |
3667 | PtrMapping = AMDGPU::getValueMapping(BankID: PtrBankID, Size: PtrSize); |
3668 | } |
3669 | } else { |
3670 | ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3671 | PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); |
3672 | } |
3673 | |
3674 | OpdsMapping[0] = ValMapping; |
3675 | OpdsMapping[1] = PtrMapping; |
3676 | const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( |
3677 | ID: 1, Cost: 1, OperandsMapping: getOperandsMapping(OpdsMapping), NumOperands: MI.getNumOperands()); |
3678 | return Mapping; |
3679 | |
3680 | // FIXME: Do we want to add a mapping for FLAT load, or should we just |
3681 | // handle that during instruction selection? |
3682 | } |
3683 | |
3684 | unsigned |
3685 | AMDGPURegisterBankInfo::getRegBankID(Register Reg, |
3686 | const MachineRegisterInfo &MRI, |
3687 | unsigned Default) const { |
3688 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); |
3689 | return Bank ? Bank->getID() : Default; |
3690 | } |
3691 | |
3692 | const RegisterBankInfo::ValueMapping * |
3693 | AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, |
3694 | const MachineRegisterInfo &MRI, |
3695 | const TargetRegisterInfo &TRI) const { |
// Lie and claim anything is legal, even though this needs to be an SGPR;
// applyMapping will have to deal with it as a waterfall loop.
3698 | unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID); |
3699 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
3700 | return AMDGPU::getValueMapping(BankID: Bank, Size); |
3701 | } |
3702 | |
3703 | const RegisterBankInfo::ValueMapping * |
3704 | AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, |
3705 | const MachineRegisterInfo &MRI, |
3706 | const TargetRegisterInfo &TRI) const { |
3707 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
3708 | return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3709 | } |
3710 | |
3711 | const RegisterBankInfo::ValueMapping * |
3712 | AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, |
3713 | const MachineRegisterInfo &MRI, |
3714 | const TargetRegisterInfo &TRI) const { |
3715 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
3716 | return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size); |
3717 | } |
3718 | |
3719 | /// |
3720 | /// This function must return a legal mapping, because |
3721 | /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called |
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
3724 | /// |
3725 | // Operands that must be SGPRs must accept potentially divergent VGPRs as |
3726 | // legal. These will be dealt with in applyMappingImpl. |
3727 | // |
3728 | const RegisterBankInfo::InstructionMapping & |
3729 | AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { |
3730 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3731 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3732 | |
3733 | if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { |
3734 | // The default logic bothers to analyze impossible alternative mappings. We |
3735 | // want the most straightforward mapping, so just directly handle this. |
3736 | const RegisterBank *DstBank = getRegBank(MI.getOperand(i: 0).getReg(), MRI, |
3737 | *TRI); |
3738 | const RegisterBank *SrcBank = getRegBank(MI.getOperand(i: 1).getReg(), MRI, |
3739 | *TRI); |
assert(SrcBank && "src bank should have been assigned already");
3741 | if (!DstBank) |
3742 | DstBank = SrcBank; |
3743 | |
3744 | unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI); |
3745 | if (MI.getOpcode() != AMDGPU::G_FREEZE && |
3746 | cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size))) |
3747 | return getInvalidInstructionMapping(); |
3748 | |
3749 | const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: *DstBank); |
3750 | unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2; |
3751 | SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); |
3752 | OpdsMapping[0] = &ValMap; |
3753 | if (MI.getOpcode() == AMDGPU::G_FREEZE) |
3754 | OpdsMapping[1] = &ValMap; |
3755 | |
3756 | return getInstructionMapping( |
3757 | ID: 1, /*Cost*/ 1, |
3758 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping), NumOperands: OpdsMappingSize); |
3759 | } |
3760 | |
3761 | if (MI.isRegSequence()) { |
3762 | // If any input is a VGPR, the result must be a VGPR. The default handling |
3763 | // assumes any copy between banks is legal. |
3764 | unsigned BankID = AMDGPU::SGPRRegBankID; |
3765 | |
3766 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
3767 | auto OpBank = getRegBankID(Reg: MI.getOperand(i: I).getReg(), MRI); |
3768 | // It doesn't make sense to use vcc or scc banks here, so just ignore |
3769 | // them. |
3770 | if (OpBank != AMDGPU::SGPRRegBankID) { |
3771 | BankID = AMDGPU::VGPRRegBankID; |
3772 | break; |
3773 | } |
3774 | } |
3775 | unsigned Size = getSizeInBits(MI.getOperand(i: 0).getReg(), MRI, *TRI); |
3776 | |
3777 | const ValueMapping &ValMap = getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: BankID)); |
3778 | return getInstructionMapping( |
3779 | ID: 1, /*Cost*/ 1, |
3780 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1); |
3781 | } |
3782 | |
3783 | // The default handling is broken and doesn't handle illegal SGPR->VGPR copies |
3784 | // properly. |
3785 | // |
3786 | // TODO: There are additional exec masking dependencies to analyze. |
3787 | if (auto *PHI = dyn_cast<GPhi>(Val: &MI)) { |
3788 | unsigned ResultBank = AMDGPU::InvalidRegBankID; |
3789 | Register DstReg = PHI->getReg(Idx: 0); |
3790 | |
3791 | // Sometimes the result may have already been assigned a bank. |
3792 | if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) |
3793 | ResultBank = DstBank->getID(); |
3794 | |
3795 | for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) { |
3796 | Register Reg = PHI->getIncomingValue(I); |
3797 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); |
3798 | |
3799 | // FIXME: Assuming VGPR for any undetermined inputs. |
3800 | if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { |
3801 | ResultBank = AMDGPU::VGPRRegBankID; |
3802 | break; |
3803 | } |
3804 | |
3805 | // FIXME: Need to promote SGPR case to s32 |
3806 | unsigned OpBank = Bank->getID(); |
3807 | ResultBank = regBankBoolUnion(RB0: ResultBank, RB1: OpBank); |
3808 | } |
3809 | |
3810 | assert(ResultBank != AMDGPU::InvalidRegBankID); |
3811 | |
3812 | unsigned Size = MRI.getType(Reg: DstReg).getSizeInBits(); |
3813 | |
3814 | const ValueMapping &ValMap = |
3815 | getValueMapping(StartIdx: 0, Length: Size, RegBank: getRegBank(ID: ResultBank)); |
3816 | return getInstructionMapping( |
3817 | ID: 1, /*Cost*/ 1, |
3818 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping: {&ValMap}), NumOperands: 1); |
3819 | } |
3820 | |
3821 | const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); |
3822 | if (Mapping.isValid()) |
3823 | return Mapping; |
3824 | |
3825 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
3826 | |
3827 | switch (MI.getOpcode()) { |
3828 | default: |
3829 | return getInvalidInstructionMapping(); |
3830 | |
3831 | case AMDGPU::G_AND: |
3832 | case AMDGPU::G_OR: |
3833 | case AMDGPU::G_XOR: |
3834 | case AMDGPU::G_MUL: { |
3835 | unsigned Size = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
3836 | if (Size == 1) { |
3837 | const RegisterBank *DstBank |
3838 | = getRegBank(MI.getOperand(i: 0).getReg(), MRI, *TRI); |
3839 | |
3840 | unsigned TargetBankID = AMDGPU::InvalidRegBankID; |
3841 | unsigned BankLHS = AMDGPU::InvalidRegBankID; |
3842 | unsigned BankRHS = AMDGPU::InvalidRegBankID; |
3843 | if (DstBank) { |
3844 | TargetBankID = DstBank->getID(); |
3845 | if (DstBank == &AMDGPU::VCCRegBank) { |
3846 | TargetBankID = AMDGPU::VCCRegBankID; |
3847 | BankLHS = AMDGPU::VCCRegBankID; |
3848 | BankRHS = AMDGPU::VCCRegBankID; |
3849 | } else { |
3850 | BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, |
3851 | AMDGPU::SGPRRegBankID); |
3852 | BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, |
3853 | AMDGPU::SGPRRegBankID); |
3854 | } |
3855 | } else { |
3856 | BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, |
3857 | AMDGPU::VCCRegBankID); |
3858 | BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, |
3859 | AMDGPU::VCCRegBankID); |
3860 | |
3861 | // Both inputs should be true booleans to produce a boolean result. |
3862 | if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { |
3863 | TargetBankID = AMDGPU::VGPRRegBankID; |
3864 | } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { |
3865 | TargetBankID = AMDGPU::VCCRegBankID; |
3866 | BankLHS = AMDGPU::VCCRegBankID; |
3867 | BankRHS = AMDGPU::VCCRegBankID; |
3868 | } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { |
3869 | TargetBankID = AMDGPU::SGPRRegBankID; |
3870 | } |
3871 | } |
3872 | |
3873 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID: TargetBankID, Size); |
3874 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: BankLHS, Size); |
3875 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: BankRHS, Size); |
3876 | break; |
3877 | } |
3878 | |
3879 | if (Size == 64) { |
3881 | if (isSALUMapping(MI)) { |
3882 | OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); |
3883 | OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; |
3884 | } else { |
3885 | OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); |
3886 | unsigned Bank1 = getRegBankID(Reg: MI.getOperand(i: 1).getReg(), MRI /*, DefaultBankID*/); |
3887 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID: Bank1, Size); |
3888 | |
3889 | unsigned Bank2 = getRegBankID(Reg: MI.getOperand(i: 2).getReg(), MRI /*, DefaultBankID*/); |
3890 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID: Bank2, Size); |
3891 | } |
3892 | |
3893 | break; |
3894 | } |
3895 | |
3896 | [[fallthrough]]; |
3897 | } |
3898 | case AMDGPU::G_PTR_ADD: |
3899 | case AMDGPU::G_PTRMASK: |
3900 | case AMDGPU::G_ADD: |
3901 | case AMDGPU::G_SUB: |
3902 | case AMDGPU::G_SHL: |
3903 | case AMDGPU::G_LSHR: |
3904 | case AMDGPU::G_ASHR: |
3905 | case AMDGPU::G_UADDO: |
3906 | case AMDGPU::G_USUBO: |
3907 | case AMDGPU::G_UADDE: |
3908 | case AMDGPU::G_SADDE: |
3909 | case AMDGPU::G_USUBE: |
3910 | case AMDGPU::G_SSUBE: |
3911 | case AMDGPU::G_SMIN: |
3912 | case AMDGPU::G_SMAX: |
3913 | case AMDGPU::G_UMIN: |
3914 | case AMDGPU::G_UMAX: |
3915 | case AMDGPU::G_ABS: |
3916 | case AMDGPU::G_SHUFFLE_VECTOR: |
3917 | case AMDGPU::G_SBFX: |
3918 | case AMDGPU::G_UBFX: |
3919 | case AMDGPU::G_AMDGPU_S_MUL_I64_I32: |
3920 | case AMDGPU::G_AMDGPU_S_MUL_U64_U32: |
3921 | if (isSALUMapping(MI)) |
3922 | return getDefaultMappingSOP(MI); |
3923 | return getDefaultMappingVOP(MI); |
3924 | case AMDGPU::G_FADD: |
3925 | case AMDGPU::G_FSUB: |
3926 | case AMDGPU::G_FMUL: |
3927 | case AMDGPU::G_FMA: |
3928 | case AMDGPU::G_FFLOOR: |
3929 | case AMDGPU::G_FCEIL: |
3930 | case AMDGPU::G_INTRINSIC_ROUNDEVEN: |
3931 | case AMDGPU::G_FMINNUM: |
3932 | case AMDGPU::G_FMAXNUM: |
3933 | case AMDGPU::G_FMINIMUM: |
3934 | case AMDGPU::G_FMAXIMUM: |
3935 | case AMDGPU::G_INTRINSIC_TRUNC: |
3936 | case AMDGPU::G_STRICT_FADD: |
3937 | case AMDGPU::G_STRICT_FSUB: |
3938 | case AMDGPU::G_STRICT_FMUL: |
3939 | case AMDGPU::G_STRICT_FMA: { |
3940 | LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg()); |
3941 | unsigned Size = Ty.getSizeInBits(); |
3942 | if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() && |
3943 | (Size == 32 || Size == 16) && isSALUMapping(MI)) |
3944 | return getDefaultMappingSOP(MI); |
3945 | return getDefaultMappingVOP(MI); |
3946 | } |
3947 | case AMDGPU::G_FPTOSI: |
3948 | case AMDGPU::G_FPTOUI: |
3949 | case AMDGPU::G_SITOFP: |
3950 | case AMDGPU::G_UITOFP: { |
3951 | unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
3952 | unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
3953 | if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 && |
3954 | isSALUMapping(MI)) |
3955 | return getDefaultMappingSOP(MI); |
3956 | return getDefaultMappingVOP(MI); |
3957 | } |
3958 | case AMDGPU::G_FPTRUNC: |
3959 | case AMDGPU::G_FPEXT: { |
3960 | unsigned SizeDst = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits(); |
3961 | unsigned SizeSrc = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits(); |
3962 | if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 && |
3963 | isSALUMapping(MI)) |
3964 | return getDefaultMappingSOP(MI); |
3965 | return getDefaultMappingVOP(MI); |
3966 | } |
3967 | case AMDGPU::G_FSQRT: |
3968 | case AMDGPU::G_FEXP2: |
3969 | case AMDGPU::G_FLOG2: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3971 | if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && |
3972 | isSALUMapping(MI)) |
3973 | return getDefaultMappingSOP(MI); |
3974 | return getDefaultMappingVOP(MI); |
3975 | } |
3976 | case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU |
3977 | case AMDGPU::G_SSUBSAT: |
3978 | case AMDGPU::G_UADDSAT: |
3979 | case AMDGPU::G_USUBSAT: |
3980 | case AMDGPU::G_FMAD: |
3981 | case AMDGPU::G_FLDEXP: |
3982 | case AMDGPU::G_FMINNUM_IEEE: |
3983 | case AMDGPU::G_FMAXNUM_IEEE: |
3984 | case AMDGPU::G_FCANONICALIZE: |
3985 | case AMDGPU::G_STRICT_FLDEXP: |
3986 | case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? |
3987 | case AMDGPU::G_FSHR: // TODO: Expand for scalar |
3988 | case AMDGPU::G_AMDGPU_FMIN_LEGACY: |
3989 | case AMDGPU::G_AMDGPU_FMAX_LEGACY: |
3990 | case AMDGPU::G_AMDGPU_RCP_IFLAG: |
3991 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: |
3992 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: |
3993 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: |
3994 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: |
3995 | case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: |
3996 | case AMDGPU::G_AMDGPU_SMED3: |
3997 | case AMDGPU::G_AMDGPU_FMED3: |
3998 | return getDefaultMappingVOP(MI); |
3999 | case AMDGPU::G_UMULH: |
4000 | case AMDGPU::G_SMULH: { |
4001 | if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) |
4002 | return getDefaultMappingSOP(MI); |
4003 | return getDefaultMappingVOP(MI); |
4004 | } |
4005 | case AMDGPU::G_AMDGPU_MAD_U64_U32: |
4006 | case AMDGPU::G_AMDGPU_MAD_I64_I32: { |
4007 | // Three possible mappings: |
4008 | // |
4009 | // - Default SOP |
4010 | // - Default VOP |
4011 | // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. |
4012 | // |
4013 | // This allows instruction selection to keep the multiplication part of the |
4014 | // instruction on the SALU. |
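// Illustrative MIR sketch (register names are hypothetical): with uniform
// multiplicands and a divergent addend,
//   %d:vgpr(s64), %c:vcc(s1) = G_AMDGPU_MAD_U64_U32 %a:sgpr, %b:sgpr, %acc:vgpr
// the scalar-multiply mapping keeps the 32x32->64 multiply on the SALU and
// only the 64-bit accumulate on the VALU.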
4015 | bool AllSalu = true; |
4016 | bool MulSalu = true; |
4017 | for (unsigned i = 0; i < 5; ++i) { |
4018 | Register Reg = MI.getOperand(i).getReg(); |
4019 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { |
4020 | if (Bank->getID() != AMDGPU::SGPRRegBankID) { |
4021 | AllSalu = false; |
4022 | if (i == 2 || i == 3) { |
4023 | MulSalu = false; |
4024 | break; |
4025 | } |
4026 | } |
4027 | } |
4028 | } |
4029 | |
4030 | if (AllSalu) |
4031 | return getDefaultMappingSOP(MI); |
4032 | |
4033 | // If the multiply-add is full-rate in VALU, use that even if the |
4034 | // multiplication part is scalar. Accumulating separately on the VALU would |
4035 | // take two instructions. |
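// (E.g. subtargets reporting hasFullRate64Ops() can issue the 64-bit VALU
// MAD at full rate, so a single VALU instruction beats a scalar multiply
// plus a two-instruction VALU accumulate.)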
4036 | if (!MulSalu || Subtarget.hasFullRate64Ops()) |
4037 | return getDefaultMappingVOP(MI); |
4038 | |
4039 | // Keep the multiplication on the SALU, then accumulate on the VALU. |
4040 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); |
4041 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4042 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
4043 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
4044 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); |
4045 | break; |
4046 | } |
4047 | case AMDGPU::G_IMPLICIT_DEF: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4049 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4050 | break; |
4051 | } |
4052 | case AMDGPU::G_FCONSTANT: |
4053 | case AMDGPU::G_CONSTANT: |
4054 | case AMDGPU::G_GLOBAL_VALUE: |
4055 | case AMDGPU::G_BLOCK_ADDR: |
4056 | case AMDGPU::G_READSTEADYCOUNTER: |
4057 | case AMDGPU::G_READCYCLECOUNTER: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4059 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4060 | break; |
4061 | } |
4062 | case AMDGPU::G_FRAME_INDEX: { |
4063 | // TODO: This should be the same as other constants, but eliminateFrameIndex |
4064 | // currently assumes VALU uses. |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4066 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4067 | break; |
4068 | } |
4069 | case AMDGPU::G_DYN_STACKALLOC: { |
4070 | // Result is always uniform, and a wave reduction is needed for the source. |
4071 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
4074 | break; |
4075 | } |
4076 | case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { |
4077 | // This case is weird because we expect a physical register in the source, |
4078 | // but need to set a bank anyway. |
4079 | // |
4080 | // TODO: We could select the result to SGPR or VGPR |
4081 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
4082 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); |
4083 | break; |
4084 | } |
4085 | case AMDGPU::G_INSERT: { |
4086 | unsigned BankID = getMappingType(MRI, MI); |
unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
4093 | OpdsMapping[3] = nullptr; |
4094 | break; |
4095 | } |
4096 | case AMDGPU::G_EXTRACT: { |
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4102 | OpdsMapping[2] = nullptr; |
4103 | break; |
4104 | } |
4105 | case AMDGPU::G_BUILD_VECTOR: |
4106 | case AMDGPU::G_BUILD_VECTOR_TRUNC: { |
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
if (DstTy == LLT::fixed_vector(2, 16)) {
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);

OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
4118 | break; |
4119 | } |
4120 | |
4121 | [[fallthrough]]; |
4122 | } |
4123 | case AMDGPU::G_MERGE_VALUES: |
4124 | case AMDGPU::G_CONCAT_VECTORS: { |
4125 | unsigned Bank = getMappingType(MRI, MI); |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
// Op1 and Dst should use the same register bank.
for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
4133 | break; |
4134 | } |
4135 | case AMDGPU::G_BITREVERSE: |
4136 | case AMDGPU::G_BITCAST: |
4137 | case AMDGPU::G_INTTOPTR: |
4138 | case AMDGPU::G_PTRTOINT: |
4139 | case AMDGPU::G_FABS: |
4140 | case AMDGPU::G_FNEG: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4143 | OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); |
4144 | break; |
4145 | } |
4146 | case AMDGPU::G_AMDGPU_FFBH_U32: |
4147 | case AMDGPU::G_AMDGPU_FFBL_B32: |
4148 | case AMDGPU::G_CTLZ_ZERO_UNDEF: |
4149 | case AMDGPU::G_CTTZ_ZERO_UNDEF: { |
unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4153 | OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); |
4154 | break; |
4155 | } |
4156 | case AMDGPU::G_CTPOP: { |
unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4160 | |
4161 | // This should really be getValueMappingSGPR64Only, but allowing the generic |
4162 | // code to handle the register split just makes using LegalizerHelper more |
4163 | // difficult. |
4164 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); |
4165 | break; |
4166 | } |
4167 | case AMDGPU::G_TRUNC: { |
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
unsigned Bank = getRegBankID(Src, MRI);
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
4175 | break; |
4176 | } |
4177 | case AMDGPU::G_ZEXT: |
4178 | case AMDGPU::G_SEXT: |
4179 | case AMDGPU::G_ANYEXT: |
4180 | case AMDGPU::G_SEXT_INREG: { |
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
4183 | unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); |
4184 | unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); |
4185 | |
4186 | unsigned DstBank; |
4187 | const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); |
4188 | assert(SrcBank); |
4189 | switch (SrcBank->getID()) { |
4190 | case AMDGPU::SGPRRegBankID: |
4191 | DstBank = AMDGPU::SGPRRegBankID; |
4192 | break; |
4193 | default: |
4194 | DstBank = AMDGPU::VGPRRegBankID; |
4195 | break; |
4196 | } |
4197 | |
4198 | // Scalar extend can use 64-bit BFE, but VGPRs require extending to |
4199 | // 32-bits, and then to 64. |
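// Sketch (illustrative): a scalar s32 -> s64 sign extend can be selected as
// one 64-bit S_BFE_I64, whereas the VGPR path extends within 32 bits and
// then derives the high half, e.g. with a 31-bit V_ASHRREV_I32 of the low
// half for the sign bits.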
OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), SrcSize);
4203 | break; |
4204 | } |
4205 | case AMDGPU::G_IS_FPCLASS: { |
Register SrcReg = MI.getOperand(1).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4209 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); |
4210 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
4211 | break; |
4212 | } |
4213 | case AMDGPU::G_STORE: { |
4214 | assert(MI.getOperand(0).isReg()); |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4216 | |
4217 | // FIXME: We need to specify a different reg bank once scalar stores are |
4218 | // supported. |
4219 | const ValueMapping *ValMapping = |
4220 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4221 | OpdsMapping[0] = ValMapping; |
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4223 | break; |
4224 | } |
4225 | case AMDGPU::G_ICMP: |
4226 | case AMDGPU::G_FCMP: { |
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4228 | |
4229 | // See if the result register has already been constrained to vcc, which may |
4230 | // happen due to control flow intrinsic lowering. |
4231 | unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, |
4232 | AMDGPU::SGPRRegBankID); |
unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4235 | |
4236 | auto canUseSCCICMP = [&]() { |
4237 | auto Pred = |
static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4239 | return Size == 32 || |
4240 | (Size == 64 && |
4241 | (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && |
4242 | Subtarget.hasScalarCompareEq64()); |
4243 | }; |
4244 | auto canUseSCCFCMP = [&]() { |
4245 | return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16); |
4246 | }; |
4247 | |
4248 | bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP; |
4249 | bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && |
4250 | Op2Bank == AMDGPU::SGPRRegBankID && |
4251 | Op3Bank == AMDGPU::SGPRRegBankID && |
4252 | (isICMP ? canUseSCCICMP() : canUseSCCFCMP()); |
4253 | |
4254 | DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; |
4255 | unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
4256 | |
4257 | // TODO: Use 32-bit for scalar output size. |
4258 | // SCC results will need to be copied to a 32-bit SGPR virtual register. |
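// (Illustrative assumption: the copy out of SCC is typically materialized
// at selection time with a conditional move such as S_CSELECT_B32.)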
4259 | const unsigned ResultSize = 1; |
4260 | |
OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
OpdsMapping[1] = nullptr; // Predicate Operand.
OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4265 | break; |
4266 | } |
4267 | case AMDGPU::G_EXTRACT_VECTOR_ELT: { |
4268 | // VGPR index can be used for waterfall when indexing a SGPR vector. |
unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

// The index can be in either bank if the source vector is VGPR.
OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4281 | break; |
4282 | } |
4283 | case AMDGPU::G_INSERT_VECTOR_ELT: { |
4284 | unsigned OutputBankID = isSALUMapping(MI) ? |
4285 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
4286 | |
unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);

OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4295 | |
4296 | // This is a weird case, because we need to break down the mapping based on |
4297 | // the register bank of a different operand. |
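// Sketch of the split: a 64-bit insert into a VGPR vector is selected as
// two 32-bit indexed writes, so the inserted value is described as two
// 32-bit pieces, each of which may independently be an SGPR or VGPR.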
4298 | if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { |
OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID, InsertSize);
4301 | } else { |
4302 | assert(InsertSize == 32 || InsertSize == 64); |
OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4304 | } |
4305 | |
// The index can be in either bank if the source vector is VGPR.
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4308 | break; |
4309 | } |
4310 | case AMDGPU::G_UNMERGE_VALUES: { |
4311 | unsigned Bank = getMappingType(MRI, MI); |
4312 | |
4313 | // Op1 and Dst should use the same register bank. |
4314 | // FIXME: Shouldn't this be the default? Why do we need to handle this? |
4315 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
4316 | unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); |
OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4318 | } |
4319 | break; |
4320 | } |
4321 | case AMDGPU::G_AMDGPU_BUFFER_LOAD: |
4322 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: |
4323 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: |
4324 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: |
4325 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: |
4326 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: |
4327 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE: |
4328 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: |
4329 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: |
4330 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: |
4331 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: |
4332 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: |
4333 | case AMDGPU::G_AMDGPU_BUFFER_STORE: |
4334 | case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: |
4335 | case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: |
4336 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: |
4337 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

// rsrc
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

// vindex
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

// voffset
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

// soffset
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4351 | |
4352 | // Any remaining operands are immediates and were correctly null |
4353 | // initialized. |
4354 | break; |
4355 | } |
4356 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: |
4357 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: |
4358 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: |
4359 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: |
4360 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: |
4361 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: |
4362 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: |
4363 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: |
4364 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: |
4365 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: |
4366 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: |
4367 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: |
4368 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: |
4369 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: |
4370 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: |
4371 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { |
4372 | // vdata_out |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

// vdata_in
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

// rsrc
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

// vindex
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

// voffset
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

// soffset
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4389 | |
4390 | // Any remaining operands are immediates and were correctly null |
4391 | // initialized. |
4392 | break; |
4393 | } |
4394 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { |
4395 | // vdata_out |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

// vdata_in
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

// cmp
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

// rsrc
OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

// vindex
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

// voffset
OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

// soffset
OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4415 | |
4416 | // Any remaining operands are immediates and were correctly null |
4417 | // initialized. |
4418 | break; |
4419 | } |
4420 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: |
4421 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: |
4422 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: |
4423 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: |
4424 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { |
4425 | // Lie and claim everything is legal, even though some need to be |
4426 | // SGPRs. applyMapping will have to deal with it as a waterfall loop. |
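// Illustrative consequence: if the descriptor or offset ends up divergent,
// applyMapping rewrites this as a buffer load inside a waterfall loop,
// using readfirstlane to form a uniform descriptor for each set of lanes.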
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4429 | |
// We need to convert this to a MUBUF if either the resource or the offset
// is VGPR.
4432 | unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); |
4433 | unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); |
unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4435 | |
unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4438 | break; |
4439 | } |
4440 | case AMDGPU::G_INTRINSIC: |
4441 | case AMDGPU::G_INTRINSIC_CONVERGENT: { |
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4443 | default: |
4444 | return getInvalidInstructionMapping(); |
4445 | case Intrinsic::amdgcn_div_fmas: |
4446 | case Intrinsic::amdgcn_div_fixup: |
4447 | case Intrinsic::amdgcn_trig_preop: |
4448 | case Intrinsic::amdgcn_sin: |
4449 | case Intrinsic::amdgcn_cos: |
4450 | case Intrinsic::amdgcn_log_clamp: |
4451 | case Intrinsic::amdgcn_rcp_legacy: |
4452 | case Intrinsic::amdgcn_rsq_legacy: |
4453 | case Intrinsic::amdgcn_rsq_clamp: |
4454 | case Intrinsic::amdgcn_fmul_legacy: |
4455 | case Intrinsic::amdgcn_fma_legacy: |
4456 | case Intrinsic::amdgcn_frexp_mant: |
4457 | case Intrinsic::amdgcn_frexp_exp: |
4458 | case Intrinsic::amdgcn_fract: |
4459 | case Intrinsic::amdgcn_cvt_pknorm_i16: |
4460 | case Intrinsic::amdgcn_cvt_pknorm_u16: |
4461 | case Intrinsic::amdgcn_cvt_pk_i16: |
4462 | case Intrinsic::amdgcn_cvt_pk_u16: |
4463 | case Intrinsic::amdgcn_fmed3: |
4464 | case Intrinsic::amdgcn_cubeid: |
4465 | case Intrinsic::amdgcn_cubema: |
4466 | case Intrinsic::amdgcn_cubesc: |
4467 | case Intrinsic::amdgcn_cubetc: |
4468 | case Intrinsic::amdgcn_sffbh: |
4469 | case Intrinsic::amdgcn_fmad_ftz: |
4470 | case Intrinsic::amdgcn_mbcnt_lo: |
4471 | case Intrinsic::amdgcn_mbcnt_hi: |
4472 | case Intrinsic::amdgcn_mul_u24: |
4473 | case Intrinsic::amdgcn_mul_i24: |
4474 | case Intrinsic::amdgcn_mulhi_u24: |
4475 | case Intrinsic::amdgcn_mulhi_i24: |
4476 | case Intrinsic::amdgcn_lerp: |
4477 | case Intrinsic::amdgcn_sad_u8: |
4478 | case Intrinsic::amdgcn_msad_u8: |
4479 | case Intrinsic::amdgcn_sad_hi_u8: |
4480 | case Intrinsic::amdgcn_sad_u16: |
4481 | case Intrinsic::amdgcn_qsad_pk_u16_u8: |
4482 | case Intrinsic::amdgcn_mqsad_pk_u16_u8: |
4483 | case Intrinsic::amdgcn_mqsad_u32_u8: |
4484 | case Intrinsic::amdgcn_cvt_pk_u8_f32: |
4485 | case Intrinsic::amdgcn_alignbyte: |
4486 | case Intrinsic::amdgcn_perm: |
4487 | case Intrinsic::amdgcn_fdot2: |
4488 | case Intrinsic::amdgcn_sdot2: |
4489 | case Intrinsic::amdgcn_udot2: |
4490 | case Intrinsic::amdgcn_sdot4: |
4491 | case Intrinsic::amdgcn_udot4: |
4492 | case Intrinsic::amdgcn_sdot8: |
4493 | case Intrinsic::amdgcn_udot8: |
4494 | case Intrinsic::amdgcn_fdot2_bf16_bf16: |
4495 | case Intrinsic::amdgcn_fdot2_f16_f16: |
4496 | case Intrinsic::amdgcn_fdot2_f32_bf16: |
4497 | case Intrinsic::amdgcn_sudot4: |
4498 | case Intrinsic::amdgcn_sudot8: |
4499 | case Intrinsic::amdgcn_dot4_f32_fp8_bf8: |
4500 | case Intrinsic::amdgcn_dot4_f32_bf8_fp8: |
4501 | case Intrinsic::amdgcn_dot4_f32_fp8_fp8: |
4502 | case Intrinsic::amdgcn_dot4_f32_bf8_bf8: |
4503 | case Intrinsic::amdgcn_cvt_f32_fp8: |
4504 | case Intrinsic::amdgcn_cvt_f32_bf8: |
4505 | case Intrinsic::amdgcn_cvt_pk_f32_fp8: |
4506 | case Intrinsic::amdgcn_cvt_pk_f32_bf8: |
4507 | case Intrinsic::amdgcn_cvt_pk_fp8_f32: |
4508 | case Intrinsic::amdgcn_cvt_pk_bf8_f32: |
4509 | case Intrinsic::amdgcn_cvt_sr_fp8_f32: |
4510 | case Intrinsic::amdgcn_cvt_sr_bf8_f32: |
4511 | case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: |
4512 | case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: |
4513 | case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: |
4514 | case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied: |
4515 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: |
4516 | case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: |
4517 | case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: |
4518 | case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: |
4519 | case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: |
4520 | case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: |
4521 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: |
4522 | case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: |
4523 | case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: |
4524 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: |
4525 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: |
4526 | case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: |
4527 | case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: |
4528 | case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: |
4529 | case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: |
4530 | case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: |
4531 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: |
4532 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: |
4533 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: |
4534 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: |
4535 | return getDefaultMappingVOP(MI); |
4536 | case Intrinsic::amdgcn_log: |
4537 | case Intrinsic::amdgcn_exp2: |
4538 | case Intrinsic::amdgcn_rcp: |
4539 | case Intrinsic::amdgcn_rsq: |
4540 | case Intrinsic::amdgcn_sqrt: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4542 | if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && |
4543 | isSALUMapping(MI)) |
4544 | return getDefaultMappingSOP(MI); |
4545 | return getDefaultMappingVOP(MI); |
4546 | } |
4547 | case Intrinsic::amdgcn_sbfe: |
4548 | case Intrinsic::amdgcn_ubfe: |
4549 | if (isSALUMapping(MI)) |
4550 | return getDefaultMappingSOP(MI); |
4551 | return getDefaultMappingVOP(MI); |
4552 | case Intrinsic::amdgcn_ds_swizzle: |
4553 | case Intrinsic::amdgcn_ds_permute: |
4554 | case Intrinsic::amdgcn_ds_bpermute: |
4555 | case Intrinsic::amdgcn_update_dpp: |
4556 | case Intrinsic::amdgcn_mov_dpp8: |
4557 | case Intrinsic::amdgcn_mov_dpp: |
4558 | case Intrinsic::amdgcn_strict_wwm: |
4559 | case Intrinsic::amdgcn_wwm: |
4560 | case Intrinsic::amdgcn_strict_wqm: |
4561 | case Intrinsic::amdgcn_wqm: |
4562 | case Intrinsic::amdgcn_softwqm: |
4563 | case Intrinsic::amdgcn_set_inactive: |
4564 | case Intrinsic::amdgcn_set_inactive_chain_arg: |
4565 | case Intrinsic::amdgcn_permlane64: |
4566 | return getDefaultMappingAllVGPR(MI); |
4567 | case Intrinsic::amdgcn_cvt_pkrtz: |
4568 | if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI)) |
4569 | return getDefaultMappingSOP(MI); |
4570 | return getDefaultMappingVOP(MI); |
4571 | case Intrinsic::amdgcn_kernarg_segment_ptr: |
4572 | case Intrinsic::amdgcn_s_getpc: |
4573 | case Intrinsic::amdgcn_groupstaticsize: |
4574 | case Intrinsic::amdgcn_reloc_constant: |
4575 | case Intrinsic::returnaddress: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4577 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4578 | break; |
4579 | } |
4580 | case Intrinsic::amdgcn_wqm_vote: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4582 | OpdsMapping[0] = OpdsMapping[2] |
4583 | = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); |
4584 | break; |
4585 | } |
4586 | case Intrinsic::amdgcn_ps_live: { |
4587 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4588 | break; |
4589 | } |
4590 | case Intrinsic::amdgcn_div_scale: { |
unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4593 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); |
4594 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); |
4595 | |
unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4597 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
4598 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
4599 | break; |
4600 | } |
4601 | case Intrinsic::amdgcn_class: { |
Register Src0Reg = MI.getOperand(2).getReg();
Register Src1Reg = MI.getOperand(3).getReg();
unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4607 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); |
4608 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); |
4609 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); |
4610 | break; |
4611 | } |
4612 | case Intrinsic::amdgcn_icmp: |
4613 | case Intrinsic::amdgcn_fcmp: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
// This is not VCCRegBank because this is not used in boolean contexts.
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4618 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); |
4619 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); |
4620 | break; |
4621 | } |
4622 | case Intrinsic::amdgcn_readlane: { |
4623 | // This must be an SGPR, but accept a VGPR. |
Register IdxReg = MI.getOperand(3).getReg();
unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4628 | [[fallthrough]]; |
4629 | } |
4630 | case Intrinsic::amdgcn_readfirstlane: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4633 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); |
4634 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
4635 | break; |
4636 | } |
4637 | case Intrinsic::amdgcn_writelane: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
Register SrcReg = MI.getOperand(2).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
Register IdxReg = MI.getOperand(3).getReg();
unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4644 | unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); |
4645 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); |
4646 | |
4647 | // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted |
4648 | // to legalize. |
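// E.g. (illustrative) a VGPR source here is legalized roughly as
//   %sgpr_src:sgpr(s32) = V_READFIRSTLANE_B32 %vgpr_src:vgpr(s32)
// before the lane write is selected.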
OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4651 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); |
4652 | break; |
4653 | } |
4654 | case Intrinsic::amdgcn_if_break: { |
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4656 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4657 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4658 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4659 | break; |
4660 | } |
4661 | case Intrinsic::amdgcn_permlane16: |
4662 | case Intrinsic::amdgcn_permlanex16: { |
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4664 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4665 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4666 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4669 | break; |
4670 | } |
4671 | case Intrinsic::amdgcn_permlane16_var: |
4672 | case Intrinsic::amdgcn_permlanex16_var: { |
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4674 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4675 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4676 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4677 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4678 | break; |
4679 | } |
4680 | case Intrinsic::amdgcn_mfma_f32_4x4x1f32: |
4681 | case Intrinsic::amdgcn_mfma_f32_4x4x4f16: |
4682 | case Intrinsic::amdgcn_mfma_i32_4x4x4i8: |
4683 | case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: |
4684 | case Intrinsic::amdgcn_mfma_f32_16x16x1f32: |
4685 | case Intrinsic::amdgcn_mfma_f32_16x16x4f32: |
4686 | case Intrinsic::amdgcn_mfma_f32_16x16x4f16: |
4687 | case Intrinsic::amdgcn_mfma_f32_16x16x16f16: |
4688 | case Intrinsic::amdgcn_mfma_i32_16x16x4i8: |
4689 | case Intrinsic::amdgcn_mfma_i32_16x16x16i8: |
4690 | case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: |
4691 | case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: |
4692 | case Intrinsic::amdgcn_mfma_f32_32x32x1f32: |
4693 | case Intrinsic::amdgcn_mfma_f32_32x32x2f32: |
4694 | case Intrinsic::amdgcn_mfma_f32_32x32x4f16: |
4695 | case Intrinsic::amdgcn_mfma_f32_32x32x8f16: |
4696 | case Intrinsic::amdgcn_mfma_i32_32x32x4i8: |
4697 | case Intrinsic::amdgcn_mfma_i32_32x32x8i8: |
4698 | case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: |
4699 | case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: |
4700 | case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: |
4701 | case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: |
4702 | case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: |
4703 | case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: |
4704 | case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: |
4705 | case Intrinsic::amdgcn_mfma_f64_16x16x4f64: |
4706 | case Intrinsic::amdgcn_mfma_f64_4x4x4f64: |
4707 | case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: |
4708 | case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: |
4709 | case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: |
4710 | case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: |
4711 | case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8: |
4712 | case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8: |
4713 | case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8: |
4714 | case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8: |
4715 | case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: |
4716 | case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: |
4717 | case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: |
4718 | case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: { |
4719 | // Default for MAI intrinsics. |
4720 | // srcC can also be an immediate which can be folded later. |
4721 | // FIXME: Should we eventually add an alternative mapping with AGPR src |
4722 | // for srcA/srcB? |
4723 | // |
4724 | // vdst, srcA, srcB, srcC |
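// Illustrative trade-off: when mayNeedAGPRs() is false, keeping vdst/srcC
// in VGPRs avoids AGPR<->VGPR copies around the MFMA; otherwise the
// accumulator operands must be placed in the AGPR bank.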
4725 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
OpdsMapping[0] =
Info->mayNeedAGPRs()
? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
: getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] =
Info->mayNeedAGPRs()
? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
: getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4736 | break; |
4737 | } |
4738 | case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: |
4739 | case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: |
4740 | case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: |
4741 | case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: |
4742 | case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: |
4743 | case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: |
4744 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: |
4745 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: |
4746 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: |
4747 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: |
4748 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: |
4749 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: |
4750 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: |
4751 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: { |
4752 | // vdst, srcA, srcB, srcC, idx |
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4758 | break; |
4759 | } |
4760 | case Intrinsic::amdgcn_interp_p1: |
4761 | case Intrinsic::amdgcn_interp_p2: |
4762 | case Intrinsic::amdgcn_interp_mov: |
4763 | case Intrinsic::amdgcn_interp_p1_f16: |
4764 | case Intrinsic::amdgcn_interp_p2_f16: |
4765 | case Intrinsic::amdgcn_lds_param_load: { |
4766 | const int M0Idx = MI.getNumOperands() - 1; |
Register M0Reg = MI.getOperand(M0Idx).getReg();
unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4770 | |
4771 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); |
4772 | for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) |
4773 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4774 | |
4775 | // Must be SGPR, but we must take whatever the original bank is and fix it |
4776 | // later. |
4777 | OpdsMapping[M0Idx] = AMDGPU::getValueMapping(BankID: M0Bank, Size: 32); |
4778 | break; |
4779 | } |
4780 | case Intrinsic::amdgcn_interp_inreg_p10: |
4781 | case Intrinsic::amdgcn_interp_inreg_p2: |
4782 | case Intrinsic::amdgcn_interp_inreg_p10_f16: |
4783 | case Intrinsic::amdgcn_interp_inreg_p2_f16: |
4784 | case Intrinsic::amdgcn_interp_p10_rtz_f16: |
4785 | case Intrinsic::amdgcn_interp_p2_rtz_f16: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4787 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); |
4788 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4789 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4790 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4791 | break; |
4792 | } |
4793 | case Intrinsic::amdgcn_ballot: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4796 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); |
4797 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); |
4798 | break; |
4799 | } |
4800 | case Intrinsic::amdgcn_inverse_ballot: { |
4801 | // This must be an SGPR, but accept a VGPR. |
Register MaskReg = MI.getOperand(2).getReg();
unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4807 | break; |
4808 | } |
4809 | case Intrinsic::amdgcn_s_quadmask: |
4810 | case Intrinsic::amdgcn_s_wqm: { |
Register MaskReg = MI.getOperand(2).getReg();
unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4816 | break; |
4817 | } |
4818 | case Intrinsic::amdgcn_wave_reduce_umin: |
4819 | case Intrinsic::amdgcn_wave_reduce_umax: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
auto regBankID =
isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
4826 | break; |
4827 | } |
4828 | case Intrinsic::amdgcn_s_bitreplicate: |
Register MaskReg = MI.getOperand(2).getReg();
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
4833 | } |
4834 | break; |
4835 | } |
4836 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: |
4837 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: |
4838 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: |
4839 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { |
auto IntrID = AMDGPU::getIntrinsicID(MI);
const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4843 | // Non-images can have complications from operands that allow both SGPR |
4844 | // and VGPR. For now it's too complicated to figure out the final opcode |
4845 | // to derive the register bank from the MCInstrDesc. |
4846 | assert(RSrcIntrin->IsImage); |
return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4848 | } |
4849 | case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { |
4850 | unsigned N = MI.getNumExplicitOperands() - 2; |
4851 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); |
OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4853 | if (N == 3) { |
4854 | // Sequential form: all operands combined into VGPR256/VGPR512 |
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4856 | if (Size > 256) |
4857 | Size = 512; |
4858 | OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4859 | } else { |
4860 | // NSA form |
4861 | for (unsigned I = 2; I < N; ++I) { |
unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
4863 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
4864 | } |
4865 | } |
4866 | break; |
4867 | } |
4868 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
4869 | case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { |
auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
4871 | switch (IntrID) { |
4872 | case Intrinsic::amdgcn_s_getreg: |
4873 | case Intrinsic::amdgcn_s_memtime: |
4874 | case Intrinsic::amdgcn_s_memrealtime: |
4875 | case Intrinsic::amdgcn_s_get_waveid_in_workgroup: |
4876 | case Intrinsic::amdgcn_s_sendmsg_rtn: { |
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4878 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4879 | break; |
4880 | } |
4881 | case Intrinsic::amdgcn_global_atomic_fadd: |
4882 | case Intrinsic::amdgcn_global_atomic_csub: |
4883 | case Intrinsic::amdgcn_global_atomic_fmin: |
4884 | case Intrinsic::amdgcn_global_atomic_fmax: |
4885 | case Intrinsic::amdgcn_global_atomic_fmin_num: |
4886 | case Intrinsic::amdgcn_global_atomic_fmax_num: |
4887 | case Intrinsic::amdgcn_flat_atomic_fadd: |
4888 | case Intrinsic::amdgcn_flat_atomic_fmin: |
4889 | case Intrinsic::amdgcn_flat_atomic_fmax: |
4890 | case Intrinsic::amdgcn_flat_atomic_fmin_num: |
4891 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
4892 | case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: |
4893 | case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: |
4894 | case Intrinsic::amdgcn_atomic_cond_sub_u32: |
4895 | case Intrinsic::amdgcn_global_atomic_ordered_add_b64: |
4896 | case Intrinsic::amdgcn_global_load_tr_b64: |
4897 | case Intrinsic::amdgcn_global_load_tr_b128: |
4898 | return getDefaultMappingAllVGPR(MI); |
4899 | case Intrinsic::amdgcn_ds_ordered_add: |
4900 | case Intrinsic::amdgcn_ds_ordered_swap: |
4901 | case Intrinsic::amdgcn_ds_fadd_v2bf16: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
AMDGPU::SGPRRegBankID);
OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4907 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4908 | break; |
4909 | } |
4910 | case Intrinsic::amdgcn_ds_append: |
4911 | case Intrinsic::amdgcn_ds_consume: { |
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4915 | break; |
4916 | } |
4917 | case Intrinsic::amdgcn_exp_compr: |
4918 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4919 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4920 | break; |
4921 | case Intrinsic::amdgcn_exp: |
4922 | // FIXME: Could we support packed types here? |
4923 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4924 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4925 | OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4926 | OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4927 | break; |
4928 | case Intrinsic::amdgcn_exp_row: |
4929 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4930 | OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4931 | OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
4932 | OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4934 | break; |
4935 | case Intrinsic::amdgcn_s_sendmsg: |
4936 | case Intrinsic::amdgcn_s_sendmsghalt: { |
4937 | // This must be an SGPR, but accept a VGPR. |
4938 | unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, |
4939 | AMDGPU::SGPRRegBankID); |
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4941 | break; |
4942 | } |
4943 | case Intrinsic::amdgcn_s_setreg: { |
4944 | // This must be an SGPR, but accept a VGPR. |
4945 | unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, |
4946 | AMDGPU::SGPRRegBankID); |
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4948 | break; |
4949 | } |
4950 | case Intrinsic::amdgcn_s_ttracedata: { |
4951 | // This must be an SGPR, but accept a VGPR. |
4952 | unsigned Bank = |
4953 | getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID); |
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4955 | break; |
4956 | } |
4957 | case Intrinsic::amdgcn_end_cf: { |
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4959 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
4960 | break; |
4961 | } |
4962 | case Intrinsic::amdgcn_else: { |
unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4964 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4965 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); |
4966 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); |
4967 | break; |
4968 | } |
4969 | case Intrinsic::amdgcn_live_mask: { |
4970 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4971 | break; |
4972 | } |
4973 | case Intrinsic::amdgcn_wqm_demote: |
4974 | case Intrinsic::amdgcn_kill: { |
4975 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
4976 | break; |
4977 | } |
4978 | case Intrinsic::amdgcn_raw_buffer_load: |
4979 | case Intrinsic::amdgcn_raw_ptr_buffer_load: |
4980 | case Intrinsic::amdgcn_raw_tbuffer_load: |
4981 | case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { |
4982 | // FIXME: Should make intrinsic ID the last operand of the instruction, |
4983 | // then this would be the same as store |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4988 | break; |
4989 | } |
4990 | case Intrinsic::amdgcn_raw_buffer_load_lds: |
4991 | case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { |
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4996 | break; |
4997 | } |
4998 | case Intrinsic::amdgcn_raw_buffer_store: |
4999 | case Intrinsic::amdgcn_raw_ptr_buffer_store: |
5000 | case Intrinsic::amdgcn_raw_buffer_store_format: |
5001 | case Intrinsic::amdgcn_raw_ptr_buffer_store_format: |
5002 | case Intrinsic::amdgcn_raw_tbuffer_store: |
5003 | case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { |
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
5008 | break; |
5009 | } |
5010 | case Intrinsic::amdgcn_struct_buffer_load: |
5011 | case Intrinsic::amdgcn_struct_ptr_buffer_load: |
5012 | case Intrinsic::amdgcn_struct_tbuffer_load: |
5013 | case Intrinsic::amdgcn_struct_ptr_tbuffer_load: { |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5019 | break; |
5020 | } |
5021 | case Intrinsic::amdgcn_struct_buffer_load_lds: |
5022 | case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { |
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
5028 | break; |
5029 | } |
5030 | case Intrinsic::amdgcn_struct_buffer_store: |
5031 | case Intrinsic::amdgcn_struct_ptr_buffer_store: |
5032 | case Intrinsic::amdgcn_struct_tbuffer_store: |
5033 | case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { |
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
5039 | break; |
5040 | } |
5041 | case Intrinsic::amdgcn_init_exec_from_input: { |
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
5043 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
5044 | break; |
5045 | } |
5046 | case Intrinsic::amdgcn_ds_gws_init: |
5047 | case Intrinsic::amdgcn_ds_gws_barrier: |
5048 | case Intrinsic::amdgcn_ds_gws_sema_br: { |
5049 | OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
5050 | |
5051 | // This must be an SGPR, but accept a VGPR. |
5052 | unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, |
5053 | AMDGPU::SGPRRegBankID); |
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
5055 | break; |
5056 | } |
5057 | case Intrinsic::amdgcn_ds_gws_sema_v: |
5058 | case Intrinsic::amdgcn_ds_gws_sema_p: |
5059 | case Intrinsic::amdgcn_ds_gws_sema_release_all: { |
5060 | // This must be an SGPR, but accept a VGPR. |
5061 | unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, |
5062 | AMDGPU::SGPRRegBankID); |
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
5064 | break; |
5065 | } |
5066 | case Intrinsic::amdgcn_global_load_lds: { |
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5069 | break; |
5070 | } |
5071 | case Intrinsic::amdgcn_lds_direct_load: { |
5072 | const int M0Idx = MI.getNumOperands() - 1; |
Register M0Reg = MI.getOperand(M0Idx).getReg();
unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
5076 | |
5077 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); |
5078 | for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) |
5079 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); |
5080 | |
5081 | // Must be SGPR, but we must take whatever the original bank is and fix it |
5082 | // later. |
OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
5084 | break; |
5085 | } |
5086 | case Intrinsic::amdgcn_ds_add_gs_reg_rtn: |
5087 | case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: |
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5090 | break; |
5091 | case Intrinsic::amdgcn_ds_bvh_stack_rtn: { |
OpdsMapping[0] =
getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
OpdsMapping[1] =
getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
OpdsMapping[3] =
getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
OpdsMapping[4] =
getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
OpdsMapping[5] =
getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
5102 | break; |
5103 | } |
5104 | case Intrinsic::amdgcn_s_sleep_var: |
5105 | OpdsMapping[1] = getSGPROpMapping(MI.getOperand(i: 1).getReg(), MRI, *TRI); |
5106 | break; |
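    // Named barrier operands (the barrier ID, and the member count for
    // s_barrier_init) must be uniform, so map them to SGPRs.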
    case Intrinsic::amdgcn_s_barrier_signal_var:
    case Intrinsic::amdgcn_s_barrier_join:
    case Intrinsic::amdgcn_s_wakeup_barrier:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_barrier_init:
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
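    // The "is first" and "leave" results are uniform 1-bit values, so
    // report them in the SGPR bank, consistent with the SALU boolean rules.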
    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    case Intrinsic::amdgcn_s_barrier_leave: {
      const unsigned ResultSize = 1;
      OpdsMapping[0] =
          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
      break;
    }
    case Intrinsic::amdgcn_s_get_barrier_state: {
      OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
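  // For G_SELECT, the condition bank depends on the value operands: an
  // all-SGPR select can use a scalar condition, while any VGPR involvement
  // forces a VCC condition and VGPR results, e.g.
  //   %d:vgpr(s32) = G_SELECT %c:vcc(s1), %a:vgpr(s32), %b:vgpr(s32)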
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

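  // Operand 0 of G_SI_CALL defs the 64-bit SGPR pair holding the return
  // address, and the callee pointer must also end up in SGPRs.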
  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments.
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

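  // Atomics produce and consume their values in VGPRs. The pointer keeps
  // its original bank: getValueMappingForPtr only preserves an SGPR base
  // where the addressing mode supports one.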
  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMICRMW_UINC_WRAP:
  case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
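  // A branch condition is either a scalar condition (SGPR bank, read from
  // SCC) or a per-lane VCC mask; a plain VGPR boolean is normalized to VCC.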
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
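  // Prefetches are scalar instructions, so the address must be an SGPR.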
  case AMDGPU::G_PREFETCH:
    OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    break;
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
