//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}

// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}

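// Return true if Reg is a 1-bit value that is selected to a wave-sized
// condition-register (VCC bank) value rather than to an ordinary scalar.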
bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

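// Turn a copy-like intrinsic (e.g. wqm, softwqm, strict.wwm, strict.wqm) into
// the pseudo NewOpc, dropping the intrinsic ID operand and constraining the
// source and destination to matching register classes.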
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

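// Lower a generic COPY. Copies producing a VCC (wave mask) value from a
// non-VCC source may need to materialize the condition, either as a mask
// constant or by masking the low bit and comparing against zero.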
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        bool IsSGPR = TRI.isSGPRClass(SrcRC);
        unsigned AndOpc =
            IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
        auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
            .addImm(1)
            .addReg(SrcReg);
        if (IsSGPR)
          And.setOperandDead(3); // Dead scc

        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
            .addImm(0)
            .addReg(MaskedReg);
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
  if (DefTy == LLT::scalar(1))
    return false;

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // TODO: Verify that all registers have the same bank
  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

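// Return a 32-bit half (sub0 or sub1) of a 64-bit operand. Register operands
// are copied into a fresh virtual register of class SubRC; immediates are
// split numerically.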
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
            .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

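// Select scalar G_AND/G_OR/G_XOR into the SALU bitwise instructions. Other
// register banks are left to the TableGen-generated patterns.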
bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

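// Select G_ADD/G_SUB. A 32-bit operation maps onto a single SALU or VALU
// instruction; a 64-bit add is split into a low half producing a carry and a
// high half consuming it, then recombined with a REG_SEQUENCE.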
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
          .add(I.getOperand(1))
          .add(I.getOperand(2))
          .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
          .addDef(UnusedCarry, RegState::Dead)
          .add(I.getOperand(1))
          .add(I.getOperand(2))
          .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2)
      .setOperandDead(3); // Dead scc
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

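// Select the explicit carry-out (and carry-in) add/sub opcodes. A VCC carry
// output selects the VALU carry instructions; otherwise the SALU forms are
// used with the carry threaded through SCC.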
bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
      .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

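// Select a merge of 32-bit or wider pieces into a REG_SEQUENCE; narrower
// sources fall back to the TableGen-generated patterns.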
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

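// Select G_UNMERGE_VALUES by copying each result out of the corresponding
// subregister of the source.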
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

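// Select G_INSERT as an INSERT_SUBREG when the offset and size correspond to
// a 32-bit aligned subregister.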
bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

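// Select the remaining 32-bit vector G_SBFX/G_UBFX as V_BFE; the scalar and
// 64-bit forms were already expanded in regbankselect.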
bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

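// Manually select llvm.amdgcn.interp.p1.f16 for subtargets with 16 LDS
// banks; all other configurations go through the generated patterns.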
bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  std::optional<ValueAndVReg> ConstSelect =
      getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    std::optional<ValueAndVReg> ConstVal =
        getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_inverse_ballot:
    return selectInverseBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
    return selectSMFMACIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

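// Map a compare predicate and operand size (16, 32 or 64 bits) to the
// matching VALU V_CMP opcode, or -1 if the subtarget has no suitable
// instruction.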
1089 | static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, |
1090 | const GCNSubtarget &ST) { |
1091 | if (Size != 16 && Size != 32 && Size != 64) |
1092 | return -1; |
1093 | |
1094 | if (Size == 16 && !ST.has16BitInsts()) |
1095 | return -1; |
1096 | |
1097 | const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc, |
1098 | unsigned S64Opc) { |
1099 | if (Size == 16) |
1100 | return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc; |
1101 | if (Size == 32) |
1102 | return S32Opc; |
1103 | return S64Opc; |
1104 | }; |
1105 | |
1106 | switch (P) { |
1107 | default: |
1108 | llvm_unreachable("Unknown condition code!" ); |
1109 | case CmpInst::ICMP_NE: |
1110 | return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64, |
1111 | AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64); |
1112 | case CmpInst::ICMP_EQ: |
1113 | return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64, |
1114 | AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64); |
1115 | case CmpInst::ICMP_SGT: |
1116 | return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64, |
1117 | AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64); |
1118 | case CmpInst::ICMP_SGE: |
1119 | return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64, |
1120 | AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64); |
1121 | case CmpInst::ICMP_SLT: |
1122 | return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64, |
1123 | AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64); |
1124 | case CmpInst::ICMP_SLE: |
1125 | return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64, |
1126 | AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64); |
1127 | case CmpInst::ICMP_UGT: |
1128 | return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64, |
1129 | AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64); |
1130 | case CmpInst::ICMP_UGE: |
1131 | return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64, |
1132 | AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64); |
1133 | case CmpInst::ICMP_ULT: |
1134 | return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64, |
1135 | AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64); |
1136 | case CmpInst::ICMP_ULE: |
1137 | return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64, |
1138 | AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64); |
1139 | |
1140 | case CmpInst::FCMP_OEQ: |
1141 | return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64, |
1142 | AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64); |
1143 | case CmpInst::FCMP_OGT: |
1144 | return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64, |
1145 | AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64); |
1146 | case CmpInst::FCMP_OGE: |
1147 | return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64, |
1148 | AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64); |
1149 | case CmpInst::FCMP_OLT: |
1150 | return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64, |
1151 | AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64); |
1152 | case CmpInst::FCMP_OLE: |
1153 | return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64, |
1154 | AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64); |
1155 | case CmpInst::FCMP_ONE: |
1156 | return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64, |
1157 | AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64); |
1158 | case CmpInst::FCMP_ORD: |
1159 | return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64, |
1160 | AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64); |
1161 | case CmpInst::FCMP_UNO: |
1162 | return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64, |
1163 | AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64); |
1164 | case CmpInst::FCMP_UEQ: |
1165 | return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64, |
1166 | AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64); |
1167 | case CmpInst::FCMP_UGT: |
1168 | return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64, |
1169 | AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64); |
1170 | case CmpInst::FCMP_UGE: |
1171 | return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64, |
1172 | AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64); |
1173 | case CmpInst::FCMP_ULT: |
1174 | return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64, |
1175 | AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64); |
1176 | case CmpInst::FCMP_ULE: |
1177 | return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64, |
1178 | AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64); |
1179 | case CmpInst::FCMP_UNE: |
1180 | return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64, |
1181 | AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64); |
1182 | case CmpInst::FCMP_TRUE: |
1183 | return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64, |
1184 | AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64); |
1185 | case CmpInst::FCMP_FALSE: |
1186 | return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64, |
1187 | AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64); |
1188 | } |
1189 | } |
1190 | |
1191 | int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, |
1192 | unsigned Size) const { |
1193 | if (Size == 64) { |
1194 | if (!STI.hasScalarCompareEq64()) |
1195 | return -1; |
1196 | |
1197 | switch (P) { |
1198 | case CmpInst::ICMP_NE: |
1199 | return AMDGPU::S_CMP_LG_U64; |
1200 | case CmpInst::ICMP_EQ: |
1201 | return AMDGPU::S_CMP_EQ_U64; |
1202 | default: |
1203 | return -1; |
1204 | } |
1205 | } |
1206 | |
1207 | if (Size == 32) { |
1208 | switch (P) { |
1209 | case CmpInst::ICMP_NE: |
1210 | return AMDGPU::S_CMP_LG_U32; |
1211 | case CmpInst::ICMP_EQ: |
1212 | return AMDGPU::S_CMP_EQ_U32; |
1213 | case CmpInst::ICMP_SGT: |
1214 | return AMDGPU::S_CMP_GT_I32; |
1215 | case CmpInst::ICMP_SGE: |
1216 | return AMDGPU::S_CMP_GE_I32; |
1217 | case CmpInst::ICMP_SLT: |
1218 | return AMDGPU::S_CMP_LT_I32; |
1219 | case CmpInst::ICMP_SLE: |
1220 | return AMDGPU::S_CMP_LE_I32; |
1221 | case CmpInst::ICMP_UGT: |
1222 | return AMDGPU::S_CMP_GT_U32; |
1223 | case CmpInst::ICMP_UGE: |
1224 | return AMDGPU::S_CMP_GE_U32; |
1225 | case CmpInst::ICMP_ULT: |
1226 | return AMDGPU::S_CMP_LT_U32; |
1227 | case CmpInst::ICMP_ULE: |
1228 | return AMDGPU::S_CMP_LE_U32; |
1229 | case CmpInst::FCMP_OEQ: |
1230 | return AMDGPU::S_CMP_EQ_F32; |
1231 | case CmpInst::FCMP_OGT: |
1232 | return AMDGPU::S_CMP_GT_F32; |
1233 | case CmpInst::FCMP_OGE: |
1234 | return AMDGPU::S_CMP_GE_F32; |
1235 | case CmpInst::FCMP_OLT: |
1236 | return AMDGPU::S_CMP_LT_F32; |
1237 | case CmpInst::FCMP_OLE: |
1238 | return AMDGPU::S_CMP_LE_F32; |
1239 | case CmpInst::FCMP_ONE: |
1240 | return AMDGPU::S_CMP_LG_F32; |
1241 | case CmpInst::FCMP_ORD: |
1242 | return AMDGPU::S_CMP_O_F32; |
1243 | case CmpInst::FCMP_UNO: |
1244 | return AMDGPU::S_CMP_U_F32; |
1245 | case CmpInst::FCMP_UEQ: |
1246 | return AMDGPU::S_CMP_NLG_F32; |
1247 | case CmpInst::FCMP_UGT: |
1248 | return AMDGPU::S_CMP_NLE_F32; |
1249 | case CmpInst::FCMP_UGE: |
1250 | return AMDGPU::S_CMP_NLT_F32; |
1251 | case CmpInst::FCMP_ULT: |
1252 | return AMDGPU::S_CMP_NGE_F32; |
1253 | case CmpInst::FCMP_ULE: |
1254 | return AMDGPU::S_CMP_NGT_F32; |
1255 | case CmpInst::FCMP_UNE: |
1256 | return AMDGPU::S_CMP_NEQ_F32; |
1257 | default: |
1258 | llvm_unreachable("Unknown condition code!" ); |
1259 | } |
1260 | } |
1261 | |
1262 | if (Size == 16) { |
1263 | if (!STI.hasSALUFloatInsts()) |
1264 | return -1; |
1265 | |
1266 | switch (P) { |
1267 | case CmpInst::FCMP_OEQ: |
1268 | return AMDGPU::S_CMP_EQ_F16; |
1269 | case CmpInst::FCMP_OGT: |
1270 | return AMDGPU::S_CMP_GT_F16; |
1271 | case CmpInst::FCMP_OGE: |
1272 | return AMDGPU::S_CMP_GE_F16; |
1273 | case CmpInst::FCMP_OLT: |
1274 | return AMDGPU::S_CMP_LT_F16; |
1275 | case CmpInst::FCMP_OLE: |
1276 | return AMDGPU::S_CMP_LE_F16; |
1277 | case CmpInst::FCMP_ONE: |
1278 | return AMDGPU::S_CMP_LG_F16; |
1279 | case CmpInst::FCMP_ORD: |
1280 | return AMDGPU::S_CMP_O_F16; |
1281 | case CmpInst::FCMP_UNO: |
1282 | return AMDGPU::S_CMP_U_F16; |
1283 | case CmpInst::FCMP_UEQ: |
1284 | return AMDGPU::S_CMP_NLG_F16; |
1285 | case CmpInst::FCMP_UGT: |
1286 | return AMDGPU::S_CMP_NLE_F16; |
1287 | case CmpInst::FCMP_UGE: |
1288 | return AMDGPU::S_CMP_NLT_F16; |
1289 | case CmpInst::FCMP_ULT: |
1290 | return AMDGPU::S_CMP_NGE_F16; |
1291 | case CmpInst::FCMP_ULE: |
1292 | return AMDGPU::S_CMP_NGT_F16; |
1293 | case CmpInst::FCMP_UNE: |
1294 | return AMDGPU::S_CMP_NEQ_F16; |
1295 | default: |
1296 | llvm_unreachable("Unknown condition code!" ); |
1297 | } |
1298 | } |
1299 | |
1300 | return -1; |
1301 | } |
1302 | |
1303 | bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const { |
1304 | |
1305 | MachineBasicBlock *BB = I.getParent(); |
1306 | const DebugLoc &DL = I.getDebugLoc(); |
1307 | |
1308 | Register SrcReg = I.getOperand(i: 2).getReg(); |
1309 | unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); |
1310 | |
1311 | auto Pred = (CmpInst::Predicate)I.getOperand(i: 1).getPredicate(); |
1312 | |
1313 | Register CCReg = I.getOperand(i: 0).getReg(); |
1314 | if (!isVCC(Reg: CCReg, MRI: *MRI)) { |
1315 | int Opcode = getS_CMPOpcode(P: Pred, Size); |
1316 | if (Opcode == -1) |
1317 | return false; |
1318 | MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) |
1319 | .add(I.getOperand(i: 2)) |
1320 | .add(I.getOperand(i: 3)); |
1321 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) |
1322 | .addReg(AMDGPU::SCC); |
1323 | bool Ret = |
1324 | constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && |
1325 | RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); |
1326 | I.eraseFromParent(); |
1327 | return Ret; |
1328 | } |
1329 | |
1330 | if (I.getOpcode() == AMDGPU::G_FCMP) |
1331 | return false; |
1332 | |
1333 | int Opcode = getV_CMPOpcode(P: Pred, Size, ST: *Subtarget); |
1334 | if (Opcode == -1) |
1335 | return false; |
1336 | |
1337 | MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), |
1338 | I.getOperand(i: 0).getReg()) |
1339 | .add(I.getOperand(i: 2)) |
1340 | .add(I.getOperand(i: 3)); |
1341 | RBI.constrainGenericRegister(Reg: ICmp->getOperand(i: 0).getReg(), |
1342 | RC: *TRI.getBoolRC(), MRI&: *MRI); |
1343 | bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); |
1344 | I.eraseFromParent(); |
1345 | return Ret; |
1346 | } |
1347 | |
1348 | bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { |
1349 | Register Dst = I.getOperand(i: 0).getReg(); |
1350 | if (isVCC(Reg: Dst, MRI: *MRI)) |
1351 | return false; |
1352 | |
1353 | LLT DstTy = MRI->getType(Reg: Dst); |
1354 | if (DstTy.getSizeInBits() != STI.getWavefrontSize()) |
1355 | return false; |
1356 | |
1357 | MachineBasicBlock *BB = I.getParent(); |
1358 | const DebugLoc &DL = I.getDebugLoc(); |
1359 | Register SrcReg = I.getOperand(i: 2).getReg(); |
1360 | unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); |
1361 | |
1362 | // i1 inputs are not supported in GlobalISel. |
1363 | if (Size == 1) |
1364 | return false; |
1365 | |
  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
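    // An unrecognized predicate produces an undefined result, so an
    // IMPLICIT_DEF of the destination is sufficient.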
1368 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst); |
1369 | I.eraseFromParent(); |
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1371 | } |
1372 | |
  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1374 | if (Opcode == -1) |
1375 | return false; |
1376 | |
1377 | MachineInstrBuilder SelectedMI; |
  MachineOperand &LHS = I.getOperand(2);
  MachineOperand &RHS = I.getOperand(3);
  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
  Register Src0Reg =
      copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
  Register Src1Reg =
      copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1386 | SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst); |
1387 | if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) |
    SelectedMI.addImm(Src0Mods);
  SelectedMI.addReg(Src0Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
    SelectedMI.addImm(Src1Mods);
  SelectedMI.addReg(Src1Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
    SelectedMI.addImm(0); // clamp
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
    SelectedMI.addImm(0); // op_sel

  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1399 | if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI)) |
1400 | return false; |
1401 | |
1402 | I.eraseFromParent(); |
1403 | return true; |
1404 | } |
1405 | |
1406 | bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { |
1407 | MachineBasicBlock *BB = I.getParent(); |
1408 | const DebugLoc &DL = I.getDebugLoc(); |
  Register DstReg = I.getOperand(0).getReg();
  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
  const bool Is64 = Size == 64;
  const bool IsWave32 = (STI.getWavefrontSize() == 32);

  // In the common case, the return type matches the wave size.
  // However, we also support emitting i64 ballots in wave32 mode.
1416 | if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32)) |
1417 | return false; |
1418 | |
  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1421 | |
1422 | const auto BuildCopy = [&](Register SrcReg) { |
1423 | if (Size == STI.getWavefrontSize()) { |
1424 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg) |
1425 | .addReg(SrcReg); |
1426 | return; |
1427 | } |
1428 | |
    // If emitting an i64 ballot in wave32, fill the upper bits with zeroes.
1430 | Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
1431 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0); |
1432 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) |
1433 | .addReg(SrcReg) |
1434 | .addImm(AMDGPU::sub0) |
1435 | .addReg(HiReg) |
1436 | .addImm(AMDGPU::sub1); |
1437 | }; |
1438 | |
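  // A constant ballot condition folds: ballot(0) is zero and ballot(-1) is
  // the exec mask; any other constant fails selection.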
1439 | if (Arg) { |
1440 | const int64_t Value = Arg->Value.getSExtValue(); |
1441 | if (Value == 0) { |
1442 | unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; |
1443 | BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); |
1444 | } else if (Value == -1) // all ones |
1445 | BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); |
1446 | else |
1447 | return false; |
1448 | } else |
    BuildCopy(I.getOperand(2).getReg());
1450 | |
1451 | I.eraseFromParent(); |
1452 | return true; |
1453 | } |
1454 | |
1455 | bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const { |
1456 | MachineBasicBlock *BB = I.getParent(); |
1457 | const DebugLoc &DL = I.getDebugLoc(); |
  const Register DstReg = I.getOperand(0).getReg();
  const Register MaskReg = I.getOperand(2).getReg();
1460 | |
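  // The incoming mask already holds a lane-mask value of the right width, so
  // the intrinsic selects to a plain copy.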
1461 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg); |
1462 | I.eraseFromParent(); |
1463 | return true; |
1464 | } |
1465 | |
1466 | bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const { |
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto RelocSymbol = cast<GlobalVariable>(
      M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1480 | |
1481 | MachineBasicBlock *BB = I.getParent(); |
1482 | BuildMI(*BB, &I, I.getDebugLoc(), |
1483 | TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg) |
1484 | .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO); |
1485 | |
1486 | I.eraseFromParent(); |
1487 | return true; |
1488 | } |
1489 | |
1490 | bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const { |
1491 | Triple::OSType OS = MF->getTarget().getTargetTriple().getOS(); |
1492 | |
  Register DstReg = I.getOperand(0).getReg();
1494 | const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); |
1495 | unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ? |
1496 | AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; |
1497 | |
1498 | MachineBasicBlock *MBB = I.getParent(); |
1499 | const DebugLoc &DL = I.getDebugLoc(); |
1500 | |
1501 | auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg); |
1502 | |
1503 | if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) { |
1504 | const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
1505 | MIB.addImm(MFI->getLDSSize()); |
1506 | } else { |
1507 | Module *M = MF->getFunction().getParent(); |
1508 | const GlobalValue *GV |
1509 | = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize); |
1510 | MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); |
1511 | } |
1512 | |
1513 | I.eraseFromParent(); |
1514 | return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); |
1515 | } |
1516 | |
1517 | bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const { |
1518 | MachineBasicBlock *MBB = I.getParent(); |
1519 | MachineFunction &MF = *MBB->getParent(); |
1520 | const DebugLoc &DL = I.getDebugLoc(); |
1521 | |
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1528 | if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) || |
1529 | !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) |
1530 | return false; |
1531 | |
1532 | // Check for kernel and shader functions |
1533 | if (Depth != 0 || |
1534 | MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { |
1535 | BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) |
1536 | .addImm(0); |
1537 | I.eraseFromParent(); |
1538 | return true; |
1539 | } |
1540 | |
1541 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
1542 | // There is a call to @llvm.returnaddress in this function |
1543 | MFI.setReturnAddressIsTaken(true); |
1544 | |
1545 | // Get the return address reg and mark it as an implicit live-in |
1546 | Register ReturnAddrReg = TRI.getReturnAddressReg(MF); |
1547 | Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg, |
1548 | AMDGPU::SReg_64RegClass, DL); |
1549 | BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg) |
1550 | .addReg(LiveIn); |
1551 | I.eraseFromParent(); |
1552 | return true; |
1553 | } |
1554 | |
1555 | bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { |
1556 | // FIXME: Manually selecting to avoid dealing with the SReg_1 trick |
1557 | // SelectionDAG uses for wave32 vs wave64. |
1558 | MachineBasicBlock *BB = MI.getParent(); |
1559 | BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) |
1560 | .add(MI.getOperand(1)); |
1561 | |
  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1567 | return true; |
1568 | } |
1569 | |
1570 | bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( |
1571 | MachineInstr &MI, Intrinsic::ID IntrID) const { |
1572 | MachineBasicBlock *MBB = MI.getParent(); |
1573 | MachineFunction *MF = MBB->getParent(); |
1574 | const DebugLoc &DL = MI.getDebugLoc(); |
1575 | |
  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease)
    report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1582 | |
1583 | unsigned OrderedCountIndex = IndexOperand & 0x3f; |
1584 | IndexOperand &= ~0x3f; |
1585 | unsigned CountDw = 0; |
1586 | |
1587 | if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { |
1588 | CountDw = (IndexOperand >> 24) & 0xf; |
1589 | IndexOperand &= ~(0xf << 24); |
1590 | |
1591 | if (CountDw < 1 || CountDw > 4) { |
      report_fatal_error(
          "ds_ordered_count: dword count must be between 1 and 4");
    }
  }

  if (IndexOperand)
    report_fatal_error("ds_ordered_count: bad index operand");

  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1602 | |
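  // DS_ORDERED_COUNT packs its configuration into the 16-bit offset field,
  // assembled below as Offset0 | (Offset1 << 8):
  //   bits [7:0]   ordered-count index scaled to a dword byte offset
  //   bit  8       wave_release
  //   bit  9       wave_done
  //   bits [11:10] shader type (before GFX11)
  //   bit  12      0 = ordered_add, 1 = ordered_swap
  //   bits [15:14] dword count - 1 (GFX10+)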
1603 | unsigned Offset0 = OrderedCountIndex << 2; |
1604 | unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); |
1605 | |
1606 | if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) |
1607 | Offset1 |= (CountDw - 1) << 6; |
1608 | |
1609 | if (STI.getGeneration() < AMDGPUSubtarget::GFX11) |
1610 | Offset1 |= ShaderType << 2; |
1611 | |
1612 | unsigned Offset = Offset0 | (Offset1 << 8); |
1613 | |
  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
1620 | MachineInstrBuilder DS = |
1621 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) |
1622 | .addReg(ValReg) |
1623 | .addImm(Offset) |
1624 | .cloneMemRefs(MI); |
1625 | |
1626 | if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) |
1627 | return false; |
1628 | |
1629 | bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); |
1630 | MI.eraseFromParent(); |
1631 | return Ret; |
1632 | } |
1633 | |
1634 | static unsigned gwsIntrinToOpcode(unsigned IntrID) { |
1635 | switch (IntrID) { |
1636 | case Intrinsic::amdgcn_ds_gws_init: |
1637 | return AMDGPU::DS_GWS_INIT; |
1638 | case Intrinsic::amdgcn_ds_gws_barrier: |
1639 | return AMDGPU::DS_GWS_BARRIER; |
1640 | case Intrinsic::amdgcn_ds_gws_sema_v: |
1641 | return AMDGPU::DS_GWS_SEMA_V; |
1642 | case Intrinsic::amdgcn_ds_gws_sema_br: |
1643 | return AMDGPU::DS_GWS_SEMA_BR; |
1644 | case Intrinsic::amdgcn_ds_gws_sema_p: |
1645 | return AMDGPU::DS_GWS_SEMA_P; |
1646 | case Intrinsic::amdgcn_ds_gws_sema_release_all: |
1647 | return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; |
1648 | default: |
    llvm_unreachable("not a gws intrinsic");
1650 | } |
1651 | } |
1652 | |
1653 | bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, |
1654 | Intrinsic::ID IID) const { |
1655 | if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && |
1656 | !STI.hasGWSSemaReleaseAll())) |
1657 | return false; |
1658 | |
1659 | // intrinsic ID, vsrc, offset |
1660 | const bool HasVSrc = MI.getNumOperands() == 3; |
1661 | assert(HasVSrc || MI.getNumOperands() == 2); |
1662 | |
  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1669 | unsigned ImmOffset; |
1670 | |
1671 | MachineBasicBlock *MBB = MI.getParent(); |
1672 | const DebugLoc &DL = MI.getDebugLoc(); |
1673 | |
1674 | MachineInstr *Readfirstlane = nullptr; |
1675 | |
1676 | // If we legalized the VGPR input, strip out the readfirstlane to analyze the |
1677 | // incoming offset, in case there's an add of a constant. We'll have to put it |
1678 | // back later. |
1679 | if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { |
1680 | Readfirstlane = OffsetDef; |
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1683 | } |
1684 | |
1685 | if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { |
1686 | // If we have a constant offset, try to use the 0 in m0 as the base. |
1687 | // TODO: Look into changing the default m0 initialization value. If the |
1688 | // default -1 only set the low 16-bits, we could leave it as-is and add 1 to |
1689 | // the immediate offset. |
1690 | |
    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1692 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) |
1693 | .addImm(0); |
1694 | } else { |
    std::tie(BaseOffset, ImmOffset) =
        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass,
                                        *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
1706 | } else { |
1707 | if (!RBI.constrainGenericRegister(BaseOffset, |
1708 | AMDGPU::SReg_32RegClass, *MRI)) |
1709 | return false; |
1710 | } |
1711 | |
1712 | Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
1713 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) |
1714 | .addReg(BaseOffset) |
1715 | .addImm(16) |
1716 | .setOperandDead(3); // Dead scc |
1717 | |
1718 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) |
1719 | .addReg(M0Base); |
1720 | } |
1721 | |
1722 | // The resource id offset is computed as (<isa opaque base> + M0[21:16] + |
1723 | // offset field) % 64. Some versions of the programming guide omit the m0 |
1724 | // part, or claim it's from offset 0. |
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1726 | |
1727 | if (HasVSrc) { |
    Register VSrc = MI.getOperand(1).getReg();
1729 | MIB.addReg(VSrc); |
1730 | |
1731 | if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) |
1732 | return false; |
1733 | } |
1734 | |
1735 | MIB.addImm(ImmOffset) |
1736 | .cloneMemRefs(MI); |
1737 | |
1738 | TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0); |
1739 | |
1740 | MI.eraseFromParent(); |
1741 | return true; |
1742 | } |
1743 | |
1744 | bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, |
1745 | bool IsAppend) const { |
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset)) {
    PtrBase = MI.getOperand(2).getReg();
1756 | Offset = 0; |
1757 | } |
1758 | |
1759 | MachineBasicBlock *MBB = MI.getParent(); |
1760 | const DebugLoc &DL = MI.getDebugLoc(); |
1761 | const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; |
1762 | |
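  // The base pointer is passed in m0; only the immediate offset is encoded
  // in the instruction itself.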
1763 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) |
1764 | .addReg(PtrBase); |
1765 | if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI)) |
1766 | return false; |
1767 | |
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1769 | .addImm(Offset) |
1770 | .addImm(IsGDS ? -1 : 0) |
1771 | .cloneMemRefs(MI); |
1772 | MI.eraseFromParent(); |
1773 | return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); |
1774 | } |
1775 | |
1776 | bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { |
1777 | if (TM.getOptLevel() > CodeGenOptLevel::None) { |
1778 | unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second; |
1779 | if (WGSize <= STI.getWavefrontSize()) { |
1780 | MachineBasicBlock *MBB = MI.getParent(); |
1781 | const DebugLoc &DL = MI.getDebugLoc(); |
1782 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER)); |
1783 | MI.eraseFromParent(); |
1784 | return true; |
1785 | } |
1786 | } |
1787 | |
1788 | // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait |
1789 | if (STI.hasSplitBarriers()) { |
1790 | MachineBasicBlock *MBB = MI.getParent(); |
1791 | const DebugLoc &DL = MI.getDebugLoc(); |
1792 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM)) |
1793 | .addImm(AMDGPU::Barrier::WORKGROUP); |
1794 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT)) |
1795 | .addImm(AMDGPU::Barrier::WORKGROUP); |
1796 | MI.eraseFromParent(); |
1797 | return true; |
1798 | } |
1799 | |
  return selectImpl(MI, *CoverageInfo);
1801 | } |
1802 | |
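// Decode the texfailctrl immediate: bit 0 requests TFE and bit 1 requests
// LWE. Any other set bit is invalid and causes this to return false.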
1803 | static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, |
1804 | bool &IsTexFail) { |
1805 | if (TexFailCtrl) |
1806 | IsTexFail = true; |
1807 | |
1808 | TFE = (TexFailCtrl & 0x1) ? true : false; |
1809 | TexFailCtrl &= ~(uint64_t)0x1; |
1810 | LWE = (TexFailCtrl & 0x2) ? true : false; |
1811 | TexFailCtrl &= ~(uint64_t)0x2; |
1812 | |
1813 | return TexFailCtrl == 0; |
1814 | } |
1815 | |
1816 | bool AMDGPUInstructionSelector::selectImageIntrinsic( |
1817 | MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { |
1818 | MachineBasicBlock *MBB = MI.getParent(); |
1819 | const DebugLoc &DL = MI.getDebugLoc(); |
1820 | |
1821 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = |
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1825 | unsigned IntrOpcode = Intr->BaseOpcode; |
1826 | const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); |
1827 | const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); |
1828 | const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); |
1829 | |
1830 | const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; |
1831 | |
1832 | Register VDataIn, VDataOut; |
1833 | LLT VDataTy; |
1834 | int NumVDataDwords = -1; |
1835 | bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || |
1836 | MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16; |
1837 | |
1838 | bool Unorm; |
1839 | if (!BaseOpcode->Sampler) |
1840 | Unorm = true; |
1841 | else |
    Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
                    TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1852 | const bool IsA16 = (Flags & 1) != 0; |
1853 | const bool IsG16 = (Flags & 2) != 0; |
1854 | |
1855 | // A16 implies 16 bit gradients if subtarget doesn't support G16 |
1856 | if (IsA16 && !STI.hasG16() && !IsG16) |
1857 | return false; |
1858 | |
1859 | unsigned DMask = 0; |
1860 | unsigned DMaskLanes = 0; |
1861 | |
1862 | if (BaseOpcode->Atomic) { |
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);
1866 | |
1867 | // Be careful to allow atomic swap on 16-bit element vectors. |
1868 | const bool Is64Bit = BaseOpcode->AtomicX2 ? |
1869 | Ty.getSizeInBits() == 128 : |
1870 | Ty.getSizeInBits() == 64; |
1871 | |
1872 | if (BaseOpcode->AtomicX2) { |
1873 | assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister); |
1874 | |
1875 | DMask = Is64Bit ? 0xf : 0x3; |
1876 | NumVDataDwords = Is64Bit ? 4 : 2; |
1877 | } else { |
1878 | DMask = Is64Bit ? 0x3 : 0x1; |
1879 | NumVDataDwords = Is64Bit ? 2 : 1; |
1880 | } |
1881 | } else { |
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
1892 | NumVDataDwords = DMaskLanes; |
1893 | |
1894 | if (IsD16 && !STI.hasUnpackedD16VMem()) |
1895 | NumVDataDwords = (DMaskLanes + 1) / 2; |
1896 | } |
1897 | } |
1898 | |
1899 | // Set G16 opcode |
1900 | if (Subtarget->hasG16() && IsG16) { |
1901 | const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = |
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1903 | assert(G16MappingInfo); |
1904 | IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16 |
1905 | } |
1906 | |
1907 | // TODO: Check this in verifier. |
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1911 | if (BaseOpcode->Atomic) |
1912 | CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization |
1913 | if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | |
1914 | AMDGPU::CPol::VOLATILE)) |
1915 | return false; |
1916 | |
1917 | int NumVAddrRegs = 0; |
1918 | int NumVAddrDwords = 0; |
1919 | for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { |
1920 | // Skip the $noregs and 0s inserted during legalization. |
    MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1922 | if (!AddrOp.isReg()) |
1923 | continue; // XXX - Break? |
1924 | |
1925 | Register Addr = AddrOp.getReg(); |
1926 | if (!Addr) |
1927 | break; |
1928 | |
1929 | ++NumVAddrRegs; |
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1931 | } |
1932 | |
1933 | // The legalizer preprocessed the intrinsic arguments. If we aren't using |
1934 | // NSA, these should have been packed into a single value in the first |
1935 | // address register |
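  // With the partial NSA encoding, only the final address operand may span
  // multiple dwords, so more dwords than registers is still acceptable.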
1936 | const bool UseNSA = |
1937 | NumVAddrRegs != 1 && |
1938 | (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs |
1939 | : NumVAddrDwords == NumVAddrRegs); |
1940 | if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { |
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1942 | return false; |
1943 | } |
1944 | |
1945 | if (IsTexFail) |
1946 | ++NumVDataDwords; |
1947 | |
1948 | int Opcode = -1; |
1949 | if (IsGFX12Plus) { |
1950 | Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12, |
1951 | NumVDataDwords, NumVAddrDwords); |
1952 | } else if (IsGFX11Plus) { |
1953 | Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, |
1954 | UseNSA ? AMDGPU::MIMGEncGfx11NSA |
1955 | : AMDGPU::MIMGEncGfx11Default, |
1956 | NumVDataDwords, NumVAddrDwords); |
1957 | } else if (IsGFX10Plus) { |
1958 | Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, |
1959 | UseNSA ? AMDGPU::MIMGEncGfx10NSA |
1960 | : AMDGPU::MIMGEncGfx10Default, |
1961 | NumVDataDwords, NumVAddrDwords); |
1962 | } else { |
1963 | if (Subtarget->hasGFX90AInsts()) { |
1964 | Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, |
1965 | NumVDataDwords, NumVAddrDwords); |
1966 | if (Opcode == -1) { |
        LLVM_DEBUG(
            dbgs()
            << "requested image instruction is not supported on this GPU\n");
1970 | return false; |
1971 | } |
1972 | } |
1973 | if (Opcode == -1 && |
1974 | STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) |
1975 | Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, |
1976 | NumVDataDwords, NumVAddrDwords); |
1977 | if (Opcode == -1) |
1978 | Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, |
1979 | NumVDataDwords, NumVAddrDwords); |
1980 | } |
1981 | if (Opcode == -1) |
1982 | return false; |
1983 | |
1984 | auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode)) |
1985 | .cloneMemRefs(MI); |
1986 | |
1987 | if (VDataOut) { |
1988 | if (BaseOpcode->AtomicX2) { |
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;

      MIB.addDef(TmpReg);
      if (!MRI->use_empty(VDataOut)) {
1997 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut) |
1998 | .addReg(TmpReg, RegState::Kill, SubReg); |
1999 | } |
2000 | |
2001 | } else { |
2002 | MIB.addDef(VDataOut); // vdata output |
2003 | } |
2004 | } |
2005 | |
2006 | if (VDataIn) |
2007 | MIB.addReg(VDataIn); // vdata input |
2008 | |
2009 | for (int I = 0; I != NumVAddrRegs; ++I) { |
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2011 | if (SrcOp.isReg()) { |
2012 | assert(SrcOp.getReg() != 0); |
2013 | MIB.addReg(SrcOp.getReg()); |
2014 | } |
2015 | } |
2016 | |
  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2020 | |
2021 | MIB.addImm(DMask); // dmask |
2022 | |
2023 | if (IsGFX10Plus) |
2024 | MIB.addImm(DimInfo->Encoding); |
2025 | if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm)) |
2026 | MIB.addImm(Unorm); |
2027 | |
2028 | MIB.addImm(CPol); |
2029 | MIB.addImm(IsA16 && // a16 or r128 |
2030 | STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0); |
2031 | if (IsGFX10Plus) |
2032 | MIB.addImm(IsA16 ? -1 : 0); |
2033 | |
2034 | if (!Subtarget->hasGFX90AInsts()) { |
2035 | MIB.addImm(TFE); // tfe |
2036 | } else if (TFE) { |
    LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2038 | return false; |
2039 | } |
2040 | |
2041 | if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe)) |
2042 | MIB.addImm(LWE); // lwe |
2043 | if (!IsGFX10Plus) |
2044 | MIB.addImm(DimInfo->DA ? -1 : 0); |
2045 | if (BaseOpcode->HasD16) |
2046 | MIB.addImm(IsD16 ? -1 : 0); |
2047 | |
2048 | MI.eraseFromParent(); |
2049 | constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); |
2050 | TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr); |
2051 | return true; |
2052 | } |
2053 | |
2054 | // We need to handle this here because tablegen doesn't support matching |
2055 | // instructions with multiple outputs. |
2056 | bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic( |
2057 | MachineInstr &MI) const { |
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Addr = MI.getOperand(3).getReg();
  Register Data0 = MI.getOperand(4).getReg();
  Register Data1 = MI.getOperand(5).getReg();
  unsigned Offset = MI.getOperand(6).getImm();
2068 | |
2069 | auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0) |
2070 | .addDef(Dst1) |
2071 | .addUse(Addr) |
2072 | .addUse(Data0) |
2073 | .addUse(Data1) |
2074 | .addImm(Offset) |
2075 | .cloneMemRefs(MI); |
2076 | |
2077 | MI.eraseFromParent(); |
2078 | return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); |
2079 | } |
2080 | |
2081 | bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( |
2082 | MachineInstr &I) const { |
  unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  case Intrinsic::amdgcn_s_barrier:
    return selectSBarrier(I);
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
    return selectBufferLoadLds(I);
  case Intrinsic::amdgcn_global_load_lds:
    return selectGlobalLoadLds(I);
  case Intrinsic::amdgcn_exp_compr:
    if (!STI.hasCompressedExport()) {
      Function &F = I.getMF()->getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
      F.getContext().diagnose(NoFpRet);
      return false;
    }
    break;
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
    return selectDSBvhStackIntrinsic(I);
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_join:
  case Intrinsic::amdgcn_s_wakeup_barrier:
  case Intrinsic::amdgcn_s_get_barrier_state:
    return selectNamedBarrierInst(I, IntrinsicID);
  case Intrinsic::amdgcn_s_barrier_signal_isfirst:
  case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
    return selectSBarrierSignalIsfirst(I, IntrinsicID);
  case Intrinsic::amdgcn_s_barrier_leave:
    return selectSBarrierLeave(I);
  }
  return selectImpl(I, *CoverageInfo);
2133 | } |
2134 | |
2135 | bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { |
  if (selectImpl(I, *CoverageInfo))
2137 | return true; |
2138 | |
2139 | MachineBasicBlock *BB = I.getParent(); |
2140 | const DebugLoc &DL = I.getDebugLoc(); |
2141 | |
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
2148 | unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : |
2149 | AMDGPU::S_CSELECT_B32; |
    MachineInstr *CopySCC =
        BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it. So manually set the register class here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg,
                       TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
            .add(I.getOperand(2))
            .add(I.getOperand(3));
2161 | |
2162 | bool Ret = false; |
2163 | Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); |
2164 | Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); |
2165 | I.eraseFromParent(); |
2166 | return Ret; |
2167 | } |
2168 | |
2169 | // Wide VGPR select should have been split in RegBankSelect. |
2170 | if (Size > 32) |
2171 | return false; |
2172 | |
2173 | MachineInstr *Select = |
2174 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
2175 | .addImm(0) |
2176 | .add(I.getOperand(3)) |
2177 | .addImm(0) |
2178 | .add(I.getOperand(2)) |
2179 | .add(I.getOperand(1)); |
2180 | |
2181 | bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); |
2182 | I.eraseFromParent(); |
2183 | return Ret; |
2184 | } |
2185 | |
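// Return the subregister index covering the low \p Size bits of a register,
// rounding irregular sizes up to the next power of two.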
2186 | static int sizeToSubRegIndex(unsigned Size) { |
2187 | switch (Size) { |
2188 | case 32: |
2189 | return AMDGPU::sub0; |
2190 | case 64: |
2191 | return AMDGPU::sub0_sub1; |
2192 | case 96: |
2193 | return AMDGPU::sub0_sub1_sub2; |
2194 | case 128: |
2195 | return AMDGPU::sub0_sub1_sub2_sub3; |
2196 | case 256: |
2197 | return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; |
2198 | default: |
2199 | if (Size < 32) |
2200 | return AMDGPU::sub0; |
2201 | if (Size > 256) |
2202 | return -1; |
    return sizeToSubRegIndex(llvm::bit_ceil(Size));
2204 | } |
2205 | } |
2206 | |
2207 | bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { |
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const LLT S1 = LLT::scalar(1);
2213 | |
2214 | const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); |
2215 | const RegisterBank *DstRB; |
2216 | if (DstTy == S1) { |
2217 | // This is a special case. We don't treat s1 for legalization artifacts as |
2218 | // vcc booleans. |
2219 | DstRB = SrcRB; |
2220 | } else { |
2221 | DstRB = RBI.getRegBank(DstReg, *MRI, TRI); |
2222 | if (SrcRB != DstRB) |
2223 | return false; |
2224 | } |
2225 | |
2226 | const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; |
2227 | |
2228 | unsigned DstSize = DstTy.getSizeInBits(); |
2229 | unsigned SrcSize = SrcTy.getSizeInBits(); |
2230 | |
2231 | const TargetRegisterClass *SrcRC = |
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
  if (!SrcRC || !DstRC)
    return false;

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2241 | return false; |
2242 | } |
2243 | |
  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
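    // Lower the v2s32 -> v2s16 truncation by extracting both 32-bit halves
    // and packing their low 16 bits into a single 32-bit register.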
2245 | MachineBasicBlock *MBB = I.getParent(); |
2246 | const DebugLoc &DL = I.getDebugLoc(); |
2247 | |
    Register LoReg = MRI->createVirtualRegister(DstRC);
    Register HiReg = MRI->createVirtualRegister(DstRC);
2250 | BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) |
2251 | .addReg(SrcReg, 0, AMDGPU::sub0); |
2252 | BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) |
2253 | .addReg(SrcReg, 0, AMDGPU::sub1); |
2254 | |
2255 | if (IsVALU && STI.hasSDWA()) { |
2256 | // Write the low 16-bits of the high element into the high 16-bits of the |
2257 | // low element. |
2258 | MachineInstr *MovSDWA = |
2259 | BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) |
2260 | .addImm(0) // $src0_modifiers |
2261 | .addReg(HiReg) // $src0 |
2262 | .addImm(0) // $clamp |
2263 | .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel |
2264 | .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused |
2265 | .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel |
2266 | .addReg(LoReg, RegState::Implicit); |
      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2268 | } else { |
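      // Without SDWA, shift the high element into the upper half, mask the
      // low element to 16 bits, and OR the pieces together.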
      Register TmpReg0 = MRI->createVirtualRegister(DstRC);
      Register TmpReg1 = MRI->createVirtualRegister(DstRC);
      Register ImmReg = MRI->createVirtualRegister(DstRC);
2272 | if (IsVALU) { |
2273 | BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) |
2274 | .addImm(16) |
2275 | .addReg(HiReg); |
2276 | } else { |
2277 | BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) |
2278 | .addReg(HiReg) |
2279 | .addImm(16) |
2280 | .setOperandDead(3); // Dead scc |
2281 | } |
2282 | |
2283 | unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; |
2284 | unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; |
2285 | unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; |
2286 | |
2287 | BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) |
2288 | .addImm(0xffff); |
2289 | auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) |
2290 | .addReg(LoReg) |
2291 | .addReg(ImmReg); |
2292 | auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) |
2293 | .addReg(TmpReg0) |
2294 | .addReg(TmpReg1); |
2295 | |
2296 | if (!IsVALU) { |
2297 | And.setOperandDead(3); // Dead scc |
2298 | Or.setOperandDead(3); // Dead scc |
2299 | } |
2300 | } |
2301 | |
2302 | I.eraseFromParent(); |
2303 | return true; |
2304 | } |
2305 | |
2306 | if (!DstTy.isScalar()) |
2307 | return false; |
2308 | |
2309 | if (SrcSize > 32) { |
    int SubRegIdx = sizeToSubRegIndex(DstSize);
2311 | if (SubRegIdx == -1) |
2312 | return false; |
2313 | |
2314 | // Deal with weird cases where the class only partially supports the subreg |
2315 | // index. |
2316 | const TargetRegisterClass *SrcWithSubRC |
2317 | = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); |
2318 | if (!SrcWithSubRC) |
2319 | return false; |
2320 | |
2321 | if (SrcWithSubRC != SrcRC) { |
      if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2323 | return false; |
2324 | } |
2325 | |
    I.getOperand(1).setSubReg(SubRegIdx);
2327 | } |
2328 | |
2329 | I.setDesc(TII.get(TargetOpcode::COPY)); |
2330 | return true; |
2331 | } |
2332 | |
2333 | /// \returns true if a bitmask for \p Size bits will be an inline immediate. |
2334 | static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { |
  Mask = maskTrailingOnes<unsigned>(Size);
2336 | int SignedMask = static_cast<int>(Mask); |
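  // AMDGPU inline constants cover the integer range [-16, 64].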
2337 | return SignedMask >= -16 && SignedMask <= 64; |
2338 | } |
2339 | |
2340 | // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. |
2341 | const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( |
2342 | Register Reg, const MachineRegisterInfo &MRI, |
2343 | const TargetRegisterInfo &TRI) const { |
2344 | const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); |
2345 | if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) |
2346 | return RB; |
2347 | |
2348 | // Ignore the type, since we don't use vcc in artifacts. |
2349 | if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) |
    return &RBI.getRegBankFromRegClass(*RC, LLT());
2351 | return nullptr; |
2352 | } |
2353 | |
2354 | bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { |
2355 | bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; |
2356 | bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; |
2357 | const DebugLoc &DL = I.getDebugLoc(); |
2358 | MachineBasicBlock &MBB = *I.getParent(); |
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();

  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
2364 | const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? |
2365 | I.getOperand(2).getImm() : SrcTy.getSizeInBits(); |
2366 | const unsigned DstSize = DstTy.getSizeInBits(); |
2367 | if (!DstTy.isScalar()) |
2368 | return false; |
2369 | |
2370 | // Artifact casts should never use vcc. |
2371 | const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); |
2372 | |
2373 | // FIXME: This should probably be illegal and split earlier. |
2374 | if (I.getOpcode() == AMDGPU::G_ANYEXT) { |
2375 | if (DstSize <= 32) |
2376 | return selectCOPY(I); |
2377 | |
2378 | const TargetRegisterClass *SrcRC = |
        TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
    const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
    const TargetRegisterClass *DstRC =
        TRI.getRegClassForSizeOnBank(DstSize, *DstBank);

    Register UndefReg = MRI->createVirtualRegister(SrcRC);
2385 | BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); |
2386 | BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) |
2387 | .addReg(SrcReg) |
2388 | .addImm(AMDGPU::sub0) |
2389 | .addReg(UndefReg) |
2390 | .addImm(AMDGPU::sub1); |
2391 | I.eraseFromParent(); |
2392 | |
    return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
           RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2395 | } |
2396 | |
2397 | if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { |
2398 | // 64-bit should have been split up in RegBankSelect |
2399 | |
2400 | // Try to use an and with a mask if it will save code size. |
2401 | unsigned Mask; |
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2403 | MachineInstr *ExtI = |
2404 | BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) |
2405 | .addImm(Mask) |
2406 | .addReg(SrcReg); |
2407 | I.eraseFromParent(); |
2408 | return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); |
2409 | } |
2410 | |
2411 | const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; |
2412 | MachineInstr *ExtI = |
2413 | BuildMI(MBB, I, DL, TII.get(BFE), DstReg) |
2414 | .addReg(SrcReg) |
2415 | .addImm(0) // Offset |
2416 | .addImm(SrcSize); // Width |
2417 | I.eraseFromParent(); |
2418 | return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); |
2419 | } |
2420 | |
2421 | if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { |
2422 | const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? |
2423 | AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; |
    if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2425 | return false; |
2426 | |
2427 | if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { |
2428 | const unsigned SextOpc = SrcSize == 8 ? |
2429 | AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; |
2430 | BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) |
2431 | .addReg(SrcReg); |
2432 | I.eraseFromParent(); |
2433 | return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); |
2434 | } |
2435 | |
2436 | // Using a single 32-bit SALU to calculate the high half is smaller than |
2437 | // S_BFE with a literal constant operand. |
2438 | if (DstSize > 32 && SrcSize == 32) { |
2439 | Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
2440 | unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister; |
2441 | if (Signed) { |
2442 | BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg) |
2443 | .addReg(SrcReg, 0, SubReg) |
2444 | .addImm(31) |
2445 | .setOperandDead(3); // Dead scc |
2446 | } else { |
2447 | BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) |
2448 | .addImm(0); |
2449 | } |
2450 | BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) |
2451 | .addReg(SrcReg, 0, SubReg) |
2452 | .addImm(AMDGPU::sub0) |
2453 | .addReg(HiReg) |
2454 | .addImm(AMDGPU::sub1); |
2455 | I.eraseFromParent(); |
2456 | return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, |
2457 | *MRI); |
2458 | } |
2459 | |
2460 | const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; |
2461 | const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; |
2462 | |
    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2464 | if (DstSize > 32 && (SrcSize <= 32 || InReg)) { |
2465 | // We need a 64-bit register source, but the high bits don't matter. |
2466 | Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); |
2467 | Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
2468 | unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister; |
2469 | |
2470 | BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); |
2471 | BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) |
2472 | .addReg(SrcReg, 0, SubReg) |
2473 | .addImm(AMDGPU::sub0) |
2474 | .addReg(UndefReg) |
2475 | .addImm(AMDGPU::sub1); |
2476 | |
2477 | BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) |
2478 | .addReg(ExtReg) |
2479 | .addImm(SrcSize << 16); |
2480 | |
2481 | I.eraseFromParent(); |
2482 | return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); |
2483 | } |
2484 | |
2485 | unsigned Mask; |
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2487 | BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) |
2488 | .addReg(SrcReg) |
2489 | .addImm(Mask) |
2490 | .setOperandDead(3); // Dead scc |
2491 | } else { |
2492 | BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) |
2493 | .addReg(SrcReg) |
2494 | .addImm(SrcSize << 16); |
2495 | } |
2496 | |
2497 | I.eraseFromParent(); |
2498 | return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); |
2499 | } |
2500 | |
2501 | return false; |
2502 | } |
2503 | |
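// Match an extract of the high 16-bit half of a 32-bit value, i.e.
// (trunc (lshr In, 16)).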
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
                           Register &Out) {
  Register LShlSrc;
  if (mi_match(In, MRI,
               m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2509 | Out = LShlSrc; |
2510 | return true; |
2511 | } |
2512 | return false; |
2513 | } |
2514 | |
2515 | bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const { |
2516 | if (!Subtarget->hasSALUFloatInsts()) |
2517 | return false; |
2518 | |
  Register Dst = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src = I.getOperand(1).getReg();

  if (MRI->getType(Dst) == LLT::scalar(32) &&
      MRI->getType(Src) == LLT::scalar(16)) {
    if (isExtractHiElt(*MRI, Src, Src)) {
2529 | MachineBasicBlock *BB = I.getParent(); |
2530 | BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst) |
2531 | .addUse(Src); |
2532 | I.eraseFromParent(); |
2533 | return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); |
2534 | } |
2535 | } |
2536 | |
2537 | return false; |
2538 | } |
2539 | |
2540 | bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { |
2541 | MachineBasicBlock *BB = I.getParent(); |
  MachineOperand &ImmOp = I.getOperand(1);
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = MRI->getType(DstReg).getSizeInBits();
2545 | bool IsFP = false; |
2546 | |
2547 | // The AMDGPU backend only supports Imm operands and not CImm or FPImm. |
2548 | if (ImmOp.isFPImm()) { |
2549 | const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); |
    ImmOp.ChangeToImmediate(Imm.getZExtValue());
    IsFP = true;
  } else if (ImmOp.isCImm()) {
    ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
  } else {
    llvm_unreachable("Not supported by g_constants");
2556 | } |
2557 | |
2558 | const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); |
2559 | const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID; |
2560 | |
2561 | unsigned Opcode; |
2562 | if (DstRB->getID() == AMDGPU::VCCRegBankID) { |
2563 | Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
2564 | } else if (Size == 64 && |
             AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
    Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
    I.setDesc(TII.get(Opcode));
    I.addImplicitDefUseOperands(*MF);
2569 | return constrainSelectedInstRegOperands(I, TII, TRI, RBI); |
2570 | } else { |
2571 | Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; |
2572 | |
2573 | // We should never produce s1 values on banks other than VCC. If the user of |
2574 | // this already constrained the register, we may incorrectly think it's VCC |
2575 | // if it wasn't originally. |
2576 | if (Size == 1) |
2577 | return false; |
2578 | } |
2579 | |
2580 | if (Size != 64) { |
2581 | I.setDesc(TII.get(Opcode)); |
    I.addImplicitDefUseOperands(*MF);
2583 | return constrainSelectedInstRegOperands(I, TII, TRI, RBI); |
2584 | } |
2585 | |
2586 | const DebugLoc &DL = I.getDebugLoc(); |
2587 | |
  APInt Imm(Size, I.getOperand(1).getImm());
2589 | |
2590 | MachineInstr *ResInst; |
2591 | if (IsSgpr && TII.isInlineConstant(Imm)) { |
2592 | ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) |
2593 | .addImm(I.getOperand(1).getImm()); |
2594 | } else { |
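    // Materialize the 64-bit immediate as two 32-bit moves and reassemble
    // the halves with a REG_SEQUENCE.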
2595 | const TargetRegisterClass *RC = IsSgpr ? |
2596 | &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; |
    Register LoReg = MRI->createVirtualRegister(RC);
    Register HiReg = MRI->createVirtualRegister(RC);

    BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
      .addImm(Imm.trunc(32).getZExtValue());

    BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
      .addImm(Imm.ashr(32).getZExtValue());
2605 | |
2606 | ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) |
2607 | .addReg(LoReg) |
2608 | .addImm(AMDGPU::sub0) |
2609 | .addReg(HiReg) |
2610 | .addImm(AMDGPU::sub1); |
2611 | } |
2612 | |
2613 | // We can't call constrainSelectedInstRegOperands here, because it doesn't |
  // work for target independent opcodes.
  I.eraseFromParent();
  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
  if (!DstRC)
    return true;
  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2621 | } |
2622 | |
2623 | bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { |
2624 | // Only manually handle the f64 SGPR case. |
2625 | // |
2626 | // FIXME: This is a workaround for 2.5 different tablegen problems. Because |
2627 | // the bit ops theoretically have a second result due to the implicit def of |
2628 | // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing |
2629 | // that is easy by disabling the check. The result works, but uses a |
2630 | // nonsensical sreg32orlds_and_sreg_1 regclass. |
2631 | // |
2632 | // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to |
2633 | // the variadic REG_SEQUENCE operands. |
2634 | |
  Register Dst = MI.getOperand(0).getReg();
2636 | const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); |
2637 | if (DstRB->getID() != AMDGPU::SGPRRegBankID || |
2638 | MRI->getType(Dst) != LLT::scalar(64)) |
2639 | return false; |
2640 | |
  Register Src = MI.getOperand(1).getReg();
  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
  if (Fabs)
    Src = Fabs->getOperand(1).getReg();
2645 | |
2646 | if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || |
2647 | !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) |
2648 | return false; |
2649 | |
2650 | MachineBasicBlock *BB = MI.getParent(); |
2651 | const DebugLoc &DL = MI.getDebugLoc(); |
2652 | Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
2653 | Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
2654 | Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
2655 | Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
2656 | |
2657 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) |
2658 | .addReg(Src, 0, AMDGPU::sub0); |
2659 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) |
2660 | .addReg(Src, 0, AMDGPU::sub1); |
2661 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) |
2662 | .addImm(0x80000000); |
2663 | |
2664 | // Set or toggle sign bit. |
2665 | unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32; |
2666 | BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg) |
2667 | .addReg(HiReg) |
2668 | .addReg(ConstReg) |
2669 | .setOperandDead(3); // Dead scc |
2670 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) |
2671 | .addReg(LoReg) |
2672 | .addImm(AMDGPU::sub0) |
2673 | .addReg(OpReg) |
2674 | .addImm(AMDGPU::sub1); |
2675 | MI.eraseFromParent(); |
2676 | return true; |
2677 | } |
2678 | |
2679 | // FIXME: This is a workaround for the same tablegen problems as G_FNEG |
2680 | bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { |
  Register Dst = MI.getOperand(0).getReg();
2682 | const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); |
2683 | if (DstRB->getID() != AMDGPU::SGPRRegBankID || |
2684 | MRI->getType(Dst) != LLT::scalar(64)) |
2685 | return false; |
2686 | |
  Register Src = MI.getOperand(1).getReg();
2688 | MachineBasicBlock *BB = MI.getParent(); |
2689 | const DebugLoc &DL = MI.getDebugLoc(); |
2690 | Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
2691 | Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
2692 | Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
2693 | Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
2694 | |
2695 | if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || |
2696 | !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) |
2697 | return false; |
2698 | |
2699 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) |
2700 | .addReg(Src, 0, AMDGPU::sub0); |
2701 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) |
2702 | .addReg(Src, 0, AMDGPU::sub1); |
2703 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) |
2704 | .addImm(0x7fffffff); |
2705 | |
2706 | // Clear sign bit. |
  // TODO: Should this use S_BITSET0_*?
2708 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) |
2709 | .addReg(HiReg) |
2710 | .addReg(ConstReg) |
2711 | .setOperandDead(3); // Dead scc |
2712 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) |
2713 | .addReg(LoReg) |
2714 | .addImm(AMDGPU::sub0) |
2715 | .addReg(OpReg) |
2716 | .addImm(AMDGPU::sub1); |
2717 | |
2718 | MI.eraseFromParent(); |
2719 | return true; |
2720 | } |
2721 | |
2722 | static bool isConstant(const MachineInstr &MI) { |
2723 | return MI.getOpcode() == TargetOpcode::G_CONSTANT; |
2724 | } |
2725 | |
2726 | void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, |
2727 | const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { |
2728 | |
2729 | unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1; |
2730 | const MachineInstr *PtrMI = |
      MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2732 | |
2733 | assert(PtrMI); |
2734 | |
2735 | if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) |
2736 | return; |
2737 | |
2738 | GEPInfo GEPInfo; |
2739 | |
2740 | for (unsigned i = 1; i != 3; ++i) { |
2741 | const MachineOperand &GEPOp = PtrMI->getOperand(i); |
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2749 | continue; |
2750 | } |
2751 | const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); |
2752 | if (OpBank->getID() == AMDGPU::SGPRRegBankID) |
2753 | GEPInfo.SgprParts.push_back(GEPOp.getReg()); |
2754 | else |
2755 | GEPInfo.VgprParts.push_back(GEPOp.getReg()); |
2756 | } |
2757 | |
2758 | AddrInfo.push_back(Elt: GEPInfo); |
2759 | getAddrModeInfo(Load: *PtrMI, MRI, AddrInfo); |
2760 | } |
2761 | |
2762 | bool AMDGPUInstructionSelector::isSGPR(Register Reg) const { |
2763 | return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID; |
2764 | } |
2765 | |
2766 | bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { |
2767 | if (!MI.hasOneMemOperand()) |
2768 | return false; |
2769 | |
2770 | const MachineMemOperand *MMO = *MI.memoperands_begin(); |
2771 | const Value *Ptr = MMO->getValue(); |
2772 | |
2773 | // UndefValue means this is a load of a kernel input. These are uniform. |
2774 | // Sometimes LDS instructions have constant pointers. |
2775 | // If Ptr is null, then that means this mem operand contains a |
2776 | // PseudoSourceValue like GOT. |
2777 | if (!Ptr || isa<UndefValue>(Val: Ptr) || isa<Argument>(Val: Ptr) || |
2778 | isa<Constant>(Val: Ptr) || isa<GlobalValue>(Val: Ptr)) |
2779 | return true; |
2780 | |
2781 | if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) |
2782 | return true; |
2783 | |
2784 | if (MI.getOpcode() == AMDGPU::G_PREFETCH) |
2785 | return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() == |
2786 | AMDGPU::SGPRRegBankID; |
2787 | |
2788 | const Instruction *I = dyn_cast<Instruction>(Val: Ptr); |
  return I && I->getMetadata(Kind: "amdgpu.uniform");
2790 | } |
2791 | |
2792 | bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { |
2793 | for (const GEPInfo &GEPInfo : AddrInfo) { |
2794 | if (!GEPInfo.VgprParts.empty()) |
2795 | return true; |
2796 | } |
2797 | return false; |
2798 | } |
2799 | |
2800 | void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { |
2801 | const LLT PtrTy = MRI->getType(Reg: I.getOperand(i: 1).getReg()); |
2802 | unsigned AS = PtrTy.getAddressSpace(); |
2803 | if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && |
2804 | STI.ldsRequiresM0Init()) { |
2805 | MachineBasicBlock *BB = I.getParent(); |
2806 | |
2807 | // If DS instructions require M0 initialization, insert it before selecting. |
2808 | BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) |
2809 | .addImm(-1); |
2810 | } |
2811 | } |
2812 | |
2813 | bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( |
2814 | MachineInstr &I) const { |
2815 | initM0(I); |
2816 | return selectImpl(I, CoverageInfo&: *CoverageInfo); |
2817 | } |
2818 | |
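// Return true if Reg is known to hold a lane mask as produced by V_CMP-style
// instructions: a G_ICMP/G_FCMP result, an amdgcn.class intrinsic, or a
// bitwise combination of such values (looking through copies). Such values
// already have their inactive lanes cleared, so a branch can consume them
// without an extra S_AND against the exec mask.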
2819 | static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) { |
2820 | if (Reg.isPhysical()) |
2821 | return false; |
2822 | |
2823 | MachineInstr &MI = *MRI.getUniqueVRegDef(Reg); |
2824 | const unsigned Opcode = MI.getOpcode(); |
2825 | |
2826 | if (Opcode == AMDGPU::COPY) |
2827 | return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI); |
2828 | |
2829 | if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR || |
2830 | Opcode == AMDGPU::G_XOR) |
2831 | return isVCmpResult(Reg: MI.getOperand(i: 1).getReg(), MRI) && |
2832 | isVCmpResult(Reg: MI.getOperand(i: 2).getReg(), MRI); |
2833 | |
2834 | if (auto *GI = dyn_cast<GIntrinsic>(&MI)) |
2835 | return GI->is(Intrinsic::amdgcn_class); |
2836 | |
2837 | return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP; |
2838 | } |
2839 | |
2840 | bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { |
2841 | MachineBasicBlock *BB = I.getParent(); |
2842 | MachineOperand &CondOp = I.getOperand(i: 0); |
2843 | Register CondReg = CondOp.getReg(); |
2844 | const DebugLoc &DL = I.getDebugLoc(); |
2845 | |
2846 | unsigned BrOpcode; |
2847 | Register CondPhysReg; |
2848 | const TargetRegisterClass *ConstrainRC; |
2849 | |
2850 | // In SelectionDAG, we inspect the IR block for uniformity metadata to decide |
2851 | // whether the branch is uniform when selecting the instruction. In |
2852 | // GlobalISel, we should push that decision into RegBankSelect. Assume for now |
2853 | // RegBankSelect knows what it's doing if the branch condition is scc, even |
2854 | // though it currently does not. |
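
  // Roughly: a scalar (SCC) condition selects to a COPY into SCC plus
  // S_CBRANCH_SCC1, while a VCC condition selects to S_CBRANCH_VCCNZ, with an
  // S_AND against exec inserted first unless the value is already a V_CMP
  // result.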
2855 | if (!isVCC(Reg: CondReg, MRI: *MRI)) { |
2856 | if (MRI->getType(Reg: CondReg) != LLT::scalar(SizeInBits: 32)) |
2857 | return false; |
2858 | |
2859 | CondPhysReg = AMDGPU::SCC; |
2860 | BrOpcode = AMDGPU::S_CBRANCH_SCC1; |
2861 | ConstrainRC = &AMDGPU::SReg_32RegClass; |
2862 | } else { |
    // FIXME: Should scc->vcc copies be ANDed with exec?

    // Unless the value of CondReg is the result of a V_CMP* instruction, we
    // need to insert an AND with exec.
2867 | if (!isVCmpResult(Reg: CondReg, MRI&: *MRI)) { |
2868 | const bool Is64 = STI.isWave64(); |
2869 | const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; |
2870 | const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO; |
2871 | |
2872 | Register TmpReg = MRI->createVirtualRegister(RegClass: TRI.getBoolRC()); |
2873 | BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg) |
2874 | .addReg(CondReg) |
2875 | .addReg(Exec) |
2876 | .setOperandDead(3); // Dead scc |
2877 | CondReg = TmpReg; |
2878 | } |
2879 | |
2880 | CondPhysReg = TRI.getVCC(); |
2881 | BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; |
2882 | ConstrainRC = TRI.getBoolRC(); |
2883 | } |
2884 | |
2885 | if (!MRI->getRegClassOrNull(Reg: CondReg)) |
2886 | MRI->setRegClass(Reg: CondReg, RC: ConstrainRC); |
2887 | |
2888 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) |
2889 | .addReg(CondReg); |
2890 | BuildMI(*BB, &I, DL, TII.get(BrOpcode)) |
2891 | .addMBB(I.getOperand(i: 1).getMBB()); |
2892 | |
2893 | I.eraseFromParent(); |
2894 | return true; |
2895 | } |
2896 | |
2897 | bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE( |
2898 | MachineInstr &I) const { |
2899 | Register DstReg = I.getOperand(i: 0).getReg(); |
2900 | const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); |
2901 | const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; |
2902 | I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); |
2903 | if (IsVGPR) |
2904 | I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); |
2905 | |
2906 | return RBI.constrainGenericRegister( |
2907 | DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); |
2908 | } |
2909 | |
2910 | bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { |
2911 | Register DstReg = I.getOperand(i: 0).getReg(); |
2912 | Register SrcReg = I.getOperand(i: 1).getReg(); |
2913 | Register MaskReg = I.getOperand(i: 2).getReg(); |
2914 | LLT Ty = MRI->getType(Reg: DstReg); |
2915 | LLT MaskTy = MRI->getType(Reg: MaskReg); |
2916 | MachineBasicBlock *BB = I.getParent(); |
2917 | const DebugLoc &DL = I.getDebugLoc(); |
2918 | |
2919 | const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); |
2920 | const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); |
2921 | const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); |
2922 | const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; |
2923 | if (DstRB != SrcRB) // Should only happen for hand written MIR. |
2924 | return false; |
2925 | |
2926 | // Try to avoid emitting a bit operation when we only need to touch half of |
2927 | // the 64-bit pointer. |
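  // For example (a sketch): aligning down with a mask of 0xffffffffffff0000
  // leaves the high 32 bits untouched, so the high half becomes a plain COPY
  // and only the low half needs an S_AND_B32.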
2928 | APInt MaskOnes = KB->getKnownOnes(R: MaskReg).zext(width: 64); |
2929 | const APInt MaskHi32 = APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32); |
2930 | const APInt MaskLo32 = APInt::getLowBitsSet(numBits: 64, loBitsSet: 32); |
2931 | |
2932 | const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32; |
2933 | const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32; |
2934 | |
2935 | if (!IsVGPR && Ty.getSizeInBits() == 64 && |
2936 | !CanCopyLow32 && !CanCopyHi32) { |
2937 | auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg) |
2938 | .addReg(SrcReg) |
2939 | .addReg(MaskReg) |
2940 | .setOperandDead(3); // Dead scc |
2941 | I.eraseFromParent(); |
2942 | return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); |
2943 | } |
2944 | |
2945 | unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; |
2946 | const TargetRegisterClass &RegRC |
2947 | = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; |
2948 | |
2949 | const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *DstRB); |
2950 | const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, Bank: *SrcRB); |
2951 | const TargetRegisterClass *MaskRC = |
2952 | TRI.getRegClassForTypeOnBank(Ty: MaskTy, Bank: *MaskRB); |
2953 | |
2954 | if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI&: *MRI) || |
2955 | !RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI&: *MRI) || |
2956 | !RBI.constrainGenericRegister(Reg: MaskReg, RC: *MaskRC, MRI&: *MRI)) |
2957 | return false; |
2958 | |
2959 | if (Ty.getSizeInBits() == 32) { |
2960 | assert(MaskTy.getSizeInBits() == 32 && |
2961 | "ptrmask should have been narrowed during legalize" ); |
2962 | |
2963 | auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) |
2964 | .addReg(SrcReg) |
2965 | .addReg(MaskReg); |
2966 | |
2967 | if (!IsVGPR) |
2968 | NewOp.setOperandDead(3); // Dead scc |
2969 | I.eraseFromParent(); |
2970 | return true; |
2971 | } |
2972 | |
2973 | Register HiReg = MRI->createVirtualRegister(RegClass: &RegRC); |
2974 | Register LoReg = MRI->createVirtualRegister(RegClass: &RegRC); |
2975 | |
2976 | // Extract the subregisters from the source pointer. |
2977 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) |
2978 | .addReg(SrcReg, 0, AMDGPU::sub0); |
2979 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) |
2980 | .addReg(SrcReg, 0, AMDGPU::sub1); |
2981 | |
2982 | Register MaskedLo, MaskedHi; |
2983 | |
2984 | if (CanCopyLow32) { |
2985 | // If all the bits in the low half are 1, we only need a copy for it. |
2986 | MaskedLo = LoReg; |
2987 | } else { |
2988 | // Extract the mask subregister and apply the and. |
2989 | Register MaskLo = MRI->createVirtualRegister(RegClass: &RegRC); |
2990 | MaskedLo = MRI->createVirtualRegister(RegClass: &RegRC); |
2991 | |
2992 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) |
2993 | .addReg(MaskReg, 0, AMDGPU::sub0); |
2994 | BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) |
2995 | .addReg(LoReg) |
2996 | .addReg(MaskLo); |
2997 | } |
2998 | |
2999 | if (CanCopyHi32) { |
3000 | // If all the bits in the high half are 1, we only need a copy for it. |
3001 | MaskedHi = HiReg; |
3002 | } else { |
3003 | Register MaskHi = MRI->createVirtualRegister(RegClass: &RegRC); |
3004 | MaskedHi = MRI->createVirtualRegister(RegClass: &RegRC); |
3005 | |
3006 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) |
3007 | .addReg(MaskReg, 0, AMDGPU::sub1); |
3008 | BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) |
3009 | .addReg(HiReg) |
3010 | .addReg(MaskHi); |
3011 | } |
3012 | |
3013 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) |
3014 | .addReg(MaskedLo) |
3015 | .addImm(AMDGPU::sub0) |
3016 | .addReg(MaskedHi) |
3017 | .addImm(AMDGPU::sub1); |
3018 | I.eraseFromParent(); |
3019 | return true; |
3020 | } |
3021 | |
3022 | /// Return the register to use for the index value, and the subregister to use |
3023 | /// for the indirectly accessed register. |
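///
/// For example (a sketch, for a 128-bit vector of 32-bit elements): an index
/// of %base + 1 returns {%base, sub1}, so the movrel addresses the sub1
/// subregister directly and no add of the constant offset is materialized.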
3024 | static std::pair<Register, unsigned> |
3025 | computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, |
3026 | const TargetRegisterClass *SuperRC, Register IdxReg, |
3027 | unsigned EltSize, GISelKnownBits &KnownBits) { |
3028 | Register IdxBaseReg; |
3029 | int Offset; |
3030 | |
3031 | std::tie(args&: IdxBaseReg, args&: Offset) = |
3032 | AMDGPU::getBaseWithConstantOffset(MRI, Reg: IdxReg, KnownBits: &KnownBits); |
3033 | if (IdxBaseReg == AMDGPU::NoRegister) { |
3034 | // This will happen if the index is a known constant. This should ordinarily |
3035 | // be legalized out, but handle it as a register just in case. |
3036 | assert(Offset == 0); |
3037 | IdxBaseReg = IdxReg; |
3038 | } |
3039 | |
3040 | ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(RC: SuperRC, EltSize); |
3041 | |
3042 | // Skip out of bounds offsets, or else we would end up using an undefined |
3043 | // register. |
3044 | if (static_cast<unsigned>(Offset) >= SubRegs.size()) |
3045 | return std::pair(IdxReg, SubRegs[0]); |
3046 | return std::pair(IdxBaseReg, SubRegs[Offset]); |
3047 | } |
3048 | |
bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3050 | MachineInstr &MI) const { |
3051 | Register DstReg = MI.getOperand(i: 0).getReg(); |
3052 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
3053 | Register IdxReg = MI.getOperand(i: 2).getReg(); |
3054 | |
3055 | LLT DstTy = MRI->getType(Reg: DstReg); |
3056 | LLT SrcTy = MRI->getType(Reg: SrcReg); |
3057 | |
3058 | const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); |
3059 | const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); |
3060 | const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); |
3061 | |
  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
3064 | if (IdxRB->getID() != AMDGPU::SGPRRegBankID) |
3065 | return false; |
3066 | |
3067 | const TargetRegisterClass *SrcRC = |
3068 | TRI.getRegClassForTypeOnBank(Ty: SrcTy, Bank: *SrcRB); |
3069 | const TargetRegisterClass *DstRC = |
3070 | TRI.getRegClassForTypeOnBank(Ty: DstTy, Bank: *DstRB); |
3071 | if (!SrcRC || !DstRC) |
3072 | return false; |
3073 | if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || |
3074 | !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || |
3075 | !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) |
3076 | return false; |
3077 | |
3078 | MachineBasicBlock *BB = MI.getParent(); |
3079 | const DebugLoc &DL = MI.getDebugLoc(); |
3080 | const bool Is64 = DstTy.getSizeInBits() == 64; |
3081 | |
3082 | unsigned SubReg; |
3083 | std::tie(args&: IdxReg, args&: SubReg) = computeIndirectRegIndex( |
3084 | MRI&: *MRI, TRI, SuperRC: SrcRC, IdxReg, EltSize: DstTy.getSizeInBits() / 8, KnownBits&: *KB); |
3085 | |
3086 | if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { |
3087 | if (DstTy.getSizeInBits() != 32 && !Is64) |
3088 | return false; |
3089 | |
3090 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) |
3091 | .addReg(IdxReg); |
3092 | |
3093 | unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; |
3094 | BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) |
3095 | .addReg(SrcReg, 0, SubReg) |
3096 | .addReg(SrcReg, RegState::Implicit); |
3097 | MI.eraseFromParent(); |
3098 | return true; |
3099 | } |
3100 | |
3101 | if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) |
3102 | return false; |
3103 | |
3104 | if (!STI.useVGPRIndexMode()) { |
3105 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) |
3106 | .addReg(IdxReg); |
3107 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) |
3108 | .addReg(SrcReg, 0, SubReg) |
3109 | .addReg(SrcReg, RegState::Implicit); |
3110 | MI.eraseFromParent(); |
3111 | return true; |
3112 | } |
3113 | |
3114 | const MCInstrDesc &GPRIDXDesc = |
3115 | TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(*SrcRC), IsIndirectSrc: true); |
3116 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg) |
3117 | .addReg(RegNo: SrcReg) |
3118 | .addReg(RegNo: IdxReg) |
3119 | .addImm(Val: SubReg); |
3120 | |
3121 | MI.eraseFromParent(); |
3122 | return true; |
3123 | } |
3124 | |
3125 | // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd |
3126 | bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( |
3127 | MachineInstr &MI) const { |
3128 | Register DstReg = MI.getOperand(i: 0).getReg(); |
3129 | Register VecReg = MI.getOperand(i: 1).getReg(); |
3130 | Register ValReg = MI.getOperand(i: 2).getReg(); |
3131 | Register IdxReg = MI.getOperand(i: 3).getReg(); |
3132 | |
3133 | LLT VecTy = MRI->getType(Reg: DstReg); |
3134 | LLT ValTy = MRI->getType(Reg: ValReg); |
3135 | unsigned VecSize = VecTy.getSizeInBits(); |
3136 | unsigned ValSize = ValTy.getSizeInBits(); |
3137 | |
3138 | const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); |
3139 | const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); |
3140 | const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); |
3141 | |
3142 | assert(VecTy.getElementType() == ValTy); |
3143 | |
  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
3146 | if (IdxRB->getID() != AMDGPU::SGPRRegBankID) |
3147 | return false; |
3148 | |
3149 | const TargetRegisterClass *VecRC = |
3150 | TRI.getRegClassForTypeOnBank(Ty: VecTy, Bank: *VecRB); |
3151 | const TargetRegisterClass *ValRC = |
3152 | TRI.getRegClassForTypeOnBank(Ty: ValTy, Bank: *ValRB); |
3153 | |
3154 | if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || |
3155 | !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || |
3156 | !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || |
3157 | !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) |
3158 | return false; |
3159 | |
3160 | if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) |
3161 | return false; |
3162 | |
3163 | unsigned SubReg; |
3164 | std::tie(args&: IdxReg, args&: SubReg) = |
3165 | computeIndirectRegIndex(MRI&: *MRI, TRI, SuperRC: VecRC, IdxReg, EltSize: ValSize / 8, KnownBits&: *KB); |
3166 | |
3167 | const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && |
3168 | STI.useVGPRIndexMode(); |
3169 | |
3170 | MachineBasicBlock *BB = MI.getParent(); |
3171 | const DebugLoc &DL = MI.getDebugLoc(); |
3172 | |
3173 | if (!IndexMode) { |
3174 | BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) |
3175 | .addReg(IdxReg); |
3176 | |
3177 | const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo( |
3178 | VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID); |
3179 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: RegWriteOp, DestReg: DstReg) |
3180 | .addReg(RegNo: VecReg) |
3181 | .addReg(RegNo: ValReg) |
3182 | .addImm(Val: SubReg); |
3183 | MI.eraseFromParent(); |
3184 | return true; |
3185 | } |
3186 | |
3187 | const MCInstrDesc &GPRIDXDesc = |
3188 | TII.getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(*VecRC), IsIndirectSrc: false); |
3189 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: GPRIDXDesc, DestReg: DstReg) |
3190 | .addReg(RegNo: VecReg) |
3191 | .addReg(RegNo: ValReg) |
3192 | .addReg(RegNo: IdxReg) |
3193 | .addImm(Val: SubReg); |
3194 | |
3195 | MI.eraseFromParent(); |
3196 | return true; |
3197 | } |
3198 | |
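// Select a raw/struct buffer-load-to-LDS intrinsic into the matching
// BUFFER_LOAD_*_LDS_* pseudo. As a rough sketch, a struct variant with a
// vindex and a nonzero voffset takes the BOTHEN form, with the two VGPRs
// packed into a 64-bit REG_SEQUENCE address operand.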
3199 | bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { |
3200 | assert(!AMDGPU::isGFX12Plus(STI)); |
3201 | unsigned Opc; |
3202 | unsigned Size = MI.getOperand(i: 3).getImm(); |
3203 | |
3204 | // The struct intrinsic variants add one additional operand over raw. |
3205 | const bool HasVIndex = MI.getNumOperands() == 9; |
3206 | Register VIndex; |
3207 | int OpOffset = 0; |
3208 | if (HasVIndex) { |
3209 | VIndex = MI.getOperand(i: 4).getReg(); |
3210 | OpOffset = 1; |
3211 | } |
3212 | |
3213 | Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg(); |
3214 | std::optional<ValueAndVReg> MaybeVOffset = |
3215 | getIConstantVRegValWithLookThrough(VReg: VOffset, MRI: *MRI); |
3216 | const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue(); |
3217 | |
3218 | switch (Size) { |
3219 | default: |
3220 | return false; |
3221 | case 1: |
3222 | Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN |
3223 | : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN |
3224 | : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN |
3225 | : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; |
3226 | break; |
3227 | case 2: |
3228 | Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN |
3229 | : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN |
3230 | : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN |
3231 | : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; |
3232 | break; |
3233 | case 4: |
3234 | Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN |
3235 | : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN |
3236 | : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN |
3237 | : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; |
3238 | break; |
3239 | } |
3240 | |
3241 | MachineBasicBlock *MBB = MI.getParent(); |
3242 | const DebugLoc &DL = MI.getDebugLoc(); |
3243 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) |
3244 | .add(MI.getOperand(2)); |
3245 | |
3246 | auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)); |
3247 | |
3248 | if (HasVIndex && HasVOffset) { |
3249 | Register IdxReg = MRI->createVirtualRegister(RegClass: TRI.getVGPR64Class()); |
3250 | BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) |
3251 | .addReg(VIndex) |
3252 | .addImm(AMDGPU::sub0) |
3253 | .addReg(VOffset) |
3254 | .addImm(AMDGPU::sub1); |
3255 | |
3256 | MIB.addReg(IdxReg); |
3257 | } else if (HasVIndex) { |
3258 | MIB.addReg(VIndex); |
3259 | } else if (HasVOffset) { |
3260 | MIB.addReg(VOffset); |
3261 | } |
3262 | |
3263 | MIB.add(MI.getOperand(i: 1)); // rsrc |
3264 | MIB.add(MI.getOperand(i: 5 + OpOffset)); // soffset |
3265 | MIB.add(MI.getOperand(i: 6 + OpOffset)); // imm offset |
3266 | unsigned Aux = MI.getOperand(i: 7 + OpOffset).getImm(); |
3267 | MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol |
3268 | MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz |
3269 | |
3270 | MachineMemOperand *LoadMMO = *MI.memoperands_begin(); |
3271 | MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); |
3272 | LoadPtrI.Offset = MI.getOperand(i: 6 + OpOffset).getImm(); |
3273 | MachinePointerInfo StorePtrI = LoadPtrI; |
3274 | StorePtrI.V = nullptr; |
3275 | StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; |
3276 | |
3277 | auto F = LoadMMO->getFlags() & |
3278 | ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); |
3279 | LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, |
3280 | Size, BaseAlignment: LoadMMO->getBaseAlign()); |
3281 | |
3282 | MachineMemOperand *StoreMMO = |
3283 | MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, |
3284 | Size: sizeof(int32_t), BaseAlignment: LoadMMO->getBaseAlign()); |
3285 | |
3286 | MIB.setMemRefs({LoadMMO, StoreMMO}); |
3287 | |
3288 | MI.eraseFromParent(); |
3289 | return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); |
3290 | } |
3291 | |
3292 | /// Match a zero extend from a 32-bit value to 64-bits. |
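/// e.g. both %r:(s64) = G_ZEXT %x:(s32) and the legalized
/// %r:(s64) = G_MERGE_VALUES %x:(s32), 0 return %x; anything else returns an
/// invalid Register.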
3293 | static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { |
3294 | Register ZExtSrc; |
3295 | if (mi_match(R: Reg, MRI, P: m_GZExt(Src: m_Reg(R&: ZExtSrc)))) |
3296 | return MRI.getType(Reg: ZExtSrc) == LLT::scalar(SizeInBits: 32) ? ZExtSrc : Register(); |
3297 | |
3298 | // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) |
3299 | const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); |
3300 | if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) |
3301 | return Register(); |
3302 | |
3303 | assert(Def->getNumOperands() == 3 && |
3304 | MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); |
3305 | if (mi_match(R: Def->getOperand(i: 2).getReg(), MRI, P: m_ZeroInt())) { |
3306 | return Def->getOperand(i: 1).getReg(); |
3307 | } |
3308 | |
3309 | return Register(); |
3310 | } |
3311 | |
bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3313 | unsigned Opc; |
3314 | unsigned Size = MI.getOperand(i: 3).getImm(); |
3315 | |
3316 | switch (Size) { |
3317 | default: |
3318 | return false; |
3319 | case 1: |
3320 | Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; |
3321 | break; |
3322 | case 2: |
3323 | Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; |
3324 | break; |
3325 | case 4: |
3326 | Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; |
3327 | break; |
3328 | } |
3329 | |
3330 | MachineBasicBlock *MBB = MI.getParent(); |
3331 | const DebugLoc &DL = MI.getDebugLoc(); |
3332 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) |
3333 | .add(MI.getOperand(2)); |
3334 | |
3335 | Register Addr = MI.getOperand(i: 1).getReg(); |
3336 | Register VOffset; |
3337 | // Try to split SAddr and VOffset. Global and LDS pointers share the same |
3338 | // immediate offset, so we cannot use a regular SelectGlobalSAddr(). |
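  // Roughly: %addr = G_PTR_ADD %sgpr_base, (zext %voff:(s32)) splits into
  // SAddr = %sgpr_base and VOffset = %voff.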
3339 | if (!isSGPR(Reg: Addr)) { |
3340 | auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI); |
3341 | if (isSGPR(Reg: AddrDef->Reg)) { |
3342 | Addr = AddrDef->Reg; |
3343 | } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { |
3344 | Register SAddr = |
3345 | getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI); |
3346 | if (isSGPR(Reg: SAddr)) { |
3347 | Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg(); |
3348 | if (Register Off = matchZeroExtendFromS32(MRI&: *MRI, Reg: PtrBaseOffset)) { |
3349 | Addr = SAddr; |
3350 | VOffset = Off; |
3351 | } |
3352 | } |
3353 | } |
3354 | } |
3355 | |
3356 | if (isSGPR(Reg: Addr)) { |
3357 | Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc); |
3358 | if (!VOffset) { |
3359 | VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
3360 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset) |
3361 | .addImm(0); |
3362 | } |
3363 | } |
3364 | |
3365 | auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) |
3366 | .addReg(Addr); |
3367 | |
3368 | if (isSGPR(Reg: Addr)) |
3369 | MIB.addReg(VOffset); |
3370 | |
3371 | MIB.add(MI.getOperand(i: 4)) // offset |
3372 | .add(MI.getOperand(i: 5)); // cpol |
3373 | |
3374 | MachineMemOperand *LoadMMO = *MI.memoperands_begin(); |
3375 | MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); |
3376 | LoadPtrI.Offset = MI.getOperand(i: 4).getImm(); |
3377 | MachinePointerInfo StorePtrI = LoadPtrI; |
3378 | LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; |
3379 | StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; |
3380 | auto F = LoadMMO->getFlags() & |
3381 | ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); |
3382 | LoadMMO = MF->getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, |
3383 | Size, BaseAlignment: LoadMMO->getBaseAlign()); |
3384 | MachineMemOperand *StoreMMO = |
3385 | MF->getMachineMemOperand(PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, |
3386 | Size: sizeof(int32_t), BaseAlignment: Align(4)); |
3387 | |
3388 | MIB.setMemRefs({LoadMMO, StoreMMO}); |
3389 | |
3390 | MI.eraseFromParent(); |
3391 | return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); |
3392 | } |
3393 | |
bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3395 | MI.setDesc(TII.get(MI.getOperand(i: 1).getImm())); |
3396 | MI.removeOperand(OpNo: 1); |
3397 | MI.addImplicitDefUseOperands(MF&: *MI.getParent()->getParent()); |
3398 | return true; |
3399 | } |
3400 | |
3401 | bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { |
3402 | unsigned Opc; |
3403 | switch (cast<GIntrinsic>(Val&: MI).getIntrinsicID()) { |
3404 | case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: |
3405 | Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64; |
3406 | break; |
3407 | case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: |
3408 | Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64; |
3409 | break; |
3410 | case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: |
3411 | Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64; |
3412 | break; |
3413 | case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: |
3414 | Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64; |
3415 | break; |
3416 | case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: |
3417 | Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64; |
3418 | break; |
3419 | case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: |
3420 | Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64; |
3421 | break; |
3422 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: |
3423 | Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64; |
3424 | break; |
3425 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: |
3426 | Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64; |
3427 | break; |
3428 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: |
3429 | Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64; |
3430 | break; |
3431 | case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: |
3432 | Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64; |
3433 | break; |
3434 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: |
3435 | Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64; |
3436 | break; |
3437 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: |
3438 | Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64; |
3439 | break; |
3440 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: |
3441 | Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64; |
3442 | break; |
3443 | case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: |
3444 | Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64; |
3445 | break; |
3446 | default: |
3447 | llvm_unreachable("unhandled smfmac intrinsic" ); |
3448 | } |
3449 | |
3450 | auto VDst_In = MI.getOperand(i: 4); |
3451 | |
3452 | MI.setDesc(TII.get(Opc)); |
3453 | MI.removeOperand(OpNo: 4); // VDst_In |
3454 | MI.removeOperand(OpNo: 1); // Intrinsic ID |
3455 | MI.addOperand(Op: VDst_In); // Readd VDst_In to the end |
3456 | MI.addImplicitDefUseOperands(MF&: *MI.getParent()->getParent()); |
3457 | return true; |
3458 | } |
3459 | |
3460 | bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { |
3461 | Register DstReg = MI.getOperand(i: 0).getReg(); |
3462 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
3463 | const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); |
3464 | const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; |
3465 | MachineBasicBlock *MBB = MI.getParent(); |
3466 | const DebugLoc &DL = MI.getDebugLoc(); |
3467 | |
3468 | if (IsVALU) { |
3469 | BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) |
3470 | .addImm(Subtarget->getWavefrontSizeLog2()) |
3471 | .addReg(SrcReg); |
3472 | } else { |
3473 | BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) |
3474 | .addReg(SrcReg) |
3475 | .addImm(Subtarget->getWavefrontSizeLog2()) |
3476 | .setOperandDead(3); // Dead scc |
3477 | } |
3478 | |
3479 | const TargetRegisterClass &RC = |
3480 | IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; |
3481 | if (!RBI.constrainGenericRegister(Reg: DstReg, RC, MRI&: *MRI)) |
3482 | return false; |
3483 | |
3484 | MI.eraseFromParent(); |
3485 | return true; |
3486 | } |
3487 | |
3488 | bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const { |
3489 | Register SrcReg = MI.getOperand(i: 0).getReg(); |
3490 | if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) |
3491 | return false; |
3492 | |
3493 | MachineInstr *DefMI = MRI->getVRegDef(Reg: SrcReg); |
3494 | Register SP = |
3495 | Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore(); |
3496 | Register WaveAddr = getWaveAddress(Def: DefMI); |
3497 | MachineBasicBlock *MBB = MI.getParent(); |
3498 | const DebugLoc &DL = MI.getDebugLoc(); |
3499 | |
3500 | if (!WaveAddr) { |
3501 | WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
3502 | BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr) |
3503 | .addReg(SrcReg) |
3504 | .addImm(Subtarget->getWavefrontSizeLog2()) |
3505 | .setOperandDead(3); // Dead scc |
3506 | } |
3507 | |
3508 | BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP) |
3509 | .addReg(WaveAddr); |
3510 | |
3511 | MI.eraseFromParent(); |
3512 | return true; |
3513 | } |
3514 | |
3515 | bool AMDGPUInstructionSelector::select(MachineInstr &I) { |
3516 | |
3517 | if (!I.isPreISelOpcode()) { |
3518 | if (I.isCopy()) |
3519 | return selectCOPY(I); |
3520 | return true; |
3521 | } |
3522 | |
3523 | switch (I.getOpcode()) { |
3524 | case TargetOpcode::G_AND: |
3525 | case TargetOpcode::G_OR: |
3526 | case TargetOpcode::G_XOR: |
3527 | if (selectImpl(I, CoverageInfo&: *CoverageInfo)) |
3528 | return true; |
3529 | return selectG_AND_OR_XOR(I); |
3530 | case TargetOpcode::G_ADD: |
3531 | case TargetOpcode::G_SUB: |
3532 | if (selectImpl(I, CoverageInfo&: *CoverageInfo)) |
3533 | return true; |
3534 | return selectG_ADD_SUB(I); |
3535 | case TargetOpcode::G_UADDO: |
3536 | case TargetOpcode::G_USUBO: |
3537 | case TargetOpcode::G_UADDE: |
3538 | case TargetOpcode::G_USUBE: |
3539 | return selectG_UADDO_USUBO_UADDE_USUBE(I); |
3540 | case AMDGPU::G_AMDGPU_MAD_U64_U32: |
3541 | case AMDGPU::G_AMDGPU_MAD_I64_I32: |
3542 | return selectG_AMDGPU_MAD_64_32(I); |
3543 | case TargetOpcode::G_INTTOPTR: |
3544 | case TargetOpcode::G_BITCAST: |
3545 | case TargetOpcode::G_PTRTOINT: |
3546 | return selectCOPY(I); |
3547 | case TargetOpcode::G_CONSTANT: |
3548 | case TargetOpcode::G_FCONSTANT: |
3549 | return selectG_CONSTANT(I); |
3550 | case TargetOpcode::G_FNEG: |
3551 | if (selectImpl(I, CoverageInfo&: *CoverageInfo)) |
3552 | return true; |
3553 | return selectG_FNEG(MI&: I); |
3554 | case TargetOpcode::G_FABS: |
3555 | if (selectImpl(I, CoverageInfo&: *CoverageInfo)) |
3556 | return true; |
3557 | return selectG_FABS(MI&: I); |
3558 | case TargetOpcode::G_EXTRACT: |
3559 | return selectG_EXTRACT(I); |
3560 | case TargetOpcode::G_MERGE_VALUES: |
3561 | case TargetOpcode::G_CONCAT_VECTORS: |
3562 | return selectG_MERGE_VALUES(MI&: I); |
3563 | case TargetOpcode::G_UNMERGE_VALUES: |
3564 | return selectG_UNMERGE_VALUES(MI&: I); |
3565 | case TargetOpcode::G_BUILD_VECTOR: |
3566 | case TargetOpcode::G_BUILD_VECTOR_TRUNC: |
3567 | return selectG_BUILD_VECTOR(MI&: I); |
3568 | case TargetOpcode::G_PTR_ADD: |
3569 | if (selectImpl(I, CoverageInfo&: *CoverageInfo)) |
3570 | return true; |
3571 | return selectG_PTR_ADD(I); |
3572 | case TargetOpcode::G_IMPLICIT_DEF: |
3573 | return selectG_IMPLICIT_DEF(I); |
3574 | case TargetOpcode::G_FREEZE: |
3575 | return selectCOPY(I); |
3576 | case TargetOpcode::G_INSERT: |
3577 | return selectG_INSERT(I); |
3578 | case TargetOpcode::G_INTRINSIC: |
3579 | case TargetOpcode::G_INTRINSIC_CONVERGENT: |
3580 | return selectG_INTRINSIC(I); |
3581 | case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: |
3582 | case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: |
3583 | return selectG_INTRINSIC_W_SIDE_EFFECTS(I); |
3584 | case TargetOpcode::G_ICMP: |
3585 | case TargetOpcode::G_FCMP: |
3586 | if (selectG_ICMP_or_FCMP(I)) |
3587 | return true; |
3588 | return selectImpl(I, CoverageInfo&: *CoverageInfo); |
3589 | case TargetOpcode::G_LOAD: |
3590 | case TargetOpcode::G_STORE: |
3591 | case TargetOpcode::G_ATOMIC_CMPXCHG: |
3592 | case TargetOpcode::G_ATOMICRMW_XCHG: |
3593 | case TargetOpcode::G_ATOMICRMW_ADD: |
3594 | case TargetOpcode::G_ATOMICRMW_SUB: |
3595 | case TargetOpcode::G_ATOMICRMW_AND: |
3596 | case TargetOpcode::G_ATOMICRMW_OR: |
3597 | case TargetOpcode::G_ATOMICRMW_XOR: |
3598 | case TargetOpcode::G_ATOMICRMW_MIN: |
3599 | case TargetOpcode::G_ATOMICRMW_MAX: |
3600 | case TargetOpcode::G_ATOMICRMW_UMIN: |
3601 | case TargetOpcode::G_ATOMICRMW_UMAX: |
3602 | case TargetOpcode::G_ATOMICRMW_UINC_WRAP: |
3603 | case TargetOpcode::G_ATOMICRMW_UDEC_WRAP: |
3604 | case TargetOpcode::G_ATOMICRMW_FADD: |
3605 | case AMDGPU::G_AMDGPU_ATOMIC_FMIN: |
3606 | case AMDGPU::G_AMDGPU_ATOMIC_FMAX: |
3607 | return selectG_LOAD_STORE_ATOMICRMW(I); |
3608 | case TargetOpcode::G_SELECT: |
3609 | return selectG_SELECT(I); |
3610 | case TargetOpcode::G_TRUNC: |
3611 | return selectG_TRUNC(I); |
3612 | case TargetOpcode::G_SEXT: |
3613 | case TargetOpcode::G_ZEXT: |
3614 | case TargetOpcode::G_ANYEXT: |
3615 | case TargetOpcode::G_SEXT_INREG: |
    // This is a workaround. For an extension from type i1, `selectImpl()` uses
    // patterns from the TD file and generates an illegal VGPR to SGPR COPY, as
    // a value of type i1 can only be held in an SGPR class.
3619 | if (MRI->getType(Reg: I.getOperand(i: 1).getReg()) != LLT::scalar(SizeInBits: 1) && |
3620 | selectImpl(I, CoverageInfo&: *CoverageInfo)) |
3621 | return true; |
3622 | return selectG_SZA_EXT(I); |
3623 | case TargetOpcode::G_FPEXT: |
3624 | if (selectG_FPEXT(I)) |
3625 | return true; |
3626 | return selectImpl(I, CoverageInfo&: *CoverageInfo); |
3627 | case TargetOpcode::G_BRCOND: |
3628 | return selectG_BRCOND(I); |
3629 | case TargetOpcode::G_GLOBAL_VALUE: |
3630 | return selectG_GLOBAL_VALUE(I); |
3631 | case TargetOpcode::G_PTRMASK: |
3632 | return selectG_PTRMASK(I); |
3633 | case TargetOpcode::G_EXTRACT_VECTOR_ELT: |
3634 | return selectG_EXTRACT_VECTOR_ELT(MI&: I); |
3635 | case TargetOpcode::G_INSERT_VECTOR_ELT: |
3636 | return selectG_INSERT_VECTOR_ELT(MI&: I); |
3637 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: |
3638 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: |
3639 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: |
3640 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { |
3641 | const AMDGPU::ImageDimIntrinsicInfo *Intr = |
3642 | AMDGPU::getImageDimIntrinsicInfo(Intr: AMDGPU::getIntrinsicID(I)); |
    assert(Intr && "not an image intrinsic with image pseudo");
3644 | return selectImageIntrinsic(MI&: I, Intr); |
3645 | } |
3646 | case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: |
3647 | return selectBVHIntrinsic(MI&: I); |
3648 | case AMDGPU::G_SBFX: |
3649 | case AMDGPU::G_UBFX: |
3650 | return selectG_SBFX_UBFX(MI&: I); |
3651 | case AMDGPU::G_SI_CALL: |
3652 | I.setDesc(TII.get(AMDGPU::SI_CALL)); |
3653 | return true; |
3654 | case AMDGPU::G_AMDGPU_WAVE_ADDRESS: |
3655 | return selectWaveAddress(MI&: I); |
3656 | case AMDGPU::G_STACKRESTORE: |
3657 | return selectStackRestore(MI&: I); |
3658 | case AMDGPU::G_PHI: |
3659 | return selectPHI(I); |
3660 | default: |
3661 | return selectImpl(I, CoverageInfo&: *CoverageInfo); |
3662 | } |
3663 | return false; |
3664 | } |
3665 | |
3666 | InstructionSelector::ComplexRendererFns |
3667 | AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { |
3668 | return {{ |
3669 | [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); } |
3670 | }}; |
3671 | |
3672 | } |
3673 | |
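// Peel source modifiers off Root and accumulate them into SISrcMods bits,
// e.g. (a sketch) %src = G_FNEG %x selects as %x with SISrcMods::NEG set;
// when canonicalizing, G_FSUB [+-]0, %x folds the same way, and G_FABS sets
// SISrcMods::ABS when allowed. Returns the stripped source register and the
// accumulated modifier mask.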
3674 | std::pair<Register, unsigned> |
3675 | AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, |
3676 | bool IsCanonicalizing, |
3677 | bool AllowAbs, bool OpSel) const { |
3678 | Register Src = Root.getReg(); |
3679 | unsigned Mods = 0; |
3680 | MachineInstr *MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI); |
3681 | |
3682 | if (MI->getOpcode() == AMDGPU::G_FNEG) { |
3683 | Src = MI->getOperand(i: 1).getReg(); |
3684 | Mods |= SISrcMods::NEG; |
3685 | MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI); |
3686 | } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) { |
3687 | // Fold fsub [+-]0 into fneg. This may not have folded depending on the |
3688 | // denormal mode, but we're implicitly canonicalizing in a source operand. |
3689 | const ConstantFP *LHS = |
3690 | getConstantFPVRegVal(VReg: MI->getOperand(i: 1).getReg(), MRI: *MRI); |
3691 | if (LHS && LHS->isZero()) { |
3692 | Mods |= SISrcMods::NEG; |
3693 | Src = MI->getOperand(i: 2).getReg(); |
3694 | } |
3695 | } |
3696 | |
3697 | if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) { |
3698 | Src = MI->getOperand(i: 1).getReg(); |
3699 | Mods |= SISrcMods::ABS; |
3700 | } |
3701 | |
3702 | if (OpSel) |
3703 | Mods |= SISrcMods::OP_SEL_0; |
3704 | |
3705 | return std::pair(Src, Mods); |
3706 | } |
3707 | |
3708 | Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded( |
3709 | Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, |
3710 | bool ForceVGPR) const { |
3711 | if ((Mods != 0 || ForceVGPR) && |
3712 | RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { |
3713 | |
3714 | // If we looked through copies to find source modifiers on an SGPR operand, |
3715 | // we now have an SGPR register source. To avoid potentially violating the |
3716 | // constant bus restriction, we need to insert a copy to a VGPR. |
3717 | Register VGPRSrc = MRI->cloneVirtualRegister(VReg: Root.getReg()); |
3718 | BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(), |
3719 | TII.get(AMDGPU::COPY), VGPRSrc) |
3720 | .addReg(Src); |
3721 | Src = VGPRSrc; |
3722 | } |
3723 | |
3724 | return Src; |
3725 | } |
3726 | |
3727 | /// |
3728 | /// This will select either an SGPR or VGPR operand and will save us from |
3729 | /// having to write an extra tablegen pattern. |
3730 | InstructionSelector::ComplexRendererFns |
3731 | AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { |
3732 | return {{ |
3733 | [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); } |
3734 | }}; |
3735 | } |
3736 | |
3737 | InstructionSelector::ComplexRendererFns |
3738 | AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { |
3739 | Register Src; |
3740 | unsigned Mods; |
3741 | std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root); |
3742 | |
3743 | return {{ |
3744 | [=](MachineInstrBuilder &MIB) { |
3745 | MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB)); |
3746 | }, |
3747 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods |
3748 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp |
3749 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod |
3750 | }}; |
3751 | } |
3752 | |
3753 | InstructionSelector::ComplexRendererFns |
3754 | AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { |
3755 | Register Src; |
3756 | unsigned Mods; |
3757 | std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root, |
3758 | /*IsCanonicalizing=*/true, |
3759 | /*AllowAbs=*/false); |
3760 | |
3761 | return {{ |
3762 | [=](MachineInstrBuilder &MIB) { |
3763 | MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB)); |
3764 | }, |
3765 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods |
3766 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp |
3767 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod |
3768 | }}; |
3769 | } |
3770 | |
3771 | InstructionSelector::ComplexRendererFns |
3772 | AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { |
3773 | return {{ |
3774 | [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); }, |
3775 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); }, // clamp |
3776 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // omod |
3777 | }}; |
3778 | } |
3779 | |
3780 | InstructionSelector::ComplexRendererFns |
3781 | AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { |
3782 | Register Src; |
3783 | unsigned Mods; |
3784 | std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root); |
3785 | |
3786 | return {{ |
3787 | [=](MachineInstrBuilder &MIB) { |
3788 | MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB)); |
3789 | }, |
3790 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods |
3791 | }}; |
3792 | } |
3793 | |
3794 | InstructionSelector::ComplexRendererFns |
3795 | AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing( |
3796 | MachineOperand &Root) const { |
3797 | Register Src; |
3798 | unsigned Mods; |
3799 | std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false); |
3800 | |
3801 | return {{ |
3802 | [=](MachineInstrBuilder &MIB) { |
3803 | MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB)); |
3804 | }, |
3805 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods |
3806 | }}; |
3807 | } |
3808 | |
3809 | InstructionSelector::ComplexRendererFns |
3810 | AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { |
3811 | Register Src; |
3812 | unsigned Mods; |
3813 | std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true, |
3814 | /*AllowAbs=*/false); |
3815 | |
3816 | return {{ |
3817 | [=](MachineInstrBuilder &MIB) { |
3818 | MIB.addReg(RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB)); |
3819 | }, |
3820 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods |
3821 | }}; |
3822 | } |
3823 | |
3824 | InstructionSelector::ComplexRendererFns |
3825 | AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { |
3826 | Register Reg = Root.getReg(); |
3827 | const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI: *MRI); |
3828 | if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS) |
3829 | return {}; |
3830 | return {{ |
3831 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); }, |
3832 | }}; |
3833 | } |
3834 | |
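// Strip packed-math (VOP3P) modifiers, e.g. (a sketch) G_FNEG of a v2s16
// source folds into NEG | NEG_HI applied to both halves. OP_SEL_1 is always
// set so that the high half of the operand comes from the source's high half,
// the packed default.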
3835 | std::pair<Register, unsigned> |
3836 | AMDGPUInstructionSelector::selectVOP3PModsImpl( |
3837 | Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const { |
3838 | unsigned Mods = 0; |
3839 | MachineInstr *MI = MRI.getVRegDef(Reg: Src); |
3840 | |
3841 | if (MI && MI->getOpcode() == AMDGPU::G_FNEG && |
3842 | // It's possible to see an f32 fneg here, but unlikely. |
3843 | // TODO: Treat f32 fneg as only high bit. |
3844 | MRI.getType(Src) == LLT::fixed_vector(2, 16)) { |
3845 | Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); |
3846 | Src = MI->getOperand(i: 1).getReg(); |
3847 | MI = MRI.getVRegDef(Reg: Src); |
3848 | } |
3849 | |
3850 | // TODO: Handle G_FSUB 0 as fneg |
3851 | |
3852 | // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. |
3853 | (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() |
3854 | |
3855 | // Packed instructions do not have abs modifiers. |
3856 | Mods |= SISrcMods::OP_SEL_1; |
3857 | |
3858 | return std::pair(Src, Mods); |
3859 | } |
3860 | |
3861 | InstructionSelector::ComplexRendererFns |
3862 | AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { |
3863 | MachineRegisterInfo &MRI |
3864 | = Root.getParent()->getParent()->getParent()->getRegInfo(); |
3865 | |
3866 | Register Src; |
3867 | unsigned Mods; |
3868 | std::tie(args&: Src, args&: Mods) = selectVOP3PModsImpl(Src: Root.getReg(), MRI); |
3869 | |
3870 | return {{ |
3871 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }, |
3872 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods |
3873 | }}; |
3874 | } |
3875 | |
3876 | InstructionSelector::ComplexRendererFns |
3877 | AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { |
3878 | MachineRegisterInfo &MRI |
3879 | = Root.getParent()->getParent()->getParent()->getRegInfo(); |
3880 | |
3881 | Register Src; |
3882 | unsigned Mods; |
3883 | std::tie(args&: Src, args&: Mods) = selectVOP3PModsImpl(Src: Root.getReg(), MRI, IsDOT: true); |
3884 | |
3885 | return {{ |
3886 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }, |
3887 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods |
3888 | }}; |
3889 | } |
3890 | |
3891 | InstructionSelector::ComplexRendererFns |
3892 | AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { |
  // A literal i1 value set in the intrinsic represents the SrcMods for the
  // next operand. The value is in the Imm operand as an i1 sign-extended to
  // int64_t: 1 (i.e. -1) promotes packed values to signed, 0 treats them as
  // unsigned.
  assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
         "expected i1 value");
3898 | unsigned Mods = SISrcMods::OP_SEL_1; |
3899 | if (Root.getImm() == -1) |
3900 | Mods ^= SISrcMods::NEG; |
3901 | return {{ |
3902 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods |
3903 | }}; |
3904 | } |
3905 | |
3906 | InstructionSelector::ComplexRendererFns |
3907 | AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( |
3908 | MachineOperand &Root) const { |
3909 | assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && |
3910 | "expected i1 value" ); |
3911 | unsigned Mods = SISrcMods::OP_SEL_1; |
3912 | if (Root.getImm() != 0) |
3913 | Mods |= SISrcMods::OP_SEL_0; |
3914 | |
3915 | return {{ |
3916 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods |
3917 | }}; |
3918 | } |
3919 | |
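// Concatenate 2, 4, or 8 32-bit elements into one wide register tuple via
// REG_SEQUENCE, e.g. (a sketch) two elements become a VReg_64 with the inputs
// placed in sub0 and sub1.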
3920 | static Register buildRegSequence(SmallVectorImpl<Register> &Elts, |
3921 | MachineInstr *InsertPt, |
3922 | MachineRegisterInfo &MRI) { |
3923 | const TargetRegisterClass *DstRegClass; |
3924 | switch (Elts.size()) { |
3925 | case 8: |
3926 | DstRegClass = &AMDGPU::VReg_256RegClass; |
3927 | break; |
3928 | case 4: |
3929 | DstRegClass = &AMDGPU::VReg_128RegClass; |
3930 | break; |
3931 | case 2: |
3932 | DstRegClass = &AMDGPU::VReg_64RegClass; |
3933 | break; |
3934 | default: |
3935 | llvm_unreachable("unhandled Reg sequence size" ); |
3936 | } |
3937 | |
3938 | MachineIRBuilder B(*InsertPt); |
3939 | auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE) |
3940 | .addDef(MRI.createVirtualRegister(DstRegClass)); |
3941 | for (unsigned i = 0; i < Elts.size(); ++i) { |
3942 | MIB.addReg(Elts[i]); |
3943 | MIB.addImm(SIRegisterInfo::getSubRegFromChannel(Channel: i)); |
3944 | } |
3945 | return MIB->getOperand(0).getReg(); |
3946 | } |
3947 | |
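// Fold a neg and/or abs modifier shared by all vector elements into SISrcMods
// bits and rebuild the source from the stripped elements, e.g. (a sketch) a
// vector whose elements are all G_FNEG(G_FABS(x)) yields NEG | NEG_HI with
// the x values re-packed by buildRegSequence.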
3948 | static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, |
3949 | SmallVectorImpl<Register> &Elts, Register &Src, |
3950 | MachineInstr *InsertPt, |
3951 | MachineRegisterInfo &MRI) { |
3952 | if (ModOpcode == TargetOpcode::G_FNEG) { |
3953 | Mods |= SISrcMods::NEG; |
    // Check if all elements also have an abs modifier.
3955 | SmallVector<Register, 8> NegAbsElts; |
3956 | for (auto El : Elts) { |
3957 | Register FabsSrc; |
3958 | if (!mi_match(R: El, MRI, P: m_GFabs(Src: m_Reg(R&: FabsSrc)))) |
3959 | break; |
3960 | NegAbsElts.push_back(Elt: FabsSrc); |
3961 | } |
3962 | if (Elts.size() != NegAbsElts.size()) { |
3963 | // Neg |
3964 | Src = buildRegSequence(Elts, InsertPt, MRI); |
3965 | } else { |
3966 | // Neg and Abs |
3967 | Mods |= SISrcMods::NEG_HI; |
3968 | Src = buildRegSequence(Elts&: NegAbsElts, InsertPt, MRI); |
3969 | } |
3970 | } else { |
3971 | assert(ModOpcode == TargetOpcode::G_FABS); |
3972 | // Abs |
3973 | Mods |= SISrcMods::NEG_HI; |
3974 | Src = buildRegSequence(Elts, InsertPt, MRI); |
3975 | } |
3976 | } |
3977 | |
3978 | InstructionSelector::ComplexRendererFns |
3979 | AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const { |
3980 | Register Src = Root.getReg(); |
3981 | unsigned Mods = SISrcMods::OP_SEL_1; |
3982 | SmallVector<Register, 8> EltsF32; |
3983 | |
3984 | if (GBuildVector *BV = dyn_cast<GBuildVector>(Val: MRI->getVRegDef(Reg: Src))) { |
3985 | assert(BV->getNumSources() > 0); |
    // Based on the first element, decide which mod we match: neg or abs.
3987 | MachineInstr *ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: 0)); |
3988 | unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) |
3989 | ? AMDGPU::G_FNEG |
3990 | : AMDGPU::G_FABS; |
3991 | for (unsigned i = 0; i < BV->getNumSources(); ++i) { |
3992 | ElF32 = MRI->getVRegDef(Reg: BV->getSourceReg(I: i)); |
3993 | if (ElF32->getOpcode() != ModOpcode) |
3994 | break; |
3995 | EltsF32.push_back(Elt: ElF32->getOperand(i: 1).getReg()); |
3996 | } |
3997 | |
    // All elements had the ModOpcode modifier.
3999 | if (BV->getNumSources() == EltsF32.size()) { |
4000 | selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsF32, Src, InsertPt: Root.getParent(), |
4001 | MRI&: *MRI); |
4002 | } |
4003 | } |
4004 | |
4005 | return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }, |
4006 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}}; |
4007 | } |
4008 | |
4009 | InstructionSelector::ComplexRendererFns |
4010 | AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const { |
4011 | Register Src = Root.getReg(); |
4012 | unsigned Mods = SISrcMods::OP_SEL_1; |
4013 | SmallVector<Register, 8> EltsV2F16; |
4014 | |
4015 | if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) { |
4016 | for (unsigned i = 0; i < CV->getNumSources(); ++i) { |
4017 | Register FNegSrc; |
4018 | if (!mi_match(R: CV->getSourceReg(I: i), MRI: *MRI, P: m_GFNeg(Src: m_Reg(R&: FNegSrc)))) |
4019 | break; |
4020 | EltsV2F16.push_back(Elt: FNegSrc); |
4021 | } |
4022 | |
    // All elements had the fneg modifier.
4024 | if (CV->getNumSources() == EltsV2F16.size()) { |
4025 | Mods |= SISrcMods::NEG; |
4026 | Mods |= SISrcMods::NEG_HI; |
4027 | Src = buildRegSequence(Elts&: EltsV2F16, InsertPt: Root.getParent(), MRI&: *MRI); |
4028 | } |
4029 | } |
4030 | |
4031 | return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }, |
4032 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}}; |
4033 | } |
4034 | |
4035 | InstructionSelector::ComplexRendererFns |
4036 | AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const { |
4037 | Register Src = Root.getReg(); |
4038 | unsigned Mods = SISrcMods::OP_SEL_1; |
4039 | SmallVector<Register, 8> EltsV2F16; |
4040 | |
4041 | if (GConcatVectors *CV = dyn_cast<GConcatVectors>(Val: MRI->getVRegDef(Reg: Src))) { |
4042 | assert(CV->getNumSources() > 0); |
4043 | MachineInstr *ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: 0)); |
    // Based on the first element, decide which mod we match: neg or abs.
4045 | unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) |
4046 | ? AMDGPU::G_FNEG |
4047 | : AMDGPU::G_FABS; |
4048 | |
4049 | for (unsigned i = 0; i < CV->getNumSources(); ++i) { |
4050 | ElV2F16 = MRI->getVRegDef(Reg: CV->getSourceReg(I: i)); |
4051 | if (ElV2F16->getOpcode() != ModOpcode) |
4052 | break; |
4053 | EltsV2F16.push_back(Elt: ElV2F16->getOperand(i: 1).getReg()); |
4054 | } |
4055 | |
    // All elements had the ModOpcode modifier.
4057 | if (CV->getNumSources() == EltsV2F16.size()) { |
4058 | MachineIRBuilder B(*Root.getParent()); |
4059 | selectWMMAModsNegAbs(ModOpcode, Mods, Elts&: EltsV2F16, Src, InsertPt: Root.getParent(), |
4060 | MRI&: *MRI); |
4061 | } |
4062 | } |
4063 | |
4064 | return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }, |
4065 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }}}; |
4066 | } |
4067 | |
4068 | InstructionSelector::ComplexRendererFns |
4069 | AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const { |
4070 | std::optional<FPValueAndVReg> FPValReg; |
4071 | if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_GFCstOrSplat(FPValReg))) { |
4072 | if (TII.isInlineConstant(Imm: FPValReg->Value)) { |
4073 | return {{[=](MachineInstrBuilder &MIB) { |
4074 | MIB.addImm(Val: FPValReg->Value.bitcastToAPInt().getSExtValue()); |
4075 | }}}; |
4076 | } |
    // Non-inlineable splat floats should not fall through to the integer
    // immediate checks.
4079 | return {}; |
4080 | } |
4081 | |
4082 | APInt ICst; |
4083 | if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICstOrSplat(Cst&: ICst))) { |
4084 | if (TII.isInlineConstant(Imm: ICst)) { |
4085 | return { |
4086 | {[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ICst.getSExtValue()); }}}; |
4087 | } |
4088 | } |
4089 | |
4090 | return {}; |
4091 | } |
4092 | |
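// Match a byte index extracted from a 32-bit register by a byte-aligned
// shift, e.g. (a sketch) %src = G_LSHR %x, 16 selects %x with index_key = 2;
// with no matching shift the key defaults to 0.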
4093 | InstructionSelector::ComplexRendererFns |
4094 | AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const { |
4095 | Register Src = |
4096 | getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg(); |
4097 | unsigned Key = 0; |
4098 | |
4099 | Register ShiftSrc; |
4100 | std::optional<ValueAndVReg> ShiftAmt; |
4101 | if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) && |
4102 | MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 && |
4103 | ShiftAmt->Value.getZExtValue() % 8 == 0) { |
4104 | Key = ShiftAmt->Value.getZExtValue() / 8; |
4105 | Src = ShiftSrc; |
4106 | } |
4107 | |
4108 | return {{ |
4109 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }, |
4110 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key |
4111 | }}; |
4112 | } |
4113 | |
4114 | InstructionSelector::ComplexRendererFns |
4115 | AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { |
4116 | |
4117 | Register Src = |
4118 | getDefIgnoringCopies(Reg: Root.getReg(), MRI: *MRI)->getOperand(i: 0).getReg(); |
4119 | unsigned Key = 0; |
4120 | |
4121 | Register ShiftSrc; |
4122 | std::optional<ValueAndVReg> ShiftAmt; |
4123 | if (mi_match(R: Src, MRI: *MRI, P: m_GLShr(L: m_Reg(R&: ShiftSrc), R: m_GCst(ValReg&: ShiftAmt))) && |
4124 | MRI->getType(Reg: ShiftSrc).getSizeInBits() == 32 && |
4125 | ShiftAmt->Value.getZExtValue() == 16) { |
4126 | Src = ShiftSrc; |
4127 | Key = 1; |
4128 | } |
4129 | |
4130 | return {{ |
4131 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }, |
4132 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Key); } // index_key |
4133 | }}; |
4134 | } |
4135 | |
4136 | InstructionSelector::ComplexRendererFns |
4137 | AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { |
4138 | Register Src; |
4139 | unsigned Mods; |
4140 | std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root); |
4141 | |
4142 | // FIXME: Handle op_sel |
4143 | return {{ |
4144 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }, |
4145 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods |
4146 | }}; |
4147 | } |
4148 | |
4149 | InstructionSelector::ComplexRendererFns |
4150 | AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const { |
4151 | Register Src; |
4152 | unsigned Mods; |
4153 | std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root, |
4154 | /*IsCanonicalizing=*/true, |
4155 | /*AllowAbs=*/false, |
4156 | /*OpSel=*/false); |
4157 | |
4158 | return {{ |
4159 | [=](MachineInstrBuilder &MIB) { |
4160 | MIB.addReg( |
4161 | RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true)); |
4162 | }, |
4163 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods |
4164 | }}; |
4165 | } |
4166 | |
4167 | InstructionSelector::ComplexRendererFns |
4168 | AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { |
4169 | Register Src; |
4170 | unsigned Mods; |
4171 | std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root, |
4172 | /*IsCanonicalizing=*/true, |
4173 | /*AllowAbs=*/false, |
4174 | /*OpSel=*/true); |
4175 | |
4176 | return {{ |
4177 | [=](MachineInstrBuilder &MIB) { |
4178 | MIB.addReg( |
4179 | RegNo: copyToVGPRIfSrcFolded(Src, Mods, Root, InsertPt: MIB, /* ForceVGPR */ true)); |
4180 | }, |
4181 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); }, // src0_mods |
4182 | }}; |
4183 | } |
4184 | |
4185 | bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, |
4186 | Register &Base, |
4187 | Register *SOffset, |
4188 | int64_t *Offset) const { |
4189 | MachineInstr *MI = Root.getParent(); |
4190 | MachineBasicBlock *MBB = MI->getParent(); |
4191 | |
  // FIXME: We should shrink the GEP if the offset is known to fit in 32 bits;
  // then we can select all ptr + 32-bit offsets.
4194 | SmallVector<GEPInfo, 4> AddrInfo; |
4195 | getAddrModeInfo(Load: *MI, MRI: *MRI, AddrInfo); |
4196 | |
4197 | if (AddrInfo.empty()) |
4198 | return false; |
4199 | |
4200 | const GEPInfo &GEPI = AddrInfo[0]; |
4201 | std::optional<int64_t> EncodedImm = |
4202 | AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false); |
4203 | |
4204 | if (SOffset && Offset) { |
4205 | if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm && |
4206 | AddrInfo.size() > 1) { |
4207 | const GEPInfo &GEPI2 = AddrInfo[1]; |
4208 | if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) { |
4209 | if (Register OffsetReg = |
4210 | matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) { |
4211 | Base = GEPI2.SgprParts[0]; |
4212 | *SOffset = OffsetReg; |
4213 | *Offset = *EncodedImm; |
4214 | return true; |
4215 | } |
4216 | } |
4217 | } |
4218 | return false; |
4219 | } |
4220 | |
4221 | if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) { |
4222 | Base = GEPI.SgprParts[0]; |
4223 | *Offset = *EncodedImm; |
4224 | return true; |
4225 | } |
4226 | |
4227 | // SGPR offset is unsigned. |
4228 | if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(x: GEPI.Imm) && |
4229 | GEPI.Imm != 0) { |
    // If we make it this far we have a load with a 32-bit immediate offset.
    // It is OK to select this using an sgpr offset, because we have already
    // failed trying to select this load into one of the _IMM variants since
    // the _IMM patterns are considered before the _SGPR patterns.
4234 | Base = GEPI.SgprParts[0]; |
4235 | *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
4236 | BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset) |
4237 | .addImm(GEPI.Imm); |
4238 | return true; |
4239 | } |
4240 | |
4241 | if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) { |
4242 | if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) { |
4243 | Base = GEPI.SgprParts[0]; |
4244 | *SOffset = OffsetReg; |
4245 | return true; |
4246 | } |
4247 | } |
4248 | |
4249 | return false; |
4250 | } |
4251 | |
4252 | InstructionSelector::ComplexRendererFns |
4253 | AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { |
4254 | Register Base; |
4255 | int64_t Offset; |
4256 | if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, Offset: &Offset)) |
4257 | return std::nullopt; |
4258 | |
4259 | return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); }, |
4260 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }}}; |
4261 | } |
4262 | |
4263 | InstructionSelector::ComplexRendererFns |
4264 | AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { |
4265 | SmallVector<GEPInfo, 4> AddrInfo; |
4266 | getAddrModeInfo(Load: *Root.getParent(), MRI: *MRI, AddrInfo); |
4267 | |
4268 | if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) |
4269 | return std::nullopt; |
4270 | |
4271 | const GEPInfo &GEPInfo = AddrInfo[0]; |
4272 | Register PtrReg = GEPInfo.SgprParts[0]; |
4273 | std::optional<int64_t> EncodedImm = |
4274 | AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); |
4275 | if (!EncodedImm) |
4276 | return std::nullopt; |
4277 | |
4278 | return {{ |
4279 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrReg); }, |
4280 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } |
4281 | }}; |
4282 | } |
4283 | |
4284 | InstructionSelector::ComplexRendererFns |
4285 | AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { |
4286 | Register Base, SOffset; |
4287 | if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, /* Offset= */ nullptr)) |
4288 | return std::nullopt; |
4289 | |
4290 | return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); }, |
4291 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); }}}; |
4292 | } |
4293 | |
4294 | InstructionSelector::ComplexRendererFns |
4295 | AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const { |
4296 | Register Base, SOffset; |
4297 | int64_t Offset; |
4298 | if (!selectSmrdOffset(Root, Base, SOffset: &SOffset, Offset: &Offset)) |
4299 | return std::nullopt; |
4300 | |
4301 | return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Base); }, |
4302 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); }, |
4303 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }}}; |
4304 | } |
4305 | |
4306 | std::pair<Register, int> |
4307 | AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, |
4308 | uint64_t FlatVariant) const { |
4309 | MachineInstr *MI = Root.getParent(); |
4310 | |
4311 | auto Default = std::pair(Root.getReg(), 0); |
4312 | |
4313 | if (!STI.hasFlatInstOffsets()) |
4314 | return Default; |
4315 | |
4316 | Register PtrBase; |
4317 | int64_t ConstOffset; |
4318 | std::tie(args&: PtrBase, args&: ConstOffset) = |
4319 | getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI); |
4320 | |
4321 | if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch && |
4322 | !isFlatScratchBaseLegal(Addr: Root.getReg()))) |
4323 | return Default; |
4324 | |
4325 | unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); |
4326 | if (!TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace, FlatVariant)) |
4327 | return Default; |
4328 | |
4329 | return std::pair(PtrBase, ConstOffset); |
4330 | } |
4331 | |
4332 | InstructionSelector::ComplexRendererFns |
4333 | AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { |
4334 | auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FLAT); |
4335 | |
4336 | return {{ |
4337 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); }, |
4338 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); }, |
4339 | }}; |
4340 | } |
4341 | |
4342 | InstructionSelector::ComplexRendererFns |
4343 | AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const { |
4344 | auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FlatGlobal); |
4345 | |
4346 | return {{ |
4347 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); }, |
4348 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); }, |
4349 | }}; |
4350 | } |
4351 | |
4352 | InstructionSelector::ComplexRendererFns |
4353 | AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { |
4354 | auto PtrWithOffset = selectFlatOffsetImpl(Root, FlatVariant: SIInstrFlags::FlatScratch); |
4355 | |
4356 | return {{ |
4357 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrWithOffset.first); }, |
4358 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: PtrWithOffset.second); }, |
4359 | }}; |
4360 | } |
4361 | |
4362 | // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) |
4363 | InstructionSelector::ComplexRendererFns |
4364 | AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { |
4365 | Register Addr = Root.getReg(); |
4366 | Register PtrBase; |
4367 | int64_t ConstOffset; |
4368 | int64_t ImmOffset = 0; |
4369 | |
4370 | // Match the immediate offset first, which canonically is moved as low as |
4371 | // possible. |
4372 | std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI); |
4373 | |
4374 | if (ConstOffset != 0) { |
4375 | if (TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS, |
4376 | FlatVariant: SIInstrFlags::FlatGlobal)) { |
4377 | Addr = PtrBase; |
4378 | ImmOffset = ConstOffset; |
4379 | } else { |
4380 | auto PtrBaseDef = getDefSrcRegIgnoringCopies(Reg: PtrBase, MRI: *MRI); |
4381 | if (isSGPR(Reg: PtrBaseDef->Reg)) { |
4382 | if (ConstOffset > 0) { |
4383 | // Offset is too large. |
4384 | // |
4385 | // saddr + large_offset -> saddr + |
4386 | // (voffset = large_offset & ~MaxOffset) + |
4387 | // (large_offset & MaxOffset); |
4388 | int64_t SplitImmOffset, RemainderOffset; |
4389 | std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII.splitFlatOffset( |
4390 | COffsetVal: ConstOffset, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS, FlatVariant: SIInstrFlags::FlatGlobal); |
4391 | |
4392 | if (isUInt<32>(x: RemainderOffset)) { |
4393 | MachineInstr *MI = Root.getParent(); |
4394 | MachineBasicBlock *MBB = MI->getParent(); |
4395 | Register HighBits = |
4396 | MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4397 | |
4398 | BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), |
4399 | HighBits) |
4400 | .addImm(RemainderOffset); |
4401 | |
4402 | return {{ |
4403 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: PtrBase); }, // saddr |
4404 | [=](MachineInstrBuilder &MIB) { |
4405 | MIB.addReg(RegNo: HighBits); |
4406 | }, // voffset |
4407 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: SplitImmOffset); }, |
4408 | }}; |
4409 | } |
4410 | } |
4411 | |
        // We are adding a 64-bit SGPR and a constant. If the constant bus
        // limit is 1, we would need 1 or 2 extra moves for each half of the
        // constant, so it is better to do a scalar add and then issue a
        // single VALU instruction to materialize zero. Otherwise it takes
        // fewer instructions to perform VALU adds with immediates or inline
        // literals.
4417 | unsigned NumLiterals = |
4418 | !TII.isInlineConstant(Imm: APInt(32, ConstOffset & 0xffffffff)) + |
4419 | !TII.isInlineConstant(Imm: APInt(32, ConstOffset >> 32)); |
4420 | if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals) |
4421 | return std::nullopt; |
4422 | } |
4423 | } |
4424 | } |
4425 | |
4426 | // Match the variable offset. |
4427 | auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI); |
4428 | if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { |
4429 | // Look through the SGPR->VGPR copy. |
4430 | Register SAddr = |
4431 | getSrcRegIgnoringCopies(Reg: AddrDef->MI->getOperand(i: 1).getReg(), MRI: *MRI); |
4432 | |
4433 | if (isSGPR(Reg: SAddr)) { |
4434 | Register PtrBaseOffset = AddrDef->MI->getOperand(i: 2).getReg(); |
4435 | |
4436 | // It's possible voffset is an SGPR here, but the copy to VGPR will be |
4437 | // inserted later. |
4438 | if (Register VOffset = matchZeroExtendFromS32(MRI&: *MRI, Reg: PtrBaseOffset)) { |
4439 | return {{[=](MachineInstrBuilder &MIB) { // saddr |
4440 | MIB.addReg(RegNo: SAddr); |
4441 | }, |
4442 | [=](MachineInstrBuilder &MIB) { // voffset |
4443 | MIB.addReg(RegNo: VOffset); |
4444 | }, |
4445 | [=](MachineInstrBuilder &MIB) { // offset |
4446 | MIB.addImm(Val: ImmOffset); |
4447 | }}}; |
4448 | } |
4449 | } |
4450 | } |
4451 | |
4452 | // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and |
4453 | // drop this. |
4454 | if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF || |
4455 | AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg)) |
4456 | return std::nullopt; |
4457 | |
4458 | // It's cheaper to materialize a single 32-bit zero for vaddr than the two |
4459 | // moves required to copy a 64-bit SGPR to VGPR. |
4460 | MachineInstr *MI = Root.getParent(); |
4461 | MachineBasicBlock *MBB = MI->getParent(); |
4462 | Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4463 | |
4464 | BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset) |
4465 | .addImm(0); |
4466 | |
4467 | return {{ |
4468 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: AddrDef->Reg); }, // saddr |
4469 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: VOffset); }, // voffset |
4470 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset |
4471 | }}; |
4472 | } |
4473 | |
4474 | InstructionSelector::ComplexRendererFns |
4475 | AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { |
4476 | Register Addr = Root.getReg(); |
4477 | Register PtrBase; |
4478 | int64_t ConstOffset; |
4479 | int64_t ImmOffset = 0; |
4480 | |
4481 | // Match the immediate offset first, which canonically is moved as low as |
4482 | // possible. |
4483 | std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI); |
4484 | |
4485 | if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) && |
4486 | TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, |
4487 | FlatVariant: SIInstrFlags::FlatScratch)) { |
4488 | Addr = PtrBase; |
4489 | ImmOffset = ConstOffset; |
4490 | } |
4491 | |
4492 | auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI); |
4493 | if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { |
4494 | int FI = AddrDef->MI->getOperand(i: 1).getIndex(); |
4495 | return {{ |
4496 | [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr |
4497 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset |
4498 | }}; |
4499 | } |
4500 | |
4501 | Register SAddr = AddrDef->Reg; |
4502 | |
4503 | if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { |
4504 | Register LHS = AddrDef->MI->getOperand(i: 1).getReg(); |
4505 | Register RHS = AddrDef->MI->getOperand(i: 2).getReg(); |
4506 | auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI); |
4507 | auto RHSDef = getDefSrcRegIgnoringCopies(Reg: RHS, MRI: *MRI); |
4508 | |
4509 | if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX && |
4510 | isSGPR(RHSDef->Reg)) { |
4511 | int FI = LHSDef->MI->getOperand(i: 1).getIndex(); |
4512 | MachineInstr &I = *Root.getParent(); |
4513 | MachineBasicBlock *BB = I.getParent(); |
4514 | const DebugLoc &DL = I.getDebugLoc(); |
4515 | SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
4516 | |
4517 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr) |
4518 | .addFrameIndex(FI) |
4519 | .addReg(RHSDef->Reg) |
4520 | .setOperandDead(3); // Dead scc |
4521 | } |
4522 | } |
4523 | |
4524 | if (!isSGPR(Reg: SAddr)) |
4525 | return std::nullopt; |
4526 | |
4527 | return {{ |
4528 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SAddr); }, // saddr |
4529 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset |
4530 | }}; |
4531 | } |
4532 | |
4533 | // Check whether the flat scratch SVS swizzle bug affects this access. |
4534 | bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug( |
4535 | Register VAddr, Register SAddr, uint64_t ImmOffset) const { |
4536 | if (!Subtarget->hasFlatScratchSVSSwizzleBug()) |
4537 | return false; |
4538 | |
4539 | // The bug affects the swizzling of SVS accesses if there is any carry out |
4540 | // from the two low order bits (i.e. from bit 1 into bit 2) when adding |
4541 | // voffset to (soffset + inst_offset). |
4542 | auto VKnown = KB->getKnownBits(R: VAddr); |
4543 | auto SKnown = KnownBits::computeForAddSub( |
4544 | /*Add=*/true, /*NSW=*/false, /*NUW=*/false, LHS: KB->getKnownBits(R: SAddr), |
4545 | RHS: KnownBits::makeConstant(C: APInt(32, ImmOffset))); |
4546 | uint64_t VMax = VKnown.getMaxValue().getZExtValue(); |
4547 | uint64_t SMax = SKnown.getMaxValue().getZExtValue(); |
4548 | return (VMax & 3) + (SMax & 3) >= 4; |
4549 | } |
4550 | |
4551 | InstructionSelector::ComplexRendererFns |
4552 | AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { |
4553 | Register Addr = Root.getReg(); |
4554 | Register PtrBase; |
4555 | int64_t ConstOffset; |
4556 | int64_t ImmOffset = 0; |
4557 | |
4558 | // Match the immediate offset first, which canonically is moved as low as |
4559 | // possible. |
4560 | std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: Addr, MRI: *MRI); |
4561 | |
4562 | Register OrigAddr = Addr; |
4563 | if (ConstOffset != 0 && |
4564 | TII.isLegalFLATOffset(Offset: ConstOffset, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: true)) { |
4565 | Addr = PtrBase; |
4566 | ImmOffset = ConstOffset; |
4567 | } |
4568 | |
4569 | auto AddrDef = getDefSrcRegIgnoringCopies(Reg: Addr, MRI: *MRI); |
4570 | if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) |
4571 | return std::nullopt; |
4572 | |
4573 | Register RHS = AddrDef->MI->getOperand(i: 2).getReg(); |
4574 | if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) |
4575 | return std::nullopt; |
4576 | |
4577 | Register LHS = AddrDef->MI->getOperand(i: 1).getReg(); |
4578 | auto LHSDef = getDefSrcRegIgnoringCopies(Reg: LHS, MRI: *MRI); |
4579 | |
4580 | if (OrigAddr != Addr) { |
4581 | if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr)) |
4582 | return std::nullopt; |
4583 | } else { |
4584 | if (!isFlatScratchBaseLegalSV(Addr: OrigAddr)) |
4585 | return std::nullopt; |
4586 | } |
4587 | |
4588 | if (checkFlatScratchSVSSwizzleBug(VAddr: RHS, SAddr: LHS, ImmOffset)) |
4589 | return std::nullopt; |
4590 | |
4591 | if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { |
4592 | int FI = LHSDef->MI->getOperand(i: 1).getIndex(); |
4593 | return {{ |
4594 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr |
4595 | [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(Idx: FI); }, // saddr |
4596 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset |
4597 | }}; |
4598 | } |
4599 | |
4600 | if (!isSGPR(Reg: LHS)) |
4601 | return std::nullopt; |
4602 | |
4603 | return {{ |
4604 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: RHS); }, // vaddr |
4605 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: LHS); }, // saddr |
4606 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ImmOffset); } // offset |
4607 | }}; |
4608 | } |
4609 | |
4610 | InstructionSelector::ComplexRendererFns |
4611 | AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { |
4612 | MachineInstr *MI = Root.getParent(); |
4613 | MachineBasicBlock *MBB = MI->getParent(); |
4614 | MachineFunction *MF = MBB->getParent(); |
4615 | const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); |
4616 | |
4617 | int64_t Offset = 0; |
4618 | if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) && |
4619 | Offset != TM.getNullPointerValue(AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)) { |
4620 | Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4621 | |
4622 | // TODO: Should this be inside the render function? The iterator seems to |
4623 | // move. |
4624 | const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget); |
4625 | BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), |
4626 | HighBits) |
4627 | .addImm(Offset & ~MaxOffset); |
4628 | |
4629 | return {{[=](MachineInstrBuilder &MIB) { // rsrc |
4630 | MIB.addReg(RegNo: Info->getScratchRSrcReg()); |
4631 | }, |
4632 | [=](MachineInstrBuilder &MIB) { // vaddr |
4633 | MIB.addReg(RegNo: HighBits); |
4634 | }, |
4635 | [=](MachineInstrBuilder &MIB) { // soffset |
4636 | // Use constant zero for soffset and rely on eliminateFrameIndex |
4637 | // to choose the appropriate frame register if need be. |
4638 | MIB.addImm(Val: 0); |
4639 | }, |
4640 | [=](MachineInstrBuilder &MIB) { // offset |
4641 | MIB.addImm(Val: Offset & MaxOffset); |
4642 | }}}; |
4643 | } |
4644 | |
4645 | assert(Offset == 0 || Offset == -1); |
4646 | |
4647 | // Try to fold a frame index directly into the MUBUF vaddr field, and any |
4648 | // offsets. |
4649 | std::optional<int> FI; |
4650 | Register VAddr = Root.getReg(); |
4651 | if (const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg())) { |
4652 | Register PtrBase; |
4653 | int64_t ConstOffset; |
4654 | std::tie(args&: PtrBase, args&: ConstOffset) = getPtrBaseWithConstantOffset(Root: VAddr, MRI: *MRI); |
4655 | if (ConstOffset != 0) { |
4656 | if (TII.isLegalMUBUFImmOffset(Imm: ConstOffset) && |
4657 | (!STI.privateMemoryResourceIsRangeChecked() || |
4658 | KB->signBitIsZero(Op: PtrBase))) { |
4659 | const MachineInstr *PtrBaseDef = MRI->getVRegDef(Reg: PtrBase); |
4660 | if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX) |
4661 | FI = PtrBaseDef->getOperand(i: 1).getIndex(); |
4662 | else |
4663 | VAddr = PtrBase; |
4664 | Offset = ConstOffset; |
4665 | } |
4666 | } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { |
4667 | FI = RootDef->getOperand(i: 1).getIndex(); |
4668 | } |
4669 | } |
4670 | |
4671 | return {{[=](MachineInstrBuilder &MIB) { // rsrc |
4672 | MIB.addReg(RegNo: Info->getScratchRSrcReg()); |
4673 | }, |
4674 | [=](MachineInstrBuilder &MIB) { // vaddr |
4675 | if (FI) |
4676 | MIB.addFrameIndex(Idx: *FI); |
4677 | else |
4678 | MIB.addReg(RegNo: VAddr); |
4679 | }, |
4680 | [=](MachineInstrBuilder &MIB) { // soffset |
4681 | // Use constant zero for soffset and rely on eliminateFrameIndex |
4682 | // to choose the appropriate frame register if need be. |
4683 | MIB.addImm(Val: 0); |
4684 | }, |
4685 | [=](MachineInstrBuilder &MIB) { // offset |
4686 | MIB.addImm(Val: Offset); |
4687 | }}}; |
4688 | } |
4689 | |
4690 | bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, |
4691 | int64_t Offset) const { |
4692 | if (!isUInt<16>(x: Offset)) |
4693 | return false; |
4694 | |
4695 | if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) |
4696 | return true; |
4697 | |
  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
4700 | return KB->signBitIsZero(Op: Base); |
4701 | } |
4702 | |
4703 | bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0, |
4704 | int64_t Offset1, |
4705 | unsigned Size) const { |
4706 | if (Offset0 % Size != 0 || Offset1 % Size != 0) |
4707 | return false; |
4708 | if (!isUInt<8>(x: Offset0 / Size) || !isUInt<8>(x: Offset1 / Size)) |
4709 | return false; |
4710 | |
4711 | if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) |
4712 | return true; |
4713 | |
  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
4716 | return KB->signBitIsZero(Op: Base); |
4717 | } |
4718 | |
4719 | // Return whether the operation has NoUnsignedWrap property. |
4720 | static bool isNoUnsignedWrap(MachineInstr *Addr) { |
4721 | return Addr->getOpcode() == TargetOpcode::G_OR || |
4722 | (Addr->getOpcode() == TargetOpcode::G_PTR_ADD && |
4723 | Addr->getFlag(Flag: MachineInstr::NoUWrap)); |
4724 | } |
4725 | |
// Check that the base address of a flat scratch load/store in the form
// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the
// hardware requirement). We always treat the first operand as the base
// address here.
4729 | bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const { |
4730 | MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI); |
4731 | |
4732 | if (isNoUnsignedWrap(Addr: AddrMI)) |
4733 | return true; |
4734 | |
4735 | // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative |
4736 | // values. |
4737 | if (STI.hasSignedScratchOffsets()) |
4738 | return true; |
4739 | |
4740 | Register LHS = AddrMI->getOperand(i: 1).getReg(); |
4741 | Register RHS = AddrMI->getOperand(i: 2).getReg(); |
4742 | |
4743 | if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { |
4744 | std::optional<ValueAndVReg> RhsValReg = |
4745 | getIConstantVRegValWithLookThrough(VReg: RHS, MRI: *MRI); |
4746 | // If the immediate offset is negative and within certain range, the base |
4747 | // address cannot also be negative. If the base is also negative, the sum |
4748 | // would be either negative or much larger than the valid range of scratch |
4749 | // memory a thread can access. |
4750 | if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 && |
4751 | RhsValReg->Value.getSExtValue() > -0x40000000) |
4752 | return true; |
4753 | } |
4754 | |
4755 | return KB->signBitIsZero(Op: LHS); |
4756 | } |
4757 | |
// Check that the address values in the SGPR/VGPR are legal for a flat
// scratch access in the form: SGPR + VGPR.
4760 | bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const { |
4761 | MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI); |
4762 | |
4763 | if (isNoUnsignedWrap(Addr: AddrMI)) |
4764 | return true; |
4765 | |
4766 | // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative |
4767 | // values. |
4768 | if (STI.hasSignedScratchOffsets()) |
4769 | return true; |
4770 | |
4771 | Register LHS = AddrMI->getOperand(i: 1).getReg(); |
4772 | Register RHS = AddrMI->getOperand(i: 2).getReg(); |
4773 | return KB->signBitIsZero(Op: RHS) && KB->signBitIsZero(Op: LHS); |
4774 | } |
4775 | |
// Check that the address values in the SGPR/VGPR are legal for a flat
// scratch access in the form: SGPR + VGPR + Imm.
4778 | bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm( |
4779 | Register Addr) const { |
4780 | // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative |
4781 | // values. |
4782 | if (STI.hasSignedScratchOffsets()) |
4783 | return true; |
4784 | |
4785 | MachineInstr *AddrMI = getDefIgnoringCopies(Reg: Addr, MRI: *MRI); |
4786 | Register Base = AddrMI->getOperand(i: 1).getReg(); |
4787 | std::optional<DefinitionAndSourceRegister> BaseDef = |
4788 | getDefSrcRegIgnoringCopies(Reg: Base, MRI: *MRI); |
4789 | std::optional<ValueAndVReg> RHSOffset = |
4790 | getIConstantVRegValWithLookThrough(VReg: AddrMI->getOperand(i: 2).getReg(), MRI: *MRI); |
4791 | assert(RHSOffset); |
4792 | |
4793 | // If the immediate offset is negative and within certain range, the base |
4794 | // address cannot also be negative. If the base is also negative, the sum |
4795 | // would be either negative or much larger than the valid range of scratch |
4796 | // memory a thread can access. |
4797 | if (isNoUnsignedWrap(Addr: BaseDef->MI) && |
4798 | (isNoUnsignedWrap(Addr: AddrMI) || |
4799 | (RHSOffset->Value.getSExtValue() < 0 && |
4800 | RHSOffset->Value.getSExtValue() > -0x40000000))) |
4801 | return true; |
4802 | |
4803 | Register LHS = BaseDef->MI->getOperand(i: 1).getReg(); |
4804 | Register RHS = BaseDef->MI->getOperand(i: 2).getReg(); |
4805 | return KB->signBitIsZero(Op: RHS) && KB->signBitIsZero(Op: LHS); |
4806 | } |
4807 | |
4808 | bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, |
4809 | unsigned ShAmtBits) const { |
4810 | assert(MI.getOpcode() == TargetOpcode::G_AND); |
4811 | |
4812 | std::optional<APInt> RHS = |
4813 | getIConstantVRegVal(VReg: MI.getOperand(i: 2).getReg(), MRI: *MRI); |
4814 | if (!RHS) |
4815 | return false; |
4816 | |
4817 | if (RHS->countr_one() >= ShAmtBits) |
4818 | return true; |
4819 | |
4820 | const APInt &LHSKnownZeros = KB->getKnownZeroes(R: MI.getOperand(i: 1).getReg()); |
4821 | return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits; |
4822 | } |
4823 | |
4824 | InstructionSelector::ComplexRendererFns |
4825 | AMDGPUInstructionSelector::selectMUBUFScratchOffset( |
4826 | MachineOperand &Root) const { |
4827 | Register Reg = Root.getReg(); |
4828 | const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); |
4829 | |
4830 | std::optional<DefinitionAndSourceRegister> Def = |
4831 | getDefSrcRegIgnoringCopies(Reg, MRI: *MRI); |
  assert(Def && "this shouldn't be an optional result");
4833 | Reg = Def->Reg; |
4834 | |
4835 | if (Register WaveBase = getWaveAddress(Def: Def->MI)) { |
4836 | return {{ |
4837 | [=](MachineInstrBuilder &MIB) { // rsrc |
4838 | MIB.addReg(RegNo: Info->getScratchRSrcReg()); |
4839 | }, |
4840 | [=](MachineInstrBuilder &MIB) { // soffset |
4841 | MIB.addReg(RegNo: WaveBase); |
4842 | }, |
4843 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); } // offset |
4844 | }}; |
4845 | } |
4846 | |
4847 | int64_t Offset = 0; |
4848 | |
4849 | // FIXME: Copy check is a hack |
4850 | Register BasePtr; |
4851 | if (mi_match(R: Reg, MRI: *MRI, |
4852 | P: m_GPtrAdd(L: m_Reg(R&: BasePtr), |
4853 | R: m_any_of(preds: m_ICst(Cst&: Offset), preds: m_Copy(Src: m_ICst(Cst&: Offset)))))) { |
4854 | if (!TII.isLegalMUBUFImmOffset(Imm: Offset)) |
4855 | return {}; |
4856 | MachineInstr *BasePtrDef = getDefIgnoringCopies(Reg: BasePtr, MRI: *MRI); |
4857 | Register WaveBase = getWaveAddress(Def: BasePtrDef); |
4858 | if (!WaveBase) |
4859 | return {}; |
4860 | |
4861 | return {{ |
4862 | [=](MachineInstrBuilder &MIB) { // rsrc |
4863 | MIB.addReg(RegNo: Info->getScratchRSrcReg()); |
4864 | }, |
4865 | [=](MachineInstrBuilder &MIB) { // soffset |
4866 | MIB.addReg(RegNo: WaveBase); |
4867 | }, |
4868 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset |
4869 | }}; |
4870 | } |
4871 | |
4872 | if (!mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: Offset)) || |
4873 | !TII.isLegalMUBUFImmOffset(Imm: Offset)) |
4874 | return {}; |
4875 | |
4876 | return {{ |
4877 | [=](MachineInstrBuilder &MIB) { // rsrc |
4878 | MIB.addReg(RegNo: Info->getScratchRSrcReg()); |
4879 | }, |
4880 | [=](MachineInstrBuilder &MIB) { // soffset |
4881 | MIB.addImm(Val: 0); |
4882 | }, |
4883 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } // offset |
4884 | }}; |
4885 | } |
4886 | |
4887 | std::pair<Register, unsigned> |
4888 | AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { |
4889 | const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg()); |
4890 | if (!RootDef) |
4891 | return std::pair(Root.getReg(), 0); |
4892 | |
4893 | int64_t ConstAddr = 0; |
4894 | |
4895 | Register PtrBase; |
4896 | int64_t Offset; |
4897 | std::tie(args&: PtrBase, args&: Offset) = |
4898 | getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI); |
4899 | |
4900 | if (Offset) { |
4901 | if (isDSOffsetLegal(Base: PtrBase, Offset)) { |
4902 | // (add n0, c0) |
4903 | return std::pair(PtrBase, Offset); |
4904 | } |
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO
  } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
    // TODO
  }
4913 | |
4914 | return std::pair(Root.getReg(), 0); |
4915 | } |
4916 | |
4917 | InstructionSelector::ComplexRendererFns |
4918 | AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { |
4919 | Register Reg; |
4920 | unsigned Offset; |
4921 | std::tie(args&: Reg, args&: Offset) = selectDS1Addr1OffsetImpl(Root); |
4922 | return {{ |
4923 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); }, |
4924 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); } |
4925 | }}; |
4926 | } |
4927 | |
4928 | InstructionSelector::ComplexRendererFns |
4929 | AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { |
4930 | return selectDSReadWrite2(Root, size: 4); |
4931 | } |
4932 | |
4933 | InstructionSelector::ComplexRendererFns |
4934 | AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const { |
4935 | return selectDSReadWrite2(Root, size: 8); |
4936 | } |
4937 | |
4938 | InstructionSelector::ComplexRendererFns |
4939 | AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root, |
4940 | unsigned Size) const { |
4941 | Register Reg; |
4942 | unsigned Offset; |
4943 | std::tie(args&: Reg, args&: Offset) = selectDSReadWrite2Impl(Root, size: Size); |
4944 | return {{ |
4945 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Reg); }, |
4946 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }, |
4947 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset+1); } |
4948 | }}; |
4949 | } |
4950 | |
4951 | std::pair<Register, unsigned> |
4952 | AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, |
4953 | unsigned Size) const { |
4954 | const MachineInstr *RootDef = MRI->getVRegDef(Reg: Root.getReg()); |
4955 | if (!RootDef) |
4956 | return std::pair(Root.getReg(), 0); |
4957 | |
4958 | int64_t ConstAddr = 0; |
4959 | |
4960 | Register PtrBase; |
4961 | int64_t Offset; |
4962 | std::tie(args&: PtrBase, args&: Offset) = |
4963 | getPtrBaseWithConstantOffset(Root: Root.getReg(), MRI: *MRI); |
4964 | |
4965 | if (Offset) { |
4966 | int64_t OffsetValue0 = Offset; |
4967 | int64_t OffsetValue1 = Offset + Size; |
4968 | if (isDSOffset2Legal(Base: PtrBase, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) { |
4969 | // (add n0, c0) |
4970 | return std::pair(PtrBase, OffsetValue0 / Size); |
4971 | } |
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO
  } else if (mi_match(R: Root.getReg(), MRI: *MRI, P: m_ICst(Cst&: ConstAddr))) {
    // TODO
  }
4979 | |
4980 | return std::pair(Root.getReg(), 0); |
4981 | } |
4982 | |
4983 | /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return |
4984 | /// the base value with the constant offset. There may be intervening copies |
4985 | /// between \p Root and the identified constant. Returns \p Root, 0 if this does |
4986 | /// not match the pattern. |
4987 | std::pair<Register, int64_t> |
4988 | AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( |
4989 | Register Root, const MachineRegisterInfo &MRI) const { |
4990 | MachineInstr *RootI = getDefIgnoringCopies(Reg: Root, MRI); |
4991 | if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) |
4992 | return {Root, 0}; |
4993 | |
4994 | MachineOperand &RHS = RootI->getOperand(i: 2); |
4995 | std::optional<ValueAndVReg> MaybeOffset = |
4996 | getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI); |
4997 | if (!MaybeOffset) |
4998 | return {Root, 0}; |
4999 | return {RootI->getOperand(i: 1).getReg(), MaybeOffset->Value.getSExtValue()}; |
5000 | } |
5001 | |
5002 | static void addZeroImm(MachineInstrBuilder &MIB) { |
5003 | MIB.addImm(Val: 0); |
5004 | } |
5005 | |
5006 | /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p |
5007 | /// BasePtr is not valid, a null base pointer will be used. |
5008 | static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, |
5009 | uint32_t FormatLo, uint32_t FormatHi, |
5010 | Register BasePtr) { |
5011 | Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5012 | Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5013 | Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
5014 | Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); |
5015 | |
5016 | B.buildInstr(AMDGPU::S_MOV_B32) |
5017 | .addDef(RSrc2) |
5018 | .addImm(FormatLo); |
5019 | B.buildInstr(AMDGPU::S_MOV_B32) |
5020 | .addDef(RSrc3) |
5021 | .addImm(FormatHi); |
5022 | |
5023 | // Build the half of the subregister with the constants before building the |
5024 | // full 128-bit register. If we are building multiple resource descriptors, |
5025 | // this will allow CSEing of the 2-component register. |
5026 | B.buildInstr(AMDGPU::REG_SEQUENCE) |
5027 | .addDef(RSrcHi) |
5028 | .addReg(RSrc2) |
5029 | .addImm(AMDGPU::sub0) |
5030 | .addReg(RSrc3) |
5031 | .addImm(AMDGPU::sub1); |
5032 | |
5033 | Register RSrcLo = BasePtr; |
5034 | if (!BasePtr) { |
5035 | RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
5036 | B.buildInstr(AMDGPU::S_MOV_B64) |
5037 | .addDef(RSrcLo) |
5038 | .addImm(0); |
5039 | } |
5040 | |
5041 | B.buildInstr(AMDGPU::REG_SEQUENCE) |
5042 | .addDef(RSrc) |
5043 | .addReg(RSrcLo) |
5044 | .addImm(AMDGPU::sub0_sub1) |
5045 | .addReg(RSrcHi) |
5046 | .addImm(AMDGPU::sub2_sub3); |
5047 | |
5048 | return RSrc; |
5049 | } |
5050 | |
5051 | static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, |
5052 | const SIInstrInfo &TII, Register BasePtr) { |
5053 | uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); |
5054 | |
5055 | // FIXME: Why are half the "default" bits ignored based on the addressing |
5056 | // mode? |
5057 | return buildRSRC(B, MRI, FormatLo: 0, FormatHi: Hi_32(Value: DefaultFormat), BasePtr); |
5058 | } |
5059 | |
5060 | static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, |
5061 | const SIInstrInfo &TII, Register BasePtr) { |
5062 | uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); |
5063 | |
5064 | // FIXME: Why are half the "default" bits ignored based on the addressing |
5065 | // mode? |
5066 | return buildRSRC(B, MRI, FormatLo: -1, FormatHi: Hi_32(Value: DefaultFormat), BasePtr); |
5067 | } |
5068 | |
5069 | AMDGPUInstructionSelector::MUBUFAddressData |
5070 | AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { |
5071 | MUBUFAddressData Data; |
5072 | Data.N0 = Src; |
5073 | |
5074 | Register PtrBase; |
5075 | int64_t Offset; |
5076 | |
5077 | std::tie(args&: PtrBase, args&: Offset) = getPtrBaseWithConstantOffset(Root: Src, MRI: *MRI); |
5078 | if (isUInt<32>(x: Offset)) { |
5079 | Data.N0 = PtrBase; |
5080 | Data.Offset = Offset; |
5081 | } |
5082 | |
5083 | if (MachineInstr *InputAdd |
5084 | = getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Data.N0, MRI: *MRI)) { |
5085 | Data.N2 = InputAdd->getOperand(i: 1).getReg(); |
5086 | Data.N3 = InputAdd->getOperand(i: 2).getReg(); |
5087 | |
    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: We don't know that this was defined by operand 0
5090 | // |
5091 | // TODO: Remove this when we have copy folding optimizations after |
5092 | // RegBankSelect. |
5093 | Data.N2 = getDefIgnoringCopies(Reg: Data.N2, MRI: *MRI)->getOperand(i: 0).getReg(); |
5094 | Data.N3 = getDefIgnoringCopies(Reg: Data.N3, MRI: *MRI)->getOperand(i: 0).getReg(); |
5095 | } |
5096 | |
5097 | return Data; |
5098 | } |
5099 | |
/// Return whether the addr64 MUBUF mode should be used for the given address.
5101 | bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const { |
5102 | // (ptr_add N2, N3) -> addr64, or |
5103 | // (ptr_add (ptr_add N2, N3), C1) -> addr64 |
5104 | if (Addr.N2) |
5105 | return true; |
5106 | |
5107 | const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI); |
5108 | return N0Bank->getID() == AMDGPU::VGPRRegBankID; |
5109 | } |
5110 | |
5111 | /// Split an immediate offset \p ImmOffset depending on whether it fits in the |
5112 | /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable |
5113 | /// component. |
5114 | void AMDGPUInstructionSelector::splitIllegalMUBUFOffset( |
5115 | MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const { |
5116 | if (TII.isLegalMUBUFImmOffset(Imm: ImmOffset)) |
5117 | return; |
5118 | |
5119 | // Illegal offset, store it in soffset. |
5120 | SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5121 | B.buildInstr(AMDGPU::S_MOV_B32) |
5122 | .addDef(SOffset) |
5123 | .addImm(ImmOffset); |
5124 | ImmOffset = 0; |
5125 | } |
5126 | |
5127 | bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl( |
5128 | MachineOperand &Root, Register &VAddr, Register &RSrcReg, |
5129 | Register &SOffset, int64_t &Offset) const { |
5130 | // FIXME: Predicates should stop this from reaching here. |
  // The addr64 bit was removed for Volcanic Islands.
5132 | if (!STI.hasAddr64() || STI.useFlatForGlobal()) |
5133 | return false; |
5134 | |
5135 | MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg()); |
5136 | if (!shouldUseAddr64(Addr: AddrData)) |
5137 | return false; |
5138 | |
5139 | Register N0 = AddrData.N0; |
5140 | Register N2 = AddrData.N2; |
5141 | Register N3 = AddrData.N3; |
5142 | Offset = AddrData.Offset; |
5143 | |
5144 | // Base pointer for the SRD. |
5145 | Register SRDPtr; |
5146 | |
5147 | if (N2) { |
5148 | if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { |
5149 | assert(N3); |
5150 | if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { |
5151 | // Both N2 and N3 are divergent. Use N0 (the result of the add) as the |
5152 | // addr64, and construct the default resource from a 0 address. |
5153 | VAddr = N0; |
5154 | } else { |
5155 | SRDPtr = N3; |
5156 | VAddr = N2; |
5157 | } |
5158 | } else { |
5159 | // N2 is not divergent. |
5160 | SRDPtr = N2; |
5161 | VAddr = N3; |
5162 | } |
5163 | } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { |
5164 | // Use the default null pointer in the resource |
5165 | VAddr = N0; |
5166 | } else { |
5167 | // N0 -> offset, or |
5168 | // (N0 + C1) -> offset |
5169 | SRDPtr = N0; |
5170 | } |
5171 | |
5172 | MachineIRBuilder B(*Root.getParent()); |
5173 | RSrcReg = buildAddr64RSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr); |
5174 | splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset); |
5175 | return true; |
5176 | } |
5177 | |
5178 | bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( |
5179 | MachineOperand &Root, Register &RSrcReg, Register &SOffset, |
5180 | int64_t &Offset) const { |
5181 | |
5182 | // FIXME: Pattern should not reach here. |
5183 | if (STI.useFlatForGlobal()) |
5184 | return false; |
5185 | |
5186 | MUBUFAddressData AddrData = parseMUBUFAddress(Src: Root.getReg()); |
5187 | if (shouldUseAddr64(Addr: AddrData)) |
5188 | return false; |
5189 | |
5190 | // N0 -> offset, or |
5191 | // (N0 + C1) -> offset |
5192 | Register SRDPtr = AddrData.N0; |
5193 | Offset = AddrData.Offset; |
5194 | |
5195 | // TODO: Look through extensions for 32-bit soffset. |
5196 | MachineIRBuilder B(*Root.getParent()); |
5197 | |
5198 | RSrcReg = buildOffsetSrc(B, MRI&: *MRI, TII, BasePtr: SRDPtr); |
5199 | splitIllegalMUBUFOffset(B, SOffset, ImmOffset&: Offset); |
5200 | return true; |
5201 | } |
5202 | |
5203 | InstructionSelector::ComplexRendererFns |
5204 | AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { |
5205 | Register VAddr; |
5206 | Register RSrcReg; |
5207 | Register SOffset; |
5208 | int64_t Offset = 0; |
5209 | |
5210 | if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) |
5211 | return {}; |
5212 | |
5213 | // FIXME: Use defaulted operands for trailing 0s and remove from the complex |
5214 | // pattern. |
5215 | return {{ |
5216 | [=](MachineInstrBuilder &MIB) { // rsrc |
5217 | MIB.addReg(RegNo: RSrcReg); |
5218 | }, |
5219 | [=](MachineInstrBuilder &MIB) { // vaddr |
5220 | MIB.addReg(RegNo: VAddr); |
5221 | }, |
5222 | [=](MachineInstrBuilder &MIB) { // soffset |
5223 | if (SOffset) |
5224 | MIB.addReg(RegNo: SOffset); |
5225 | else if (STI.hasRestrictedSOffset()) |
5226 | MIB.addReg(AMDGPU::SGPR_NULL); |
5227 | else |
5228 | MIB.addImm(Val: 0); |
5229 | }, |
5230 | [=](MachineInstrBuilder &MIB) { // offset |
5231 | MIB.addImm(Val: Offset); |
5232 | }, |
5233 | addZeroImm, // cpol |
5234 | addZeroImm, // tfe |
5235 | addZeroImm // swz |
5236 | }}; |
5237 | } |
5238 | |
5239 | InstructionSelector::ComplexRendererFns |
5240 | AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { |
5241 | Register RSrcReg; |
5242 | Register SOffset; |
5243 | int64_t Offset = 0; |
5244 | |
5245 | if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) |
5246 | return {}; |
5247 | |
5248 | return {{ |
5249 | [=](MachineInstrBuilder &MIB) { // rsrc |
5250 | MIB.addReg(RegNo: RSrcReg); |
5251 | }, |
5252 | [=](MachineInstrBuilder &MIB) { // soffset |
5253 | if (SOffset) |
5254 | MIB.addReg(RegNo: SOffset); |
5255 | else if (STI.hasRestrictedSOffset()) |
5256 | MIB.addReg(AMDGPU::SGPR_NULL); |
5257 | else |
5258 | MIB.addImm(Val: 0); |
5259 | }, |
5260 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Offset); }, // offset |
5261 | addZeroImm, // cpol |
5262 | addZeroImm, // tfe |
5263 | addZeroImm, // swz |
5264 | }}; |
5265 | } |
5266 | |
5267 | InstructionSelector::ComplexRendererFns |
5268 | AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const { |
5269 | |
5270 | Register SOffset = Root.getReg(); |
5271 | |
5272 | if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt())) |
5273 | SOffset = AMDGPU::SGPR_NULL; |
5274 | |
5275 | return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); }}}; |
5276 | } |
5277 | |
/// Get an immediate that must fit in 32 bits and is treated as zero extended.
5279 | static std::optional<uint64_t> |
5280 | getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) { |
5281 | // getIConstantVRegVal sexts any values, so see if that matters. |
5282 | std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(VReg: Reg, MRI); |
5283 | if (!OffsetVal || !isInt<32>(x: *OffsetVal)) |
5284 | return std::nullopt; |
5285 | return Lo_32(Value: *OffsetVal); |
5286 | } |
5287 | |
5288 | InstructionSelector::ComplexRendererFns |
5289 | AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const { |
5290 | std::optional<uint64_t> OffsetVal = getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI); |
5291 | if (!OffsetVal) |
5292 | return {}; |
5293 | |
5294 | std::optional<int64_t> EncodedImm = |
5295 | AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true); |
5296 | if (!EncodedImm) |
5297 | return {}; |
5298 | |
5299 | return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }}; |
5300 | } |
5301 | |
5302 | InstructionSelector::ComplexRendererFns |
5303 | AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const { |
5304 | assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); |
5305 | |
5306 | std::optional<uint64_t> OffsetVal = getConstantZext32Val(Reg: Root.getReg(), MRI: *MRI); |
5307 | if (!OffsetVal) |
5308 | return {}; |
5309 | |
5310 | std::optional<int64_t> EncodedImm = |
5311 | AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal); |
5312 | if (!EncodedImm) |
5313 | return {}; |
5314 | |
5315 | return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedImm); } }}; |
5316 | } |
5317 | |
5318 | InstructionSelector::ComplexRendererFns |
5319 | AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const { |
5320 | // Match the (soffset + offset) pair as a 32-bit register base and |
5321 | // an immediate offset. |
5322 | Register SOffset; |
5323 | unsigned Offset; |
5324 | std::tie(args&: SOffset, args&: Offset) = AMDGPU::getBaseWithConstantOffset( |
5325 | MRI&: *MRI, Reg: Root.getReg(), KnownBits: KB, /*CheckNUW*/ true); |
5326 | if (!SOffset) |
5327 | return std::nullopt; |
5328 | |
5329 | std::optional<int64_t> EncodedOffset = |
5330 | AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true); |
5331 | if (!EncodedOffset) |
5332 | return std::nullopt; |
5333 | |
5334 | assert(MRI->getType(SOffset) == LLT::scalar(32)); |
5335 | return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: SOffset); }, |
5336 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: *EncodedOffset); }}}; |
5337 | } |
5338 | |
5339 | // Variant of stripBitCast that returns the instruction instead of a |
5340 | // MachineOperand. |
5341 | static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) { |
5342 | if (MI->getOpcode() == AMDGPU::G_BITCAST) |
5343 | return getDefIgnoringCopies(Reg: MI->getOperand(i: 1).getReg(), MRI); |
5344 | return MI; |
5345 | } |
5346 | |
// Figure out if this is really an extract of the high 16 bits of a dword,
// returning nullptr if it isn't.
static MachineInstr *isExtractHiElt(MachineInstr *Inst,
                                    MachineRegisterInfo &MRI) {
5351 | Inst = stripBitCast(MI: Inst, MRI); |
5352 | |
5353 | if (Inst->getOpcode() != AMDGPU::G_TRUNC) |
5354 | return nullptr; |
5355 | |
5356 | MachineInstr *TruncOp = |
5357 | getDefIgnoringCopies(Reg: Inst->getOperand(i: 1).getReg(), MRI); |
5358 | TruncOp = stripBitCast(MI: TruncOp, MRI); |
5359 | |
5360 | // G_LSHR x, (G_CONSTANT i32 16) |
5361 | if (TruncOp->getOpcode() == AMDGPU::G_LSHR) { |
5362 | auto SrlAmount = getIConstantVRegValWithLookThrough( |
5363 | VReg: TruncOp->getOperand(i: 2).getReg(), MRI); |
5364 | if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) { |
5365 | MachineInstr *SrlOp = |
5366 | getDefIgnoringCopies(Reg: TruncOp->getOperand(i: 1).getReg(), MRI); |
5367 | return stripBitCast(MI: SrlOp, MRI); |
5368 | } |
5369 | } |
5370 | |
5371 | // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0) |
5372 | // 1, 0 swaps the low/high 16 bits. |
5373 | // 1, 1 sets the high 16 bits to be the same as the low 16. |
  // In any case, it selects the high elements.
5375 | if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) { |
5376 | assert(MRI.getType(TruncOp->getOperand(0).getReg()) == |
5377 | LLT::fixed_vector(2, 16)); |
5378 | |
5379 | ArrayRef<int> Mask = TruncOp->getOperand(i: 3).getShuffleMask(); |
5380 | assert(Mask.size() == 2); |
5381 | |
5382 | if (Mask[0] == 1 && Mask[1] <= 1) { |
5383 | MachineInstr *LHS = |
5384 | getDefIgnoringCopies(Reg: TruncOp->getOperand(i: 1).getReg(), MRI); |
5385 | return stripBitCast(MI: LHS, MRI); |
5386 | } |
5387 | } |
5388 | |
5389 | return nullptr; |
5390 | } |
5391 | |
5392 | std::pair<Register, unsigned> |
5393 | AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, |
5394 | bool &Matched) const { |
5395 | Matched = false; |
5396 | |
5397 | Register Src; |
5398 | unsigned Mods; |
5399 | std::tie(args&: Src, args&: Mods) = selectVOP3ModsImpl(Root); |
5400 | |
5401 | MachineInstr *MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI); |
5402 | if (MI->getOpcode() == AMDGPU::G_FPEXT) { |
5403 | MachineOperand *MO = &MI->getOperand(i: 1); |
5404 | Src = MO->getReg(); |
5405 | MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI); |
5406 | |
5407 | assert(MRI->getType(Src) == LLT::scalar(16)); |
5408 | |
5409 | // See through bitcasts. |
5410 | // FIXME: Would be nice to use stripBitCast here. |
5411 | if (MI->getOpcode() == AMDGPU::G_BITCAST) { |
5412 | MO = &MI->getOperand(i: 1); |
5413 | Src = MO->getReg(); |
5414 | MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI); |
5415 | } |
5416 | |
5417 | const auto CheckAbsNeg = [&]() { |
5418 | // Be careful about folding modifiers if we already have an abs. fneg is |
5419 | // applied last, so we don't want to apply an earlier fneg. |
5420 | if ((Mods & SISrcMods::ABS) == 0) { |
5421 | unsigned ModsTmp; |
5422 | std::tie(args&: Src, args&: ModsTmp) = selectVOP3ModsImpl(Root&: *MO); |
5423 | MI = getDefIgnoringCopies(Reg: Src, MRI: *MRI); |
5424 | |
5425 | if ((ModsTmp & SISrcMods::NEG) != 0) |
5426 | Mods ^= SISrcMods::NEG; |
5427 | |
5428 | if ((ModsTmp & SISrcMods::ABS) != 0) |
5429 | Mods |= SISrcMods::ABS; |
5430 | } |
5431 | }; |
5432 | |
5433 | CheckAbsNeg(); |
5434 | |
5435 | // op_sel/op_sel_hi decide the source type and source. |
5436 | // If the source's op_sel_hi is set, it indicates to do a conversion from |
    // fp16. If the source's op_sel is set, it picks the high half of the
5438 | // source register. |
5439 | |
5440 | Mods |= SISrcMods::OP_SEL_1; |
5441 | |
    if (MachineInstr *ExtractHiEltMI = isExtractHiElt(Inst: MI, MRI&: *MRI)) {
5443 | Mods |= SISrcMods::OP_SEL_0; |
5444 | MI = ExtractHiEltMI; |
5445 | MO = &MI->getOperand(i: 0); |
5446 | Src = MO->getReg(); |
5447 | |
5448 | CheckAbsNeg(); |
5449 | } |
5450 | |
5451 | Matched = true; |
5452 | } |
5453 | |
5454 | return {Src, Mods}; |
5455 | } |
5456 | |
5457 | InstructionSelector::ComplexRendererFns |
5458 | AMDGPUInstructionSelector::selectVOP3PMadMixModsExt( |
5459 | MachineOperand &Root) const { |
5460 | Register Src; |
5461 | unsigned Mods; |
5462 | bool Matched; |
5463 | std::tie(args&: Src, args&: Mods) = selectVOP3PMadMixModsImpl(Root, Matched); |
5464 | if (!Matched) |
5465 | return {}; |
5466 | |
5467 | return {{ |
5468 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }, |
5469 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods |
5470 | }}; |
5471 | } |
5472 | |
5473 | InstructionSelector::ComplexRendererFns |
5474 | AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const { |
5475 | Register Src; |
5476 | unsigned Mods; |
5477 | bool Matched; |
5478 | std::tie(args&: Src, args&: Mods) = selectVOP3PMadMixModsImpl(Root, Matched); |
5479 | |
5480 | return {{ |
5481 | [=](MachineInstrBuilder &MIB) { MIB.addReg(RegNo: Src); }, |
5482 | [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Mods); } // src_mods |
5483 | }}; |
5484 | } |
5485 | |
5486 | bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst( |
5487 | MachineInstr &I, Intrinsic::ID IntrID) const { |
5488 | MachineBasicBlock *MBB = I.getParent(); |
5489 | const DebugLoc &DL = I.getDebugLoc(); |
5490 | Register CCReg = I.getOperand(i: 0).getReg(); |
5491 | |
5492 | bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var; |
5493 | |
5494 | if (HasM0) { |
5495 | auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) |
5496 | .addReg(I.getOperand(2).getReg()); |
5497 | BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0)); |
5498 | if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI)) |
5499 | return false; |
5500 | } else { |
5501 | BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM)) |
5502 | .addImm(I.getOperand(2).getImm()); |
5503 | } |
5504 | |
5505 | BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC); |
5506 | |
5507 | I.eraseFromParent(); |
5508 | return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass, |
5509 | *MRI); |
5510 | } |
5511 | |
5512 | unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { |
5513 | if (HasInlineConst) { |
5514 | switch (IntrID) { |
5515 | default: |
5516 | llvm_unreachable("not a named barrier op" ); |
5517 | case Intrinsic::amdgcn_s_barrier_init: |
5518 | return AMDGPU::S_BARRIER_INIT_IMM; |
5519 | case Intrinsic::amdgcn_s_barrier_join: |
5520 | return AMDGPU::S_BARRIER_JOIN_IMM; |
5521 | case Intrinsic::amdgcn_s_wakeup_barrier: |
5522 | return AMDGPU::S_WAKEUP_BARRIER_IMM; |
5523 | case Intrinsic::amdgcn_s_get_barrier_state: |
5524 | return AMDGPU::S_GET_BARRIER_STATE_IMM; |
5525 | }; |
5526 | } else { |
5527 | switch (IntrID) { |
5528 | default: |
5529 | llvm_unreachable("not a named barrier op" ); |
5530 | case Intrinsic::amdgcn_s_barrier_init: |
5531 | return AMDGPU::S_BARRIER_INIT_M0; |
5532 | case Intrinsic::amdgcn_s_barrier_join: |
5533 | return AMDGPU::S_BARRIER_JOIN_M0; |
5534 | case Intrinsic::amdgcn_s_wakeup_barrier: |
5535 | return AMDGPU::S_WAKEUP_BARRIER_M0; |
5536 | case Intrinsic::amdgcn_s_get_barrier_state: |
5537 | return AMDGPU::S_GET_BARRIER_STATE_M0; |
5538 | }; |
5539 | } |
5540 | } |
5541 | |
5542 | bool AMDGPUInstructionSelector::selectNamedBarrierInst( |
5543 | MachineInstr &I, Intrinsic::ID IntrID) const { |
5544 | MachineBasicBlock *MBB = I.getParent(); |
5545 | const DebugLoc &DL = I.getDebugLoc(); |
5546 | MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state |
5547 | ? I.getOperand(2) |
5548 | : I.getOperand(1); |
5549 | std::optional<int64_t> BarValImm = |
5550 | getIConstantVRegSExtVal(VReg: BarOp.getReg(), MRI: *MRI); |
5551 | Register M0Val; |
5552 | Register TmpReg0; |
5553 | |
  // For S_BARRIER_INIT, the member count will always be read from M0[16:22].
5555 | if (IntrID == Intrinsic::amdgcn_s_barrier_init) { |
5556 | Register MemberCount = I.getOperand(i: 2).getReg(); |
5557 | TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
    // TODO: This should be expanded during legalization so that the S_LSHL
    // and S_OR can be constant-folded.
    // Shift the member count into M0[16:22]. S_LSHL_B32 computes src0 << src1,
    // so the member count must be the first source operand.
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
        .addReg(MemberCount)
        .addImm(16);
5563 | M0Val = TmpReg0; |
5564 | } |
5565 | |
5566 | // If not inlinable, get reference to barrier depending on the instruction |
5567 | if (!BarValImm) { |
5568 | if (IntrID == Intrinsic::amdgcn_s_barrier_init) { |
5569 | // If reference to barrier id is not an inlinable constant then it must be |
5570 | // referenced with M0[4:0]. Perform an OR with the member count to include |
5571 | // it in M0 for S_BARRIER_INIT. |
5572 | Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
5573 | BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1) |
5574 | .addReg(BarOp.getReg()) |
5575 | .addReg(TmpReg0); |
5576 | M0Val = TmpReg1; |
5577 | } else { |
5578 | M0Val = BarOp.getReg(); |
5579 | } |
5580 | } |
5581 | |
5582 | // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required. |
5583 | if (M0Val) { |
5584 | auto CopyMIB = |
5585 | BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val); |
5586 | constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI); |
5587 | } |
5588 | |
5589 | MachineInstrBuilder MIB; |
  unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5591 | MIB = BuildMI(*MBB, &I, DL, TII.get(Opc)); |
5592 | |
5593 | if (IntrID == Intrinsic::amdgcn_s_get_barrier_state) |
    MIB.addDef(I.getOperand(0).getReg());
5595 | |
5596 | if (BarValImm) |
    MIB.addImm(*BarValImm);
5598 | |
5599 | I.eraseFromParent(); |
5600 | return true; |
5601 | } |
5602 | |
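// Select s_barrier_leave; the instruction's SCC result is copied into the
// destination register.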
5603 | bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const { |
5604 | MachineBasicBlock *BB = I.getParent(); |
5605 | const DebugLoc &DL = I.getDebugLoc(); |
  Register CCReg = I.getOperand(0).getReg();
5607 | |
5608 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE)); |
5609 | BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC); |
5610 | |
5611 | I.eraseFromParent(); |
5612 | return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass, |
5613 | *MRI); |
5614 | } |
5615 | |
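// Custom operand renderers. For the G_CONSTANT/G_FCONSTANT renderers below,
// an OpIdx of -1 indicates the immediate is derived from the constant
// instruction itself rather than from a specific operand index.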
5616 | void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, |
5617 | const MachineInstr &MI, |
5618 | int OpIdx) const { |
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5622 | } |
5623 | |
5624 | void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, |
5625 | const MachineInstr &MI, |
5626 | int OpIdx) const { |
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5630 | } |
5631 | |
5632 | void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB, |
5633 | const MachineInstr &MI, |
5634 | int OpIdx) const { |
5635 | assert(OpIdx == -1); |
5636 | |
  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
    MIB.addImm(Op.getCImm()->getSExtValue());
5643 | } |
5644 | } |
5645 | |
5646 | void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, |
5647 | const MachineInstr &MI, |
5648 | int OpIdx) const { |
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5652 | } |
5653 | |
/// This only really exists to satisfy DAG type checking machinery, so it is a
/// no-op here.
5656 | void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, |
5657 | const MachineInstr &MI, |
5658 | int OpIdx) const { |
  MIB.addImm(MI.getOperand(OpIdx).getImm());
5660 | } |
5661 | |
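// Render a boolean op_sel operand as the OP_SEL_0 source-modifier bit.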
5662 | void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB, |
5663 | const MachineInstr &MI, |
5664 | int OpIdx) const { |
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5667 | } |
5668 | |
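// Mask a cache-policy immediate down to the CPol bits that are valid for the
// current subtarget; the encodable bits differ between pre-GFX12 and GFX12+.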
void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() &
             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                       : AMDGPU::CPol::ALL_pregfx12));
5676 | } |
5677 | |
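// Extract only the swizzle bit from a cache-policy immediate.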
void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const bool Swizzle = MI.getOperand(OpIdx).getImm() &
                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
                                                 : AMDGPU::CPol::SWZ_pregfx12);
  MIB.addImm(Swizzle);
5686 | } |
5687 | |
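// Extract the subtarget-valid cache-policy bits and additionally force the
// GLC bit on.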
void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                                  : AMDGPU::CPol::ALL_pregfx12);
  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5695 | } |
5696 | |
5697 | void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, |
5698 | const MachineInstr &MI, |
5699 | int OpIdx) const { |
  MIB.addFrameIndex(MI.getOperand(1).getIndex());
5701 | } |
5702 | |
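// Render a power-of-two FP constant as its exponent. The pattern is expected
// to only match exact powers of two, hence the assertion.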
5703 | void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB, |
5704 | const MachineInstr &MI, |
5705 | int OpIdx) const { |
  const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
  int ExpVal = APF.getExactLog2Abs();
  assert(ExpVal != INT_MIN);
  MIB.addImm(ExpVal);
5710 | } |
5711 | |
5712 | bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const { |
5713 | return TII.isInlineConstant(Imm); |
5714 | } |
5715 | |
5716 | bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const { |
5717 | return TII.isInlineConstant(Imm); |
5718 | } |
5719 | |