1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "MCTargetDesc/R600MCTargetDesc.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
23#include "SIMachineFunctionInfo.h"
24#include "llvm/Analysis/UniformityAnalysis.h"
25#include "llvm/Analysis/ValueTracking.h"
26#include "llvm/CodeGen/FunctionLoweringInfo.h"
27#include "llvm/CodeGen/SelectionDAG.h"
28#include "llvm/CodeGen/SelectionDAGISel.h"
29#include "llvm/CodeGen/SelectionDAGNodes.h"
30#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include "llvm/InitializePasses.h"
32#include "llvm/Support/ErrorHandling.h"
33
34#ifdef EXPENSIVE_CHECKS
35#include "llvm/Analysis/LoopInfo.h"
36#include "llvm/IR/Dominators.h"
37#endif
38
39#define DEBUG_TYPE "amdgpu-isel"
40
41using namespace llvm;
42
43//===----------------------------------------------------------------------===//
44// Instruction Selector Implementation
45//===----------------------------------------------------------------------===//
46
47namespace {
48static SDValue stripBitcast(SDValue Val) {
49 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(i: 0) : Val;
50}
51
// Figure out if this is really an extract of the high 16 bits of a dword.
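// For example, both of these forms identify the high half of a 32-bit value
// %reg:
//   (extract_vector_elt (v2i16 %reg), 1)
//   (trunc (srl (i32 %reg), 16))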
53static bool isExtractHiElt(SDValue In, SDValue &Out) {
54 In = stripBitcast(Val: In);
55
56 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
57 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Val: In.getOperand(i: 1))) {
58 if (!Idx->isOne())
59 return false;
60 Out = In.getOperand(i: 0);
61 return true;
62 }
63 }
64
65 if (In.getOpcode() != ISD::TRUNCATE)
66 return false;
67
68 SDValue Srl = In.getOperand(i: 0);
69 if (Srl.getOpcode() == ISD::SRL) {
70 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: 1))) {
71 if (ShiftAmt->getZExtValue() == 16) {
72 Out = stripBitcast(Val: Srl.getOperand(i: 0));
73 return true;
74 }
75 }
76 }
77
78 return false;
79}
80
// Look through operations that obscure an access to just the low 16 bits of
// the same register.
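// For example, (extract_vector_elt (v2i16 %reg), 0) and (trunc (i32 %reg))
// both fold back to the wider %reg value.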
83static SDValue stripExtractLoElt(SDValue In) {
84 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
85 SDValue Idx = In.getOperand(i: 1);
86 if (isNullConstant(V: Idx) && In.getValueSizeInBits() <= 32)
87 return In.getOperand(i: 0);
88 }
89
90 if (In.getOpcode() == ISD::TRUNCATE) {
91 SDValue Src = In.getOperand(i: 0);
92 if (Src.getValueType().getSizeInBits() == 32)
93 return stripBitcast(Val: Src);
94 }
95
96 return In;
97}
98
99} // end anonymous namespace
100
101INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
102 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
103INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
104INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
105INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
106#ifdef EXPENSIVE_CHECKS
107INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
108INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
109#endif
110INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
111 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
112
/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
115FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
116 CodeGenOptLevel OptLevel) {
117 return new AMDGPUDAGToDAGISel(TM, OptLevel);
118}
119
120AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
121 CodeGenOptLevel OptLevel)
122 : SelectionDAGISel(ID, TM, OptLevel) {
123 EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
124}
125
126bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
127#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
130 for (auto &L : LI->getLoopsInPreorder()) {
131 assert(L->isLCSSAForm(DT));
132 }
133#endif
134 Subtarget = &MF.getSubtarget<GCNSubtarget>();
135 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
136 return SelectionDAGISel::runOnMachineFunction(MF);
137}
138
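// Returns true if an f16 operation with this opcode is known to write zeros
// into the high 16 bits of the 32-bit register holding its result, rather
// than preserving or clobbering them.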
139bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
140 // XXX - only need to list legal operations.
141 switch (Opc) {
142 case ISD::FADD:
143 case ISD::FSUB:
144 case ISD::FMUL:
145 case ISD::FDIV:
146 case ISD::FREM:
147 case ISD::FCANONICALIZE:
148 case ISD::UINT_TO_FP:
149 case ISD::SINT_TO_FP:
150 case ISD::FABS:
151 // Fabs is lowered to a bit operation, but it's an and which will clear the
152 // high bits anyway.
153 case ISD::FSQRT:
154 case ISD::FSIN:
155 case ISD::FCOS:
156 case ISD::FPOWI:
157 case ISD::FPOW:
158 case ISD::FLOG:
159 case ISD::FLOG2:
160 case ISD::FLOG10:
161 case ISD::FEXP:
162 case ISD::FEXP2:
163 case ISD::FCEIL:
164 case ISD::FTRUNC:
165 case ISD::FRINT:
166 case ISD::FNEARBYINT:
167 case ISD::FROUNDEVEN:
168 case ISD::FROUND:
169 case ISD::FFLOOR:
170 case ISD::FMINNUM:
171 case ISD::FMAXNUM:
172 case ISD::FLDEXP:
173 case AMDGPUISD::FRACT:
174 case AMDGPUISD::CLAMP:
175 case AMDGPUISD::COS_HW:
176 case AMDGPUISD::SIN_HW:
177 case AMDGPUISD::FMIN3:
178 case AMDGPUISD::FMAX3:
179 case AMDGPUISD::FMED3:
180 case AMDGPUISD::FMAD_FTZ:
181 case AMDGPUISD::RCP:
182 case AMDGPUISD::RSQ:
183 case AMDGPUISD::RCP_IFLAG:
184 // On gfx10, all 16-bit instructions preserve the high bits.
185 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
186 case ISD::FP_ROUND:
187 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
188 // high bits on gfx9.
189 // TODO: If we had the source node we could see if the source was fma/mad
190 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
191 case ISD::FMA:
192 case ISD::FMAD:
193 case AMDGPUISD::DIV_FIXUP:
194 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
195 default:
196 // fcopysign, select and others may be lowered to 32-bit bit operations
197 // which don't zero the high bits.
198 return false;
199 }
200}
201
202void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
203 AU.addRequired<AMDGPUArgumentUsageInfo>();
204 AU.addRequired<UniformityInfoWrapperPass>();
205#ifdef EXPENSIVE_CHECKS
206 AU.addRequired<DominatorTreeWrapperPass>();
207 AU.addRequired<LoopInfoWrapperPass>();
208#endif
209 SelectionDAGISel::getAnalysisUsage(AU);
210}
211
212bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
213 assert(Subtarget->d16PreservesUnusedBits());
214 MVT VT = N->getValueType(ResNo: 0).getSimpleVT();
215 if (VT != MVT::v2i16 && VT != MVT::v2f16)
216 return false;
217
218 SDValue Lo = N->getOperand(Num: 0);
219 SDValue Hi = N->getOperand(Num: 1);
220
221 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(Val: stripBitcast(Val: Hi));
222
223 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
224 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
225 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
226
227 // Need to check for possible indirect dependencies on the other half of the
228 // vector to avoid introducing a cycle.
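  // (If Lo transitively depended on LdHi, replacing LdHi's chain result with
  // the new load's chain would make Lo a user of a node that also uses Lo,
  // creating a cycle in the DAG.)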
229 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(N: Lo.getNode())) {
230 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
231
232 SDValue TiedIn = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SDLoc(N), VT, Operand: Lo);
233 SDValue Ops[] = {
234 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
235 };
236
237 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
238 if (LdHi->getMemoryVT() == MVT::i8) {
239 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
240 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
241 } else {
242 assert(LdHi->getMemoryVT() == MVT::i16);
243 }
244
245 SDValue NewLoadHi =
246 CurDAG->getMemIntrinsicNode(Opcode: LoadOp, dl: SDLoc(LdHi), VTList,
247 Ops, MemVT: LdHi->getMemoryVT(),
248 MMO: LdHi->getMemOperand());
249
250 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: NewLoadHi);
251 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(LdHi, 1), To: NewLoadHi.getValue(R: 1));
252 return true;
253 }
254
255 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
256 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
257 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
258 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(Val: stripBitcast(Val: Lo));
259 if (LdLo && Lo.hasOneUse()) {
260 SDValue TiedIn = getHi16Elt(In: Hi);
261 if (!TiedIn || LdLo->isPredecessorOf(N: TiedIn.getNode()))
262 return false;
263
264 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
265 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
266 if (LdLo->getMemoryVT() == MVT::i8) {
267 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
268 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
269 } else {
270 assert(LdLo->getMemoryVT() == MVT::i16);
271 }
272
273 TiedIn = CurDAG->getNode(Opcode: ISD::BITCAST, DL: SDLoc(N), VT, Operand: TiedIn);
274
275 SDValue Ops[] = {
276 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
277 };
278
279 SDValue NewLoadLo =
280 CurDAG->getMemIntrinsicNode(Opcode: LoadOp, dl: SDLoc(LdLo), VTList,
281 Ops, MemVT: LdLo->getMemoryVT(),
282 MMO: LdLo->getMemOperand());
283
284 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: NewLoadLo);
285 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(LdLo, 1), To: NewLoadLo.getValue(R: 1));
286 return true;
287 }
288
289 return false;
290}
291
292void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
293 if (!Subtarget->d16PreservesUnusedBits())
294 return;
295
296 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
297
298 bool MadeChange = false;
299 while (Position != CurDAG->allnodes_begin()) {
300 SDNode *N = &*--Position;
301 if (N->use_empty())
302 continue;
303
304 switch (N->getOpcode()) {
305 case ISD::BUILD_VECTOR:
306 // TODO: Match load d16 from shl (extload:i16), 16
307 MadeChange |= matchLoadD16FromBuildVector(N);
308 break;
309 default:
310 break;
311 }
312 }
313
314 if (MadeChange) {
315 CurDAG->RemoveDeadNodes();
316 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
317 CurDAG->dump(););
318 }
319}
320
321bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
322 if (N->isUndef())
323 return true;
324
325 const SIInstrInfo *TII = Subtarget->getInstrInfo();
326 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N))
327 return TII->isInlineConstant(Imm: C->getAPIntValue());
328
329 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val: N))
330 return TII->isInlineConstant(Imm: C->getValueAPF());
331
332 return false;
333}
334
335/// Determine the register class for \p OpNo
336/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo, or nullptr if the register class cannot
/// be determined.
339const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
340 unsigned OpNo) const {
341 if (!N->isMachineOpcode()) {
342 if (N->getOpcode() == ISD::CopyToReg) {
343 Register Reg = cast<RegisterSDNode>(Val: N->getOperand(Num: 1))->getReg();
344 if (Reg.isVirtual()) {
345 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
346 return MRI.getRegClass(Reg);
347 }
348
349 const SIRegisterInfo *TRI
350 = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
351 return TRI->getPhysRegBaseClass(Reg);
352 }
353
354 return nullptr;
355 }
356
357 switch (N->getMachineOpcode()) {
358 default: {
359 const MCInstrDesc &Desc =
360 Subtarget->getInstrInfo()->get(N->getMachineOpcode());
361 unsigned OpIdx = Desc.getNumDefs() + OpNo;
362 if (OpIdx >= Desc.getNumOperands())
363 return nullptr;
364 int RegClass = Desc.operands()[OpIdx].RegClass;
365 if (RegClass == -1)
366 return nullptr;
367
368 return Subtarget->getRegisterInfo()->getRegClass(RCID: RegClass);
369 }
370 case AMDGPU::REG_SEQUENCE: {
371 unsigned RCID = N->getConstantOperandVal(Num: 0);
372 const TargetRegisterClass *SuperRC =
373 Subtarget->getRegisterInfo()->getRegClass(RCID);
374
375 SDValue SubRegOp = N->getOperand(Num: OpNo + 1);
376 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
377 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
378 SubRegIdx);
379 }
380 }
381}
382
383SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
384 SDValue Glue) const {
385 SmallVector <SDValue, 8> Ops;
386 Ops.push_back(Elt: NewChain); // Replace the chain.
387 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
388 Ops.push_back(Elt: N->getOperand(Num: i));
389
390 Ops.push_back(Elt: Glue);
391 return CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops);
392}
393
394SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
395 const SITargetLowering& Lowering =
396 *static_cast<const SITargetLowering*>(getTargetLowering());
397
398 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
399
400 SDValue M0 = Lowering.copyToM0(DAG&: *CurDAG, Chain: N->getOperand(Num: 0), DL: SDLoc(N), V: Val);
401 return glueCopyToOp(N, NewChain: M0, Glue: M0.getValue(R: 1));
402}
403
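// DS accesses on subtargets that require it initialize M0 first: LDS accesses
// set it to -1 (effectively disabling the M0-based bounds clamp), while region
// (GDS) accesses load the function's GDS size into M0.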
404SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
405 unsigned AS = cast<MemSDNode>(Val: N)->getAddressSpace();
406 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
407 if (Subtarget->ldsRequiresM0Init())
408 return glueCopyToM0(N, Val: CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
409 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
410 MachineFunction &MF = CurDAG->getMachineFunction();
411 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
412 return
413 glueCopyToM0(N, Val: CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
414 }
415 return N;
416}
417
418MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
419 EVT VT) const {
420 SDNode *Lo = CurDAG->getMachineNode(
421 AMDGPU::S_MOV_B32, DL, MVT::i32,
422 CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
423 SDNode *Hi =
424 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
425 CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
426 const SDValue Ops[] = {
427 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
428 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
429 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
430
431 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
432}
433
434void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
435 EVT VT = N->getValueType(ResNo: 0);
436 unsigned NumVectorElts = VT.getVectorNumElements();
437 EVT EltVT = VT.getVectorElementType();
438 SDLoc DL(N);
439 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
440
441 if (NumVectorElts == 1) {
442 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::COPY_TO_REGCLASS, VT: EltVT, Op1: N->getOperand(Num: 0),
443 Op2: RegClass);
444 return;
445 }
446
447 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
448 "supported yet");
449 // 32 = Max Num Vector Elements
450 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
451 // 1 = Vector Register Class
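  // For example, a v4i32 build_vector produces REG_SEQUENCE operands of the
  // form { RC, Elt0, sub0, Elt1, sub1, Elt2, sub2, Elt3, sub3 }.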
452 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
453
454 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
455 Triple::amdgcn;
456 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
457 bool IsRegSeq = true;
458 unsigned NOps = N->getNumOperands();
459 for (unsigned i = 0; i < NOps; i++) {
460 // XXX: Why is this here?
461 if (isa<RegisterSDNode>(Val: N->getOperand(Num: i))) {
462 IsRegSeq = false;
463 break;
464 }
465 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
466 : R600RegisterInfo::getSubRegFromChannel(Channel: i);
467 RegSeqArgs[1 + (2 * i)] = N->getOperand(Num: i);
468 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
469 }
470 if (NOps != NumVectorElts) {
471 // Fill in the missing undef elements if this was a scalar_to_vector.
472 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
473 MachineSDNode *ImpDef = CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF,
474 dl: DL, VT: EltVT);
475 for (unsigned i = NOps; i < NumVectorElts; ++i) {
476 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(Channel: i)
477 : R600RegisterInfo::getSubRegFromChannel(Channel: i);
478 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
479 RegSeqArgs[1 + (2 * i) + 1] =
480 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
481 }
482 }
483
484 if (!IsRegSeq)
485 SelectCode(N);
486 CurDAG->SelectNodeTo(N, MachineOpc: AMDGPU::REG_SEQUENCE, VTs: N->getVTList(), Ops: RegSeqArgs);
487}
488
489void AMDGPUDAGToDAGISel::Select(SDNode *N) {
490 unsigned int Opc = N->getOpcode();
491 if (N->isMachineOpcode()) {
492 N->setNodeId(-1);
493 return; // Already selected.
494 }
495
496 // isa<MemSDNode> almost works but is slightly too permissive for some DS
497 // intrinsics.
498 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(Val: N) ||
499 Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
500 Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) {
501 N = glueCopyToM0LDSInit(N);
502 SelectCode(N);
503 return;
504 }
505
506 switch (Opc) {
507 default:
508 break;
509 // We are selecting i64 ADD here instead of custom lower it during
510 // DAG legalization, so we can fold some i64 ADDs used for address
511 // calculation into the LOAD and STORE instructions.
512 case ISD::ADDC:
513 case ISD::ADDE:
514 case ISD::SUBC:
515 case ISD::SUBE: {
516 if (N->getValueType(ResNo: 0) != MVT::i64)
517 break;
518
519 SelectADD_SUB_I64(N);
520 return;
521 }
522 case ISD::UADDO_CARRY:
523 case ISD::USUBO_CARRY:
524 if (N->getValueType(ResNo: 0) != MVT::i32)
525 break;
526
527 SelectAddcSubb(N);
528 return;
529 case ISD::UADDO:
530 case ISD::USUBO: {
531 SelectUADDO_USUBO(N);
532 return;
533 }
534 case AMDGPUISD::FMUL_W_CHAIN: {
535 SelectFMUL_W_CHAIN(N);
536 return;
537 }
538 case AMDGPUISD::FMA_W_CHAIN: {
539 SelectFMA_W_CHAIN(N);
540 return;
541 }
542
543 case ISD::SCALAR_TO_VECTOR:
544 case ISD::BUILD_VECTOR: {
545 EVT VT = N->getValueType(ResNo: 0);
546 unsigned NumVectorElts = VT.getVectorNumElements();
547 if (VT.getScalarSizeInBits() == 16) {
548 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
549 if (SDNode *Packed = packConstantV2I16(N, DAG&: *CurDAG)) {
550 ReplaceNode(F: N, T: Packed);
551 return;
552 }
553 }
554
555 break;
556 }
557
558 assert(VT.getVectorElementType().bitsEq(MVT::i32));
559 unsigned RegClassID =
560 SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NumVectorElts * 32)->getID();
561 SelectBuildVector(N, RegClassID);
562 return;
563 }
564 case ISD::BUILD_PAIR: {
565 SDValue RC, SubReg0, SubReg1;
566 SDLoc DL(N);
567 if (N->getValueType(ResNo: 0) == MVT::i128) {
568 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
569 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
570 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
571 } else if (N->getValueType(0) == MVT::i64) {
572 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
573 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
574 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
575 } else {
576 llvm_unreachable("Unhandled value type for BUILD_PAIR");
577 }
578 const SDValue Ops[] = { RC, N->getOperand(Num: 0), SubReg0,
579 N->getOperand(Num: 1), SubReg1 };
580 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL,
581 VT: N->getValueType(ResNo: 0), Ops));
582 return;
583 }
584
585 case ISD::Constant:
586 case ISD::ConstantFP: {
587 if (N->getValueType(ResNo: 0).getSizeInBits() != 64 || isInlineImmediate(N))
588 break;
589
590 uint64_t Imm;
591 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Val: N)) {
592 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
593 if (AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: true))
594 break;
595 } else {
596 ConstantSDNode *C = cast<ConstantSDNode>(Val: N);
597 Imm = C->getZExtValue();
598 if (AMDGPU::isValid32BitLiteral(Val: Imm, IsFP64: false))
599 break;
600 }
601
602 SDLoc DL(N);
603 ReplaceNode(F: N, T: buildSMovImm64(DL, Imm, VT: N->getValueType(ResNo: 0)));
604 return;
605 }
606 case AMDGPUISD::BFE_I32:
607 case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version,
    // which has separate operands for the offset and width, the scalar version
    // packs the width and offset into a single operand. Try to move to the
    // scalar version if the offsets are constant, so that we can try to keep
    // extended loads of kernel arguments in SGPRs.
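    // For the scalar form the two values are packed as (Width << 16) | Offset
    // (see getBFE32), matching the S_BFE_* encoding of the offset in the low
    // bits and the width starting at bit 16.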
613
614 // TODO: Technically we could try to pattern match scalar bitshifts of
615 // dynamic values, but it's probably not useful.
616 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
617 if (!Offset)
618 break;
619
620 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
621 if (!Width)
622 break;
623
624 bool Signed = Opc == AMDGPUISD::BFE_I32;
625
626 uint32_t OffsetVal = Offset->getZExtValue();
627 uint32_t WidthVal = Width->getZExtValue();
628
629 ReplaceNode(F: N, T: getBFE32(IsSigned: Signed, DL: SDLoc(N), Val: N->getOperand(Num: 0), Offset: OffsetVal,
630 Width: WidthVal));
631 return;
632 }
633 case AMDGPUISD::DIV_SCALE: {
634 SelectDIV_SCALE(N);
635 return;
636 }
637 case AMDGPUISD::MAD_I64_I32:
638 case AMDGPUISD::MAD_U64_U32: {
639 SelectMAD_64_32(N);
640 return;
641 }
642 case ISD::SMUL_LOHI:
643 case ISD::UMUL_LOHI:
644 return SelectMUL_LOHI(N);
645 case ISD::CopyToReg: {
646 const SITargetLowering& Lowering =
647 *static_cast<const SITargetLowering*>(getTargetLowering());
648 N = Lowering.legalizeTargetIndependentNode(Node: N, DAG&: *CurDAG);
649 break;
650 }
651 case ISD::AND:
652 case ISD::SRL:
653 case ISD::SRA:
654 case ISD::SIGN_EXTEND_INREG:
655 if (N->getValueType(0) != MVT::i32)
656 break;
657
658 SelectS_BFE(N);
659 return;
660 case ISD::BRCOND:
661 SelectBRCOND(N);
662 return;
663 case ISD::FP_EXTEND:
664 SelectFP_EXTEND(N);
665 return;
666 case AMDGPUISD::CVT_PKRTZ_F16_F32:
667 case AMDGPUISD::CVT_PKNORM_I16_F32:
668 case AMDGPUISD::CVT_PKNORM_U16_F32:
669 case AMDGPUISD::CVT_PK_U16_U32:
670 case AMDGPUISD::CVT_PK_I16_I32: {
671 // Hack around using a legal type if f16 is illegal.
672 if (N->getValueType(0) == MVT::i32) {
673 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
674 N = CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: CurDAG->getVTList(VT: NewVT),
675 Ops: { N->getOperand(Num: 0), N->getOperand(Num: 1) });
676 SelectCode(N);
677 return;
678 }
679
680 break;
681 }
682 case ISD::INTRINSIC_W_CHAIN: {
683 SelectINTRINSIC_W_CHAIN(N);
684 return;
685 }
686 case ISD::INTRINSIC_WO_CHAIN: {
687 SelectINTRINSIC_WO_CHAIN(N);
688 return;
689 }
690 case ISD::INTRINSIC_VOID: {
691 SelectINTRINSIC_VOID(N);
692 return;
693 }
694 case AMDGPUISD::WAVE_ADDRESS: {
695 SelectWAVE_ADDRESS(N);
696 return;
697 }
698 case ISD::STACKRESTORE: {
699 SelectSTACKRESTORE(N);
700 return;
701 }
702 }
703
704 SelectCode(N);
705}
706
707bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
708 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
709 const Instruction *Term = BB->getTerminator();
710 return Term->getMetadata(Kind: "amdgpu.uniform") ||
711 Term->getMetadata(Kind: "structurizecfg.uniform");
712}
713
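// Returns true if an (and x, mask) used as a shift amount is redundant: the
// shift only reads the low ShAmtBits bits of its amount operand, so the mask
// may be dropped if it preserves at least that many low bits, either on its
// own or combined with the known-zero bits of x.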
714bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
715 unsigned ShAmtBits) const {
716 assert(N->getOpcode() == ISD::AND);
717
718 const APInt &RHS = N->getConstantOperandAPInt(Num: 1);
719 if (RHS.countr_one() >= ShAmtBits)
720 return true;
721
722 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero;
723 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
724}
725
726static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
727 SDValue &N0, SDValue &N1) {
728 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
729 Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it is a complicated pattern to match:
731 // (i64 (bitcast (v2i32 (build_vector
732 // (or (extract_vector_elt V, 0), OFFSET),
733 // (extract_vector_elt V, 1)))))
734 SDValue Lo = Addr.getOperand(i: 0).getOperand(i: 0);
735 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Op: Lo)) {
736 SDValue BaseLo = Lo.getOperand(i: 0);
737 SDValue BaseHi = Addr.getOperand(i: 0).getOperand(i: 1);
738 // Check that split base (Lo and Hi) are extracted from the same one.
739 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
740 BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
741 BaseLo.getOperand(i: 0) == BaseHi.getOperand(i: 0) &&
742 // Lo is statically extracted from index 0.
743 isa<ConstantSDNode>(Val: BaseLo.getOperand(i: 1)) &&
744 BaseLo.getConstantOperandVal(i: 1) == 0 &&
          // Hi is statically extracted from index 1.
746 isa<ConstantSDNode>(Val: BaseHi.getOperand(i: 1)) &&
747 BaseHi.getConstantOperandVal(i: 1) == 1) {
748 N0 = BaseLo.getOperand(i: 0).getOperand(i: 0);
749 N1 = Lo.getOperand(i: 1);
750 return true;
751 }
752 }
753 }
754 return false;
755}
756
757bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
758 SDValue &RHS) const {
759 if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
760 LHS = Addr.getOperand(i: 0);
761 RHS = Addr.getOperand(i: 1);
762 return true;
763 }
764
765 if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0&: LHS, N1&: RHS)) {
766 assert(LHS && RHS && isa<ConstantSDNode>(RHS));
767 return true;
768 }
769
770 return false;
771}
772
773StringRef AMDGPUDAGToDAGISel::getPassName() const {
774 return "AMDGPU DAG->DAG Pattern Instruction Selection";
775}
776
777//===----------------------------------------------------------------------===//
778// Complex Patterns
779//===----------------------------------------------------------------------===//
780
781bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
782 SDValue &Offset) {
783 return false;
784}
785
786bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
787 SDValue &Offset) {
788 ConstantSDNode *C;
789 SDLoc DL(Addr);
790
791 if ((C = dyn_cast<ConstantSDNode>(Val&: Addr))) {
792 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
793 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
794 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
795 (C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0)))) {
796 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
797 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
798 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
799 (C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1)))) {
800 Base = Addr.getOperand(i: 0);
801 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
802 } else {
803 Base = Addr;
804 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
805 }
806
807 return true;
808}
809
810SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
811 const SDLoc &DL) const {
812 SDNode *Mov = CurDAG->getMachineNode(
813 AMDGPU::S_MOV_B32, DL, MVT::i32,
814 CurDAG->getTargetConstant(Val, DL, MVT::i32));
815 return SDValue(Mov, 0);
816}
817
818// FIXME: Should only handle uaddo_carry/usubo_carry
819void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
820 SDLoc DL(N);
821 SDValue LHS = N->getOperand(Num: 0);
822 SDValue RHS = N->getOperand(Num: 1);
823
824 unsigned Opcode = N->getOpcode();
825 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
826 bool ProduceCarry =
827 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
828 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
829
830 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
831 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
832
833 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
834 DL, MVT::i32, LHS, Sub0);
835 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
836 DL, MVT::i32, LHS, Sub1);
837
838 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
839 DL, MVT::i32, RHS, Sub0);
840 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
841 DL, MVT::i32, RHS, Sub1);
842
843 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
844
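  // OpcMap[HasCarryIn][IsDivergent][IsAdd]: scalar vs. VALU opcodes for
  // sub/add, with and without an incoming carry.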
845 static const unsigned OpcMap[2][2][2] = {
846 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
847 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
848 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
849 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
850
851 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
852 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
853
854 SDNode *AddLo;
855 if (!ConsumeCarry) {
856 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
857 AddLo = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs: VTList, Ops: Args);
858 } else {
859 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(Num: 2) };
860 AddLo = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: Args);
861 }
862 SDValue AddHiArgs[] = {
863 SDValue(Hi0, 0),
864 SDValue(Hi1, 0),
865 SDValue(AddLo, 1)
866 };
867 SDNode *AddHi = CurDAG->getMachineNode(Opcode: CarryOpc, dl: DL, VTs: VTList, Ops: AddHiArgs);
868
869 SDValue RegSequenceArgs[] = {
870 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
871 SDValue(AddLo,0),
872 Sub0,
873 SDValue(AddHi,0),
874 Sub1,
875 };
876 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
877 MVT::i64, RegSequenceArgs);
878
879 if (ProduceCarry) {
880 // Replace the carry-use
881 ReplaceUses(F: SDValue(N, 1), T: SDValue(AddHi, 1));
882 }
883
884 // Replace the remaining uses.
885 ReplaceNode(F: N, T: RegSequence);
886}
887
888void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
889 SDLoc DL(N);
890 SDValue LHS = N->getOperand(Num: 0);
891 SDValue RHS = N->getOperand(Num: 1);
892 SDValue CI = N->getOperand(Num: 2);
893
894 if (N->isDivergent()) {
895 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
896 : AMDGPU::V_SUBB_U32_e64;
897 CurDAG->SelectNodeTo(
898 N, Opc, N->getVTList(),
899 {LHS, RHS, CI,
900 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
901 } else {
902 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
903 : AMDGPU::S_SUB_CO_PSEUDO;
904 CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops: {LHS, RHS, CI});
905 }
906}
907
908void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
910 // carry out despite the _i32 name. These were renamed in VI to _U32.
911 // FIXME: We should probably rename the opcodes here.
912 bool IsAdd = N->getOpcode() == ISD::UADDO;
913 bool IsVALU = N->isDivergent();
914
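  // Force the VALU form if the carry-out (result 1) has any user other than
  // the matching UADDO_CARRY/USUBO_CARRY opcode, since only those users are
  // set up to consume the scalar pseudo's carry output.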
915 for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
916 ++UI)
917 if (UI.getUse().getResNo() == 1) {
918 if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
919 (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
920 IsVALU = true;
921 break;
922 }
923 }
924
925 if (IsVALU) {
926 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
927
928 CurDAG->SelectNodeTo(
929 N, Opc, N->getVTList(),
930 {N->getOperand(0), N->getOperand(1),
931 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
932 } else {
933 unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
934 : AMDGPU::S_USUBO_PSEUDO;
935
936 CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(),
937 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1)});
938 }
939}
940
941void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
942 SDLoc SL(N);
943 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
944 SDValue Ops[10];
945
946 SelectVOP3Mods0(In: N->getOperand(Num: 1), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[6], Omod&: Ops[7]);
947 SelectVOP3Mods(In: N->getOperand(Num: 2), Src&: Ops[3], SrcMods&: Ops[2]);
948 SelectVOP3Mods(In: N->getOperand(Num: 3), Src&: Ops[5], SrcMods&: Ops[4]);
949 Ops[8] = N->getOperand(Num: 0);
950 Ops[9] = N->getOperand(Num: 4);
951
952 // If there are no source modifiers, prefer fmac over fma because it can use
953 // the smaller VOP2 encoding.
954 bool UseFMAC = Subtarget->hasDLInsts() &&
955 cast<ConstantSDNode>(Val&: Ops[0])->isZero() &&
956 cast<ConstantSDNode>(Val&: Ops[2])->isZero() &&
957 cast<ConstantSDNode>(Val&: Ops[4])->isZero();
958 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
959 CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops);
960}
961
962void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
963 SDLoc SL(N);
964 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
965 SDValue Ops[8];
966
967 SelectVOP3Mods0(In: N->getOperand(Num: 1), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[4], Omod&: Ops[5]);
968 SelectVOP3Mods(In: N->getOperand(Num: 2), Src&: Ops[3], SrcMods&: Ops[2]);
969 Ops[6] = N->getOperand(Num: 0);
970 Ops[7] = N->getOperand(Num: 3);
971
972 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
973}
974
975// We need to handle this here because tablegen doesn't support matching
976// instructions with multiple outputs.
977void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
978 SDLoc SL(N);
979 EVT VT = N->getValueType(ResNo: 0);
980
981 assert(VT == MVT::f32 || VT == MVT::f64);
982
983 unsigned Opc
984 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
985
986 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
987 // omod
988 SDValue Ops[8];
989 SelectVOP3BMods0(In: N->getOperand(Num: 0), Src&: Ops[1], SrcMods&: Ops[0], Clamp&: Ops[6], Omod&: Ops[7]);
990 SelectVOP3BMods(In: N->getOperand(Num: 1), Src&: Ops[3], SrcMods&: Ops[2]);
991 SelectVOP3BMods(In: N->getOperand(Num: 2), Src&: Ops[5], SrcMods&: Ops[4]);
992 CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
993}
994
995// We need to handle this here because tablegen doesn't support matching
996// instructions with multiple outputs.
997void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
998 SDLoc SL(N);
999 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1000 unsigned Opc;
1001 if (Subtarget->hasMADIntraFwdBug())
1002 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1003 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1004 else
1005 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1006
1007 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1008 SDValue Ops[] = { N->getOperand(Num: 0), N->getOperand(Num: 1), N->getOperand(Num: 2),
1009 Clamp };
1010 CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
1011}
1012
1013// We need to handle this here because tablegen doesn't support matching
1014// instructions with multiple outputs.
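// mul_lohi is selected as a mad_64_32 with a zero addend; the low and high
// halves of the 64-bit product are then split back out with EXTRACT_SUBREG.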
1015void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1016 SDLoc SL(N);
1017 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1018 unsigned Opc;
1019 if (Subtarget->hasMADIntraFwdBug())
1020 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1021 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1022 else
1023 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1024
1025 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1026 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1027 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), Zero, Clamp};
1028 SDNode *Mad = CurDAG->getMachineNode(Opcode: Opc, dl: SL, VTs: N->getVTList(), Ops);
1029 if (!SDValue(N, 0).use_empty()) {
1030 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1031 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1032 MVT::i32, SDValue(Mad, 0), Sub0);
1033 ReplaceUses(F: SDValue(N, 0), T: SDValue(Lo, 0));
1034 }
1035 if (!SDValue(N, 1).use_empty()) {
1036 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1037 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1038 MVT::i32, SDValue(Mad, 0), Sub1);
1039 ReplaceUses(F: SDValue(N, 1), T: SDValue(Hi, 0));
1040 }
1041 CurDAG->RemoveDeadNode(N);
1042}
1043
1044bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1045 if (!isUInt<16>(x: Offset))
1046 return false;
1047
1048 if (!Base || Subtarget->hasUsableDSOffset() ||
1049 Subtarget->unsafeDSOffsetFoldingEnabled())
1050 return true;
1051
  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
1054 return CurDAG->SignBitIsZero(Op: Base);
1055}
1056
1057bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1058 SDValue &Offset) const {
1059 SDLoc DL(Addr);
1060 if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1061 SDValue N0 = Addr.getOperand(i: 0);
1062 SDValue N1 = Addr.getOperand(i: 1);
1063 ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
1064 if (isDSOffsetLegal(Base: N0, Offset: C1->getSExtValue())) {
1065 // (add n0, c0)
1066 Base = N0;
1067 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1068 return true;
1069 }
1070 } else if (Addr.getOpcode() == ISD::SUB) {
1071 // sub C, x -> add (sub 0, x), C
1072 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0))) {
1073 int64_t ByteOffset = C->getSExtValue();
1074 if (isDSOffsetLegal(Base: SDValue(), Offset: ByteOffset)) {
1075 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1076
1077 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1078 // the known bits in isDSOffsetLegal. We need to emit the selected node
1079 // here, so this is thrown away.
1080 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1081 Zero, Addr.getOperand(1));
1082
1083 if (isDSOffsetLegal(Base: Sub, Offset: ByteOffset)) {
1084 SmallVector<SDValue, 3> Opnds;
1085 Opnds.push_back(Elt: Zero);
1086 Opnds.push_back(Elt: Addr.getOperand(i: 1));
1087
1088 // FIXME: Select to VOP3 version for with-carry.
1089 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1090 if (Subtarget->hasAddNoCarry()) {
1091 SubOp = AMDGPU::V_SUB_U32_e64;
1092 Opnds.push_back(
1093 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1094 }
1095
1096 MachineSDNode *MachineSub =
1097 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1098
1099 Base = SDValue(MachineSub, 0);
1100 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1101 return true;
1102 }
1103 }
1104 }
1105 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
1106 // If we have a constant address, prefer to put the constant into the
1107 // offset. This can save moves to load the constant address since multiple
1108 // operations can share the zero base address register, and enables merging
1109 // into read2 / write2 instructions.
1110
1111 SDLoc DL(Addr);
1112
1113 if (isDSOffsetLegal(Base: SDValue(), Offset: CAddr->getZExtValue())) {
1114 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1115 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1116 DL, MVT::i32, Zero);
1117 Base = SDValue(MovZero, 0);
1118 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1119 return true;
1120 }
1121 }
1122
1123 // default case
1124 Base = Addr;
1125 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1126 return true;
1127}
1128
1129bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1130 unsigned Offset1,
1131 unsigned Size) const {
1132 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1133 return false;
1134 if (!isUInt<8>(x: Offset0 / Size) || !isUInt<8>(x: Offset1 / Size))
1135 return false;
1136
1137 if (!Base || Subtarget->hasUsableDSOffset() ||
1138 Subtarget->unsafeDSOffsetFoldingEnabled())
1139 return true;
1140
  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
1143 return CurDAG->SignBitIsZero(Op: Base);
1144}
1145
// Return whether the operation has the NoUnsignedWrap property.
1147static bool isNoUnsignedWrap(SDValue Addr) {
1148 return (Addr.getOpcode() == ISD::ADD &&
1149 Addr->getFlags().hasNoUnsignedWrap()) ||
1150 Addr->getOpcode() == ISD::OR;
1151}
1152
// Check that the base address of a flat scratch load/store in the form of
// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the
// hardware requirement). We always treat the first operand as the base address
// here.
1156bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1157 if (isNoUnsignedWrap(Addr))
1158 return true;
1159
1160 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1161 // values.
1162 if (Subtarget->hasSignedScratchOffsets())
1163 return true;
1164
1165 auto LHS = Addr.getOperand(i: 0);
1166 auto RHS = Addr.getOperand(i: 1);
1167
  // If the immediate offset is negative and within a certain range, the base
  // address cannot also be negative. If the base were also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
1172 ConstantSDNode *ImmOp = nullptr;
1173 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(Val&: RHS))) {
1174 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1175 return true;
1176 }
1177
1178 return CurDAG->SignBitIsZero(Op: LHS);
1179}
1180
// Check that the address values in the SGPR/VGPR are legal for flat scratch in
// the form of: SGPR + VGPR.
1183bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1184 if (isNoUnsignedWrap(Addr))
1185 return true;
1186
1187 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1188 // values.
1189 if (Subtarget->hasSignedScratchOffsets())
1190 return true;
1191
1192 auto LHS = Addr.getOperand(i: 0);
1193 auto RHS = Addr.getOperand(i: 1);
1194 return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
1195}
1196
// Check that the address values in the SGPR/VGPR are legal for flat scratch in
// the form of: SGPR + VGPR + Imm.
1199bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1200 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1201 // values.
1202 if (AMDGPU::isGFX12Plus(*Subtarget))
1203 return true;
1204
1205 auto Base = Addr.getOperand(i: 0);
1206 auto *RHSImm = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
  // If the immediate offset is negative and within a certain range, the base
  // address cannot also be negative. If the base were also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
1211 if (isNoUnsignedWrap(Addr: Base) &&
1212 (isNoUnsignedWrap(Addr) ||
1213 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1214 return true;
1215
1216 auto LHS = Base.getOperand(i: 0);
1217 auto RHS = Base.getOperand(i: 1);
1218 return CurDAG->SignBitIsZero(Op: RHS) && CurDAG->SignBitIsZero(Op: LHS);
1219}
1220
// TODO: If the offset is too big, put the low 16 bits into the offset.
1222bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1223 SDValue &Offset0,
1224 SDValue &Offset1) const {
1225 return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: 4);
1226}
1227
1228bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1229 SDValue &Offset0,
1230 SDValue &Offset1) const {
1231 return SelectDSReadWrite2(Ptr: Addr, Base, Offset0, Offset1, Size: 8);
1232}
1233
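// Match an address for ds_read2/ds_write2-style instructions: the two 8-bit
// offset fields are encoded in units of the element size, so a byte offset is
// only usable if it is Size-aligned and the scaled value fits in 8 bits (see
// isDSOffset2Legal).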
1234bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1235 SDValue &Offset0, SDValue &Offset1,
1236 unsigned Size) const {
1237 SDLoc DL(Addr);
1238
1239 if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1240 SDValue N0 = Addr.getOperand(i: 0);
1241 SDValue N1 = Addr.getOperand(i: 1);
1242 ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
1243 unsigned OffsetValue0 = C1->getZExtValue();
1244 unsigned OffsetValue1 = OffsetValue0 + Size;
1245
1246 // (add n0, c0)
1247 if (isDSOffset2Legal(Base: N0, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1248 Base = N0;
1249 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1250 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1251 return true;
1252 }
1253 } else if (Addr.getOpcode() == ISD::SUB) {
1254 // sub C, x -> add (sub 0, x), C
1255 if (const ConstantSDNode *C =
1256 dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 0))) {
1257 unsigned OffsetValue0 = C->getZExtValue();
1258 unsigned OffsetValue1 = OffsetValue0 + Size;
1259
1260 if (isDSOffset2Legal(Base: SDValue(), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1261 SDLoc DL(Addr);
1262 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1263
        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffset2Legal. We need to emit the selected
        // node here, so this is thrown away.
1267 SDValue Sub =
1268 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1269
1270 if (isDSOffset2Legal(Base: Sub, Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1271 SmallVector<SDValue, 3> Opnds;
1272 Opnds.push_back(Elt: Zero);
1273 Opnds.push_back(Elt: Addr.getOperand(i: 1));
1274 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1275 if (Subtarget->hasAddNoCarry()) {
1276 SubOp = AMDGPU::V_SUB_U32_e64;
1277 Opnds.push_back(
1278 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1279 }
1280
1281 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1282 Opcode: SubOp, dl: DL, VT: MVT::getIntegerVT(BitWidth: Size * 8), Ops: Opnds);
1283
1284 Base = SDValue(MachineSub, 0);
1285 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1286 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1287 return true;
1288 }
1289 }
1290 }
1291 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
1292 unsigned OffsetValue0 = CAddr->getZExtValue();
1293 unsigned OffsetValue1 = OffsetValue0 + Size;
1294
1295 if (isDSOffset2Legal(Base: SDValue(), Offset0: OffsetValue0, Offset1: OffsetValue1, Size)) {
1296 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1297 MachineSDNode *MovZero =
1298 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1299 Base = SDValue(MovZero, 0);
1300 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1301 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1302 return true;
1303 }
1304 }
1305
1306 // default case
1307
1308 Base = Addr;
1309 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
1310 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
1311 return true;
1312}
1313
1314bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1315 SDValue &SOffset, SDValue &Offset,
1316 SDValue &Offen, SDValue &Idxen,
1317 SDValue &Addr64) const {
  // The subtarget prefers to use flat instructions.
  // FIXME: This should be a pattern predicate and not reach here.
1320 if (Subtarget->useFlatForGlobal())
1321 return false;
1322
1323 SDLoc DL(Addr);
1324
1325 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1326 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1327 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1328 SOffset = Subtarget->hasRestrictedSOffset()
1329 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1330 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1331
1332 ConstantSDNode *C1 = nullptr;
1333 SDValue N0 = Addr;
1334 if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1335 C1 = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
1336 if (isUInt<32>(x: C1->getZExtValue()))
1337 N0 = Addr.getOperand(i: 0);
1338 else
1339 C1 = nullptr;
1340 }
1341
1342 if (N0.getOpcode() == ISD::ADD) {
1343 // (add N2, N3) -> addr64, or
1344 // (add (add N2, N3), C1) -> addr64
1345 SDValue N2 = N0.getOperand(i: 0);
1346 SDValue N3 = N0.getOperand(i: 1);
1347 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1348
1349 if (N2->isDivergent()) {
1350 if (N3->isDivergent()) {
1351 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1352 // addr64, and construct the resource from a 0 address.
1353 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1354 VAddr = N0;
1355 } else {
1356 // N2 is divergent, N3 is not.
1357 Ptr = N3;
1358 VAddr = N2;
1359 }
1360 } else {
1361 // N2 is not divergent.
1362 Ptr = N2;
1363 VAddr = N3;
1364 }
1365 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1366 } else if (N0->isDivergent()) {
1367 // N0 is divergent. Use it as the addr64, and construct the resource from a
1368 // 0 address.
1369 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1370 VAddr = N0;
1371 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1372 } else {
1373 // N0 -> offset, or
1374 // (N0 + C1) -> offset
1375 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1376 Ptr = N0;
1377 }
1378
1379 if (!C1) {
1380 // No offset.
1381 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1382 return true;
1383 }
1384
1385 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1386 if (TII->isLegalMUBUFImmOffset(Imm: C1->getZExtValue())) {
1387 // Legal offset for instruction.
1388 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1389 return true;
1390 }
1391
1392 // Illegal offset, store it in soffset.
1393 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1394 SOffset =
1395 SDValue(CurDAG->getMachineNode(
1396 AMDGPU::S_MOV_B32, DL, MVT::i32,
1397 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1398 0);
1399 return true;
1400}
1401
1402bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1403 SDValue &VAddr, SDValue &SOffset,
1404 SDValue &Offset) const {
1405 SDValue Ptr, Offen, Idxen, Addr64;
1406
  // The addr64 bit was removed for Volcanic Islands.
1408 // FIXME: This should be a pattern predicate and not reach here
1409 if (!Subtarget->hasAddr64())
1410 return false;
1411
1412 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1413 return false;
1414
1415 ConstantSDNode *C = cast<ConstantSDNode>(Val&: Addr64);
1416 if (C->getSExtValue()) {
1417 SDLoc DL(Addr);
1418
1419 const SITargetLowering& Lowering =
1420 *static_cast<const SITargetLowering*>(getTargetLowering());
1421
1422 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(DAG&: *CurDAG, DL, Ptr), 0);
1423 return true;
1424 }
1425
1426 return false;
1427}
1428
1429std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1430 SDLoc DL(N);
1431
1432 auto *FI = dyn_cast<FrameIndexSDNode>(Val&: N);
1433 SDValue TFI =
1434 FI ? CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: 0)) : N;
1435
  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination; eliminateFrameIndex will choose the appropriate
  // frame register if need be.
1440 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1441}
1442
1443bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1444 SDValue Addr, SDValue &Rsrc,
1445 SDValue &VAddr, SDValue &SOffset,
1446 SDValue &ImmOffset) const {
1447
1448 SDLoc DL(Addr);
1449 MachineFunction &MF = CurDAG->getMachineFunction();
1450 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1451
1452 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1453
1454 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) {
1455 int64_t Imm = CAddr->getSExtValue();
1456 const int64_t NullPtr =
1457 AMDGPUTargetMachine::getNullPointerValue(AddrSpace: AMDGPUAS::PRIVATE_ADDRESS);
1458 // Don't fold null pointer.
1459 if (Imm != NullPtr) {
1460 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
1461 SDValue HighBits =
1462 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1463 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1464 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1465 VAddr = SDValue(MovHighBits, 0);
1466
1467 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1468 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1469 return true;
1470 }
1471 }
1472
1473 if (CurDAG->isBaseWithConstantOffset(Op: Addr)) {
1474 // (add n0, c1)
1475
1476 SDValue N0 = Addr.getOperand(i: 0);
1477 uint64_t C1 = Addr.getConstantOperandVal(i: 1);
1478
1479 // Offsets in vaddr must be positive if range checking is enabled.
1480 //
1481 // The total computation of vaddr + soffset + offset must not overflow. If
1482 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1483 // overflowing.
1484 //
1485 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1486 // always perform a range check. If a negative vaddr base index was used,
1487 // this would fail the range check. The overall address computation would
1488 // compute a valid address, but this doesn't happen due to the range
1489 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1490 //
1491 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1492 // MUBUF vaddr, but not on older subtargets which can only do this if the
1493 // sign bit is known 0.
1494 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1495 if (TII->isLegalMUBUFImmOffset(Imm: C1) &&
1496 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1497 CurDAG->SignBitIsZero(Op: N0))) {
1498 std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: N0);
1499 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1500 return true;
1501 }
1502 }
1503
1504 // (node)
1505 std::tie(args&: VAddr, args&: SOffset) = foldFrameIndex(N: Addr);
1506 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1507 return true;
1508}
1509
1510static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1511 if (Val.getOpcode() != ISD::CopyFromReg)
1512 return false;
1513 auto Reg = cast<RegisterSDNode>(Val: Val.getOperand(i: 1))->getReg();
1514 if (!Reg.isPhysical())
1515 return false;
1516 auto RC = TRI.getPhysRegBaseClass(Reg);
1517 return RC && TRI.isSGPRClass(RC: RC);
1518}
1519
1520bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1521 SDValue Addr,
1522 SDValue &SRsrc,
1523 SDValue &SOffset,
1524 SDValue &Offset) const {
1525 const SIRegisterInfo *TRI =
1526 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1527 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1528 MachineFunction &MF = CurDAG->getMachineFunction();
1529 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1530 SDLoc DL(Addr);
1531
1532 // CopyFromReg <sgpr>
1533 if (IsCopyFromSGPR(TRI: *TRI, Val: Addr)) {
1534 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1535 SOffset = Addr;
1536 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1537 return true;
1538 }
1539
1540 ConstantSDNode *CAddr;
1541 if (Addr.getOpcode() == ISD::ADD) {
1542 // Add (CopyFromReg <sgpr>) <constant>
1543 CAddr = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
1544 if (!CAddr || !TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue()))
1545 return false;
1546 if (!IsCopyFromSGPR(TRI: *TRI, Val: Addr.getOperand(i: 0)))
1547 return false;
1548
1549 SOffset = Addr.getOperand(i: 0);
1550 } else if ((CAddr = dyn_cast<ConstantSDNode>(Val&: Addr)) &&
1551 TII->isLegalMUBUFImmOffset(Imm: CAddr->getZExtValue())) {
1552 // <constant>
1553 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1554 } else {
1555 return false;
1556 }
1557
1558 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1559
1560 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1561 return true;
1562}
1563
1564bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1565 SDValue &SOffset, SDValue &Offset
1566 ) const {
1567 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1568 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1569
1570 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1571 return false;
1572
1573 if (!cast<ConstantSDNode>(Val&: Offen)->getSExtValue() &&
1574 !cast<ConstantSDNode>(Val&: Idxen)->getSExtValue() &&
1575 !cast<ConstantSDNode>(Val&: Addr64)->getSExtValue()) {
1576 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1577 APInt::getAllOnes(numBits: 32).getZExtValue(); // Size
1578 SDLoc DL(Addr);
1579
1580 const SITargetLowering& Lowering =
1581 *static_cast<const SITargetLowering*>(getTargetLowering());
1582
1583 SRsrc = SDValue(Lowering.buildRSRC(DAG&: *CurDAG, DL, Ptr, RsrcDword1: 0, RsrcDword2And3: Rsrc), 0);
1584 return true;
1585 }
1586 return false;
1587}
1588
1589bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1590 SDValue &SOffset) const {
1591 if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: ByteOffsetNode)) {
1592 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1593 return true;
1594 }
1595
1596 SOffset = ByteOffsetNode;
1597 return true;
1598}
1599
// Find a load or store from the corresponding pattern root.
// Roots may be build_vector, bitconvert, or their combinations.
1602static MemSDNode* findMemSDNode(SDNode *N) {
1603 N = AMDGPUTargetLowering::stripBitcast(Val: SDValue(N,0)).getNode();
1604 if (MemSDNode *MN = dyn_cast<MemSDNode>(Val: N))
1605 return MN;
1606 assert(isa<BuildVectorSDNode>(N));
1607 for (SDValue V : N->op_values())
1608 if (MemSDNode *MN =
1609 dyn_cast<MemSDNode>(Val: AMDGPUTargetLowering::stripBitcast(Val: V)))
1610 return MN;
1611 llvm_unreachable("cannot find MemSDNode in the pattern!");
1612}
1613
1614bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1615 SDValue &VAddr, SDValue &Offset,
1616 uint64_t FlatVariant) const {
1617 int64_t OffsetVal = 0;
1618
1619 unsigned AS = findMemSDNode(N)->getAddressSpace();
1620
1621 bool CanHaveFlatSegmentOffsetBug =
1622 Subtarget->hasFlatSegmentOffsetBug() &&
1623 FlatVariant == SIInstrFlags::FLAT &&
1624 (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1625
1626 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1627 SDValue N0, N1;
1628 if (isBaseWithConstantOffset64(Addr, LHS&: N0, RHS&: N1) &&
1629 (FlatVariant != SIInstrFlags::FlatScratch ||
1630 isFlatScratchBaseLegal(Addr))) {
1631 int64_t COffsetVal = cast<ConstantSDNode>(Val&: N1)->getSExtValue();
1632
1633 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1634 if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AS, FlatVariant)) {
1635 Addr = N0;
1636 OffsetVal = COffsetVal;
1637 } else {
1638 // If the offset doesn't fit, put the low bits into the offset field and
1639 // add the rest.
1640 //
1641 // For a FLAT instruction the hardware decides whether to access
1642 // global/scratch/shared memory based on the high bits of vaddr,
1643 // ignoring the offset field, so we have to ensure that when we add
1644 // remainder to vaddr it still points into the same underlying object.
1645 // The easiest way to do that is to make sure that we split the offset
1646 // into two pieces that are both >= 0 or both <= 0.
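        //
        // For illustration only (the legal immediate range depends on the
        // subtarget and flat variant): a COffsetVal of -0x2100 could split
        // into an immediate of -0x100 and a remainder of -0x2000; both pieces
        // are negative, so adding the remainder to the base cannot step past
        // the start of the underlying object.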
1647
1648 SDLoc DL(N);
1649 uint64_t RemainderOffset;
1650
1651 std::tie(args&: OffsetVal, args&: RemainderOffset) =
1652 TII->splitFlatOffset(COffsetVal, AddrSpace: AS, FlatVariant);
1653
1654 SDValue AddOffsetLo =
1655 getMaterializedScalarImm32(Val: Lo_32(Value: RemainderOffset), DL);
1656 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1657
1658 if (Addr.getValueType().getSizeInBits() == 32) {
1659 SmallVector<SDValue, 3> Opnds;
1660 Opnds.push_back(Elt: N0);
1661 Opnds.push_back(Elt: AddOffsetLo);
1662 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1663 if (Subtarget->hasAddNoCarry()) {
1664 AddOp = AMDGPU::V_ADD_U32_e64;
1665 Opnds.push_back(Elt: Clamp);
1666 }
1667 Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1668 } else {
1669 // TODO: Should this try to use a scalar add pseudo if the base address
1670 // is uniform and saddr is usable?
1671 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1672 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1673
1674 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1675 DL, MVT::i32, N0, Sub0);
1676 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1677 DL, MVT::i32, N0, Sub1);
1678
1679 SDValue AddOffsetHi =
1680 getMaterializedScalarImm32(Val: Hi_32(Value: RemainderOffset), DL);
1681
1682 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1683
1684 SDNode *Add =
1685 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1686 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1687
1688 SDNode *Addc = CurDAG->getMachineNode(
1689 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1690 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1691
1692 SDValue RegSequenceArgs[] = {
1693 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1694 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1695
1696 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1697 MVT::i64, RegSequenceArgs),
1698 0);
1699 }
1700 }
1701 }
1702 }
1703
1704 VAddr = Addr;
1705 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1706 return true;
1707}
1708
1709bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1710 SDValue &VAddr,
1711 SDValue &Offset) const {
1712 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FLAT);
1713}
1714
1715bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1716 SDValue &VAddr,
1717 SDValue &Offset) const {
1718 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, FlatVariant: SIInstrFlags::FlatGlobal);
1719}
1720
1721bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1722 SDValue &VAddr,
1723 SDValue &Offset) const {
1724 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1725 FlatVariant: SIInstrFlags::FlatScratch);
1726}
1727
1728// If this matches zero_extend i32:x, return x
1729static SDValue matchZExtFromI32(SDValue Op) {
1730 if (Op.getOpcode() != ISD::ZERO_EXTEND)
1731 return SDValue();
1732
1733 SDValue ExtSrc = Op.getOperand(i: 0);
1734 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1735}
1736
1737// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1738bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1739 SDValue Addr,
1740 SDValue &SAddr,
1741 SDValue &VOffset,
1742 SDValue &Offset) const {
1743 int64_t ImmOffset = 0;
1744
1745 // Match the immediate offset first, which canonically is moved as low as
1746 // possible.
1747
1748 SDValue LHS, RHS;
1749 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1750 int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
1751 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1752
1753 if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS,
1754 FlatVariant: SIInstrFlags::FlatGlobal)) {
1755 Addr = LHS;
1756 ImmOffset = COffsetVal;
1757 } else if (!LHS->isDivergent()) {
1758 if (COffsetVal > 0) {
1759 SDLoc SL(N);
1760 // saddr + large_offset -> saddr +
1761 // (voffset = large_offset & ~MaxOffset) +
1762 // (large_offset & MaxOffset);
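        // For illustration only (MaxOffset depends on the subtarget): with a
        // hypothetical MaxOffset of 0xfff, large_offset = 0x12345 splits into
        // voffset = 0x12000 and an immediate offset of 0x345.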
1763 int64_t SplitImmOffset, RemainderOffset;
1764 std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
1765 COffsetVal, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS, FlatVariant: SIInstrFlags::FlatGlobal);
1766
1767 if (isUInt<32>(x: RemainderOffset)) {
1768 SDNode *VMov = CurDAG->getMachineNode(
1769 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1770 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1771 VOffset = SDValue(VMov, 0);
1772 SAddr = LHS;
1773 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1774 return true;
1775 }
1776 }
1777
1778 // We are adding a 64 bit SGPR and a constant. If constant bus limit
1779 // is 1 we would need to perform 1 or 2 extra moves for each half of
1780 // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it takes fewer
      // instructions to perform VALU adds with immediates or inline literals.
1783 unsigned NumLiterals =
1784 !TII->isInlineConstant(Imm: APInt(32, COffsetVal & 0xffffffff)) +
1785 !TII->isInlineConstant(Imm: APInt(32, COffsetVal >> 32));
1786 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1787 return false;
1788 }
1789 }
1790
1791 // Match the variable offset.
1792 if (Addr.getOpcode() == ISD::ADD) {
1793 LHS = Addr.getOperand(i: 0);
1794 RHS = Addr.getOperand(i: 1);
1795
1796 if (!LHS->isDivergent()) {
1797 // add (i64 sgpr), (zero_extend (i32 vgpr))
1798 if (SDValue ZextRHS = matchZExtFromI32(Op: RHS)) {
1799 SAddr = LHS;
1800 VOffset = ZextRHS;
1801 }
1802 }
1803
1804 if (!SAddr && !RHS->isDivergent()) {
1805 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1806 if (SDValue ZextLHS = matchZExtFromI32(Op: LHS)) {
1807 SAddr = RHS;
1808 VOffset = ZextLHS;
1809 }
1810 }
1811
1812 if (SAddr) {
1813 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1814 return true;
1815 }
1816 }
1817
1818 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1819 isa<ConstantSDNode>(Val: Addr))
1820 return false;
1821
1822 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1823 // moves required to copy a 64-bit SGPR to VGPR.
1824 SAddr = Addr;
1825 SDNode *VMov =
1826 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1827 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1828 VOffset = SDValue(VMov, 0);
1829 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1830 return true;
1831}
1832
1833static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1834 if (auto FI = dyn_cast<FrameIndexSDNode>(Val&: SAddr)) {
1835 SAddr = CurDAG->getTargetFrameIndex(FI: FI->getIndex(), VT: FI->getValueType(ResNo: 0));
1836 } else if (SAddr.getOpcode() == ISD::ADD &&
1837 isa<FrameIndexSDNode>(Val: SAddr.getOperand(i: 0))) {
1838 // Materialize this into a scalar move for scalar address to avoid
1839 // readfirstlane.
1840 auto FI = cast<FrameIndexSDNode>(Val: SAddr.getOperand(i: 0));
1841 SDValue TFI = CurDAG->getTargetFrameIndex(FI: FI->getIndex(),
1842 VT: FI->getValueType(ResNo: 0));
1843 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1844 MVT::i32, TFI, SAddr.getOperand(1)),
1845 0);
1846 }
1847
1848 return SAddr;
1849}
1850
1851// Match (32-bit SGPR base) + sext(imm offset)
1852bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1853 SDValue &SAddr,
1854 SDValue &Offset) const {
1855 if (Addr->isDivergent())
1856 return false;
1857
1858 SDLoc DL(Addr);
1859
1860 int64_t COffsetVal = 0;
1861
1862 if (CurDAG->isBaseWithConstantOffset(Op: Addr) && isFlatScratchBaseLegal(Addr)) {
1863 COffsetVal = cast<ConstantSDNode>(Val: Addr.getOperand(i: 1))->getSExtValue();
1864 SAddr = Addr.getOperand(i: 0);
1865 } else {
1866 SAddr = Addr;
1867 }
1868
1869 SAddr = SelectSAddrFI(CurDAG, SAddr);
1870
1871 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1872
1873 if (!TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS,
1874 FlatVariant: SIInstrFlags::FlatScratch)) {
1875 int64_t SplitImmOffset, RemainderOffset;
1876 std::tie(args&: SplitImmOffset, args&: RemainderOffset) = TII->splitFlatOffset(
1877 COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: SIInstrFlags::FlatScratch);
1878
1879 COffsetVal = SplitImmOffset;
1880
1881 SDValue AddOffset =
1882 SAddr.getOpcode() == ISD::TargetFrameIndex
1883 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1884 : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1885 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1886 SAddr, AddOffset),
1887 0);
1888 }
1889
1890 Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1891
1892 return true;
1893}
1894
1895// Check whether the flat scratch SVS swizzle bug affects this access.
1896bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1897 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1898 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1899 return false;
1900
1901 // The bug affects the swizzling of SVS accesses if there is any carry out
1902 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1903 // voffset to (soffset + inst_offset).
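  // For example, if the maximum possible voffset has its low two bits equal
  // to 3 and the maximum possible (soffset + inst_offset) has its low two
  // bits equal to 2, then 3 + 2 carries out of bit 1, so the access is
  // conservatively treated as affected by the bug (see the check below).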
1904 KnownBits VKnown = CurDAG->computeKnownBits(Op: VAddr);
1905 KnownBits SKnown = KnownBits::computeForAddSub(
1906 /*Add=*/true, /*NSW=*/false, /*NUW=*/false,
1907 LHS: CurDAG->computeKnownBits(Op: SAddr),
1908 RHS: KnownBits::makeConstant(C: APInt(32, ImmOffset)));
1909 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1910 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1911 return (VMax & 3) + (SMax & 3) >= 4;
1912}
1913
1914bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1915 SDValue &VAddr, SDValue &SAddr,
1916 SDValue &Offset) const {
1917 int64_t ImmOffset = 0;
1918
1919 SDValue LHS, RHS;
1920 SDValue OrigAddr = Addr;
1921 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1922 int64_t COffsetVal = cast<ConstantSDNode>(Val&: RHS)->getSExtValue();
1923 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1924
1925 if (TII->isLegalFLATOffset(Offset: COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: true)) {
1926 Addr = LHS;
1927 ImmOffset = COffsetVal;
1928 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1929 SDLoc SL(N);
1930 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1931 // (large_offset & MaxOffset);
1932 int64_t SplitImmOffset, RemainderOffset;
1933 std::tie(args&: SplitImmOffset, args&: RemainderOffset)
1934 = TII->splitFlatOffset(COffsetVal, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS, FlatVariant: true);
1935
1936 if (isUInt<32>(x: RemainderOffset)) {
1937 SDNode *VMov = CurDAG->getMachineNode(
1938 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1939 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1940 VAddr = SDValue(VMov, 0);
1941 SAddr = LHS;
1942 if (!isFlatScratchBaseLegal(Addr))
1943 return false;
1944 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset: SplitImmOffset))
1945 return false;
1946 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1947 return true;
1948 }
1949 }
1950 }
1951
1952 if (Addr.getOpcode() != ISD::ADD)
1953 return false;
1954
1955 LHS = Addr.getOperand(i: 0);
1956 RHS = Addr.getOperand(i: 1);
1957
1958 if (!LHS->isDivergent() && RHS->isDivergent()) {
1959 SAddr = LHS;
1960 VAddr = RHS;
1961 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1962 SAddr = RHS;
1963 VAddr = LHS;
1964 } else {
1965 return false;
1966 }
1967
1968 if (OrigAddr != Addr) {
1969 if (!isFlatScratchBaseLegalSVImm(Addr: OrigAddr))
1970 return false;
1971 } else {
1972 if (!isFlatScratchBaseLegalSV(Addr: OrigAddr))
1973 return false;
1974 }
1975
1976 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
1977 return false;
1978 SAddr = SelectSAddrFI(CurDAG, SAddr);
1979 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1980 return true;
1981}
1982
1983// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
1984// not null) offset. If Imm32Only is true, match only 32-bit immediate
1985// offsets available on CI.
1986bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
1987 SDValue *SOffset, SDValue *Offset,
1988 bool Imm32Only, bool IsBuffer) const {
1989 assert((!SOffset || !Offset) &&
1990 "Cannot match both soffset and offset at the same time!");
1991
1992 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: ByteOffsetNode);
1993 if (!C) {
1994 if (!SOffset)
1995 return false;
1996 if (ByteOffsetNode.getValueType().isScalarInteger() &&
1997 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1998 *SOffset = ByteOffsetNode;
1999 return true;
2000 }
2001 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2002 if (ByteOffsetNode.getOperand(i: 0).getValueType().getSizeInBits() == 32) {
2003 *SOffset = ByteOffsetNode.getOperand(i: 0);
2004 return true;
2005 }
2006 }
2007 return false;
2008 }
2009
2010 SDLoc SL(ByteOffsetNode);
2011
2012 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2013 // offset for S_BUFFER instructions is unsigned.
2014 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2015 std::optional<int64_t> EncodedOffset =
2016 AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
2017 if (EncodedOffset && Offset && !Imm32Only) {
2018 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2019 return true;
2020 }
2021
2022 // SGPR and literal offsets are unsigned.
2023 if (ByteOffset < 0)
2024 return false;
2025
2026 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2027 if (EncodedOffset && Offset && Imm32Only) {
2028 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2029 return true;
2030 }
2031
2032 if (!isUInt<32>(x: ByteOffset) && !isInt<32>(x: ByteOffset))
2033 return false;
2034
2035 if (SOffset) {
2036 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2037 *SOffset = SDValue(
2038 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2039 return true;
2040 }
2041
2042 return false;
2043}
2044
2045SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2046 if (Addr.getValueType() != MVT::i32)
2047 return Addr;
2048
2049 // Zero-extend a 32-bit address.
2050 SDLoc SL(Addr);
2051
2052 const MachineFunction &MF = CurDAG->getMachineFunction();
2053 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2054 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2055 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2056
2057 const SDValue Ops[] = {
2058 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2059 Addr,
2060 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2061 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2062 0),
2063 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2064 };
2065
2066 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2067 Ops), 0);
2068}
2069
2070// Match a base and an immediate (if Offset is not null) or an SGPR (if
2071// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2072// true, match only 32-bit immediate offsets available on CI.
2073bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2074 SDValue *SOffset, SDValue *Offset,
2075 bool Imm32Only,
2076 bool IsBuffer) const {
2077 if (SOffset && Offset) {
2078 assert(!Imm32Only && !IsBuffer);
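    // Match in two steps: first peel an immediate offset off the whole
    // address, then match an SGPR offset against the remaining base, e.g. for
    // an address of the form (sbase + soffset) + imm.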
2079 SDValue B;
2080 return SelectSMRDBaseOffset(Addr, SBase&: B, SOffset: nullptr, Offset) &&
2081 SelectSMRDBaseOffset(Addr: B, SBase, SOffset, Offset: nullptr);
2082 }
2083
2084 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2085 // wraparound, because s_load instructions perform the addition in 64 bits.
2086 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2087 !Addr->getFlags().hasNoUnsignedWrap())
2088 return false;
2089
2090 SDValue N0, N1;
2091 // Extract the base and offset if possible.
2092 if (CurDAG->isBaseWithConstantOffset(Op: Addr) || Addr.getOpcode() == ISD::ADD) {
2093 N0 = Addr.getOperand(i: 0);
2094 N1 = Addr.getOperand(i: 1);
2095 } else if (getBaseWithOffsetUsingSplitOR(DAG&: *CurDAG, Addr, N0, N1)) {
2096 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2097 }
2098 if (!N0 || !N1)
2099 return false;
2100 if (SelectSMRDOffset(ByteOffsetNode: N1, SOffset, Offset, Imm32Only, IsBuffer)) {
2101 SBase = N0;
2102 return true;
2103 }
2104 if (SelectSMRDOffset(ByteOffsetNode: N0, SOffset, Offset, Imm32Only, IsBuffer)) {
2105 SBase = N1;
2106 return true;
2107 }
2108 return false;
2109}
2110
2111bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2112 SDValue *SOffset, SDValue *Offset,
2113 bool Imm32Only) const {
2114 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2115 SBase = Expand32BitAddress(Addr: SBase);
2116 return true;
2117 }
2118
2119 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2120 SBase = Expand32BitAddress(Addr);
2121 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2122 return true;
2123 }
2124
2125 return false;
2126}
2127
2128bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2129 SDValue &Offset) const {
2130 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, Offset: &Offset);
2131}
2132
2133bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2134 SDValue &Offset) const {
2135 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2136 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, Offset: &Offset,
2137 /* Imm32Only */ true);
2138}
2139
2140bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2141 SDValue &SOffset) const {
2142 return SelectSMRD(Addr, SBase, SOffset: &SOffset, /* Offset */ nullptr);
2143}
2144
2145bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2146 SDValue &SOffset,
2147 SDValue &Offset) const {
2148 return SelectSMRD(Addr, SBase, SOffset: &SOffset, Offset: &Offset);
2149}
2150
2151bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2152 return SelectSMRDOffset(ByteOffsetNode: N, /* SOffset */ nullptr, Offset: &Offset,
2153 /* Imm32Only */ false, /* IsBuffer */ true);
2154}
2155
2156bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2157 SDValue &Offset) const {
2158 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2159 return SelectSMRDOffset(ByteOffsetNode: N, /* SOffset */ nullptr, Offset: &Offset,
2160 /* Imm32Only */ true, /* IsBuffer */ true);
2161}
2162
2163bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2164 SDValue &Offset) const {
2165 // Match the (soffset + offset) pair as a 32-bit register base and
2166 // an immediate offset.
2167 return N.getValueType() == MVT::i32 &&
2168 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2169 &Offset, /* Imm32Only */ false,
2170 /* IsBuffer */ true);
2171}
2172
2173bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2174 SDValue &Base,
2175 SDValue &Offset) const {
2176 SDLoc DL(Index);
2177
2178 if (CurDAG->isBaseWithConstantOffset(Op: Index)) {
2179 SDValue N0 = Index.getOperand(i: 0);
2180 SDValue N1 = Index.getOperand(i: 1);
2181 ConstantSDNode *C1 = cast<ConstantSDNode>(Val&: N1);
2182
2183 // (add n0, c0)
2184 // Don't peel off the offset (c0) if doing so could possibly lead
2185 // the base (n0) to be negative.
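    // For example, for (add n0, 4) where n0 might be -1, the sum 3 would be
    // a valid index while the peeled base n0 = -1 would not be.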
    // (or n0, |c0|) can never change the sign given isBaseWithConstantOffset.
2187 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(Op: N0) ||
2188 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2189 Base = N0;
2190 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2191 return true;
2192 }
2193 }
2194
2195 if (isa<ConstantSDNode>(Val: Index))
2196 return false;
2197
2198 Base = Index;
2199 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2200 return true;
2201}
2202
2203SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2204 SDValue Val, uint32_t Offset,
2205 uint32_t Width) {
2206 if (Val->isDivergent()) {
2207 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2208 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2209 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2210
2211 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2212 }
2213 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  // Transformation function: pack the offset and width of a BFE into the
  // format expected by S_BFE_I32 / S_BFE_U32. In the second source, bits
  // [5:0] contain the offset and bits [22:16] the width.
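  // For example, Offset = 8 and Width = 5 pack to 0x00050008.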
2217 uint32_t PackedVal = Offset | (Width << 16);
2218 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2219
2220 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2221}
2222
2223void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2224 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2225 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2226 // Predicate: 0 < b <= c < 32
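  //
  // For example, ((x << 8) srl 24) extracts bits [23:16] of x and becomes
  // BFE_U32 x, 16, 8.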
2227
2228 const SDValue &Shl = N->getOperand(Num: 0);
2229 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Val: Shl->getOperand(Num: 1));
2230 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
2231
2232 if (B && C) {
2233 uint32_t BVal = B->getZExtValue();
2234 uint32_t CVal = C->getZExtValue();
2235
2236 if (0 < BVal && BVal <= CVal && CVal < 32) {
2237 bool Signed = N->getOpcode() == ISD::SRA;
2238 ReplaceNode(F: N, T: getBFE32(IsSigned: Signed, DL: SDLoc(N), Val: Shl.getOperand(i: 0), Offset: CVal - BVal,
2239 Width: 32 - CVal));
2240 return;
2241 }
2242 }
2243 SelectCode(N);
2244}
2245
2246void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2247 switch (N->getOpcode()) {
2248 case ISD::AND:
2249 if (N->getOperand(Num: 0).getOpcode() == ISD::SRL) {
2250 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2251 // Predicate: isMask(mask)
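      // For example, ((x srl 4) & 0xff) becomes BFE_U32 x, 4, 8.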
2252 const SDValue &Srl = N->getOperand(Num: 0);
2253 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: Srl.getOperand(i: 1));
2254 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
2255
2256 if (Shift && Mask) {
2257 uint32_t ShiftVal = Shift->getZExtValue();
2258 uint32_t MaskVal = Mask->getZExtValue();
2259
2260 if (isMask_32(Value: MaskVal)) {
2261 uint32_t WidthVal = llvm::popcount(Value: MaskVal);
2262 ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc(N), Val: Srl.getOperand(i: 0), Offset: ShiftVal,
2263 Width: WidthVal));
2264 return;
2265 }
2266 }
2267 }
2268 break;
2269 case ISD::SRL:
2270 if (N->getOperand(Num: 0).getOpcode() == ISD::AND) {
2271 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2272 // Predicate: isMask(mask >> b)
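      // For example, ((x & 0xff0) srl 4) becomes BFE_U32 x, 4, 8, since
      // 0xff0 >> 4 == 0xff and popcount(0xff) == 8.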
2273 const SDValue &And = N->getOperand(Num: 0);
2274 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
2275 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));
2276
2277 if (Shift && Mask) {
2278 uint32_t ShiftVal = Shift->getZExtValue();
2279 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2280
2281 if (isMask_32(Value: MaskVal)) {
2282 uint32_t WidthVal = llvm::popcount(Value: MaskVal);
2283 ReplaceNode(F: N, T: getBFE32(IsSigned: false, DL: SDLoc(N), Val: And.getOperand(i: 0), Offset: ShiftVal,
2284 Width: WidthVal));
2285 return;
2286 }
2287 }
2288 } else if (N->getOperand(Num: 0).getOpcode() == ISD::SHL) {
2289 SelectS_BFEFromShifts(N);
2290 return;
2291 }
2292 break;
2293 case ISD::SRA:
2294 if (N->getOperand(Num: 0).getOpcode() == ISD::SHL) {
2295 SelectS_BFEFromShifts(N);
2296 return;
2297 }
2298 break;
2299
2300 case ISD::SIGN_EXTEND_INREG: {
2301 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2302 SDValue Src = N->getOperand(Num: 0);
2303 if (Src.getOpcode() != ISD::SRL)
2304 break;
2305
2306 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
2307 if (!Amt)
2308 break;
2309
2310 unsigned Width = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT().getSizeInBits();
2311 ReplaceNode(F: N, T: getBFE32(IsSigned: true, DL: SDLoc(N), Val: Src.getOperand(i: 0),
2312 Offset: Amt->getZExtValue(), Width));
2313 return;
2314 }
2315 }
2316
2317 SelectCode(N);
2318}
2319
2320bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2321 assert(N->getOpcode() == ISD::BRCOND);
2322 if (!N->hasOneUse())
2323 return false;
2324
2325 SDValue Cond = N->getOperand(Num: 1);
2326 if (Cond.getOpcode() == ISD::CopyToReg)
2327 Cond = Cond.getOperand(i: 2);
2328
2329 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2330 return false;
2331
2332 MVT VT = Cond.getOperand(i: 0).getSimpleValueType();
2333 if (VT == MVT::i32)
2334 return true;
2335
2336 if (VT == MVT::i64) {
2337 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2338
2339 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
2340 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2341 }
2342
2343 return false;
2344}
2345
2346static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2347 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2348 // Special case for amdgcn.ballot:
2349 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2350 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2351 // =>
2352 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2353 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2354 // Cond becomes a i(WaveSize) full mask value.
  // Note that ballot doesn't use the SETEQ condition, but it's easy to support
  // it here for completeness, so in this case Negate is set to true on return.
2357 auto VCMP_CC = cast<CondCodeSDNode>(Val: VCMP.getOperand(i: 2))->get();
2358 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2359 isNullConstant(V: VCMP.getOperand(i: 1))) {
2360
2361 auto Cond = VCMP.getOperand(i: 0);
2362 if (ISD::isExtOpcode(Opcode: Cond->getOpcode())) // Skip extension.
2363 Cond = Cond.getOperand(i: 0);
2364
2365 if (isBoolSGPR(V: Cond)) {
2366 Negate = VCMP_CC == ISD::SETEQ;
2367 return Cond;
2368 }
2369 }
2370 return SDValue();
2371}
2372
2373void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2374 SDValue Cond = N->getOperand(Num: 1);
2375
2376 if (Cond.isUndef()) {
2377 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2378 N->getOperand(2), N->getOperand(0));
2379 return;
2380 }
2381
2382 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2383 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2384
2385 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2386 bool AndExec = !UseSCCBr;
2387 bool Negate = false;
2388
2389 if (Cond.getOpcode() == ISD::SETCC &&
2390 Cond->getOperand(Num: 0)->getOpcode() == AMDGPUISD::SETCC) {
2391 SDValue VCMP = Cond->getOperand(Num: 0);
2392 auto CC = cast<CondCodeSDNode>(Val: Cond->getOperand(Num: 2))->get();
2393 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2394 isNullConstant(V: Cond->getOperand(Num: 1)) &&
2395 // We may encounter ballot.i64 in wave32 mode on -O0.
2396 VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
2397 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2398 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2399 // BRCOND i1 %C, %BB
2400 // =>
2401 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2402 // VCC = COPY i(WaveSize) %VCMP
2403 // S_CBRANCH_VCCNZ/VCCZ %BB
2404 Negate = CC == ISD::SETEQ;
2405 bool NegatedBallot = false;
2406 if (auto BallotCond = combineBallotPattern(VCMP, Negate&: NegatedBallot)) {
2407 Cond = BallotCond;
2408 UseSCCBr = !BallotCond->isDivergent();
2409 Negate = Negate ^ NegatedBallot;
2410 } else {
2411 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2412 // selected as V_CMP, but this may change for uniform condition.
2413 Cond = VCMP;
2414 UseSCCBr = false;
2415 }
2416 }
    // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
    // V_CMPs resulting from a ballot, or the ballot has a uniform condition
    // and SCC is used.
2420 AndExec = false;
2421 }
2422
2423 unsigned BrOp =
2424 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2425 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2426 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2427 SDLoc SL(N);
2428
2429 if (AndExec) {
2430 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2431 // analyzed what generates the vcc value, so we do not know whether vcc
2432 // bits for disabled lanes are 0. Thus we need to mask out bits for
2433 // disabled lanes.
2434 //
    // (For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU, which inserts the S_AND.)
2438 //
2439 // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when it is unnecessary. But it would be better to add a
    // separate pass after SIFixSGPRCopies to do the unnecessary S_AND
    // removal, so it catches both cases.
2443 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2444 : AMDGPU::S_AND_B64,
2445 SL, MVT::i1,
2446 CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2447 : AMDGPU::EXEC,
2448 MVT::i1),
2449 Cond),
2450 0);
2451 }
2452
2453 SDValue VCC = CurDAG->getCopyToReg(Chain: N->getOperand(Num: 0), dl: SL, Reg: CondReg, N: Cond);
2454 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2455 N->getOperand(2), // Basic Block
2456 VCC.getValue(0));
2457}
2458
2459void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2460 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2461 !N->isDivergent()) {
2462 SDValue Src = N->getOperand(Num: 0);
2463 if (Src.getValueType() == MVT::f16) {
2464 if (isExtractHiElt(In: Src, Out&: Src)) {
2465 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2466 {Src});
2467 return;
2468 }
2469 }
2470 }
2471
2472 SelectCode(N);
2473}
2474
2475void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2476 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2477 // be copied to an SGPR with readfirstlane.
2478 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2479 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2480
2481 SDValue Chain = N->getOperand(Num: 0);
2482 SDValue Ptr = N->getOperand(Num: 2);
2483 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
2484 MachineMemOperand *MMO = M->getMemOperand();
2485 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2486
2487 SDValue Offset;
2488 if (CurDAG->isBaseWithConstantOffset(Op: Ptr)) {
2489 SDValue PtrBase = Ptr.getOperand(i: 0);
2490 SDValue PtrOffset = Ptr.getOperand(i: 1);
2491
2492 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2493 if (isDSOffsetLegal(Base: PtrBase, Offset: OffsetVal.getZExtValue())) {
2494 N = glueCopyToM0(N, Val: PtrBase);
2495 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2496 }
2497 }
2498
2499 if (!Offset) {
2500 N = glueCopyToM0(N, Val: Ptr);
2501 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2502 }
2503
2504 SDValue Ops[] = {
2505 Offset,
2506 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2507 Chain,
2508 N->getOperand(N->getNumOperands() - 1) // New glue
2509 };
2510
2511 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2512 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
2513}
2514
2515// We need to handle this here because tablegen doesn't support matching
2516// instructions with multiple outputs.
2517void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2518 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2519 SDValue Ops[] = {N->getOperand(Num: 2), N->getOperand(Num: 3), N->getOperand(Num: 4),
2520 N->getOperand(Num: 5), N->getOperand(Num: 0)};
2521
2522 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
2523 MachineMemOperand *MMO = M->getMemOperand();
2524 SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
2525 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
2526}
2527
2528static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2529 switch (IntrID) {
2530 case Intrinsic::amdgcn_ds_gws_init:
2531 return AMDGPU::DS_GWS_INIT;
2532 case Intrinsic::amdgcn_ds_gws_barrier:
2533 return AMDGPU::DS_GWS_BARRIER;
2534 case Intrinsic::amdgcn_ds_gws_sema_v:
2535 return AMDGPU::DS_GWS_SEMA_V;
2536 case Intrinsic::amdgcn_ds_gws_sema_br:
2537 return AMDGPU::DS_GWS_SEMA_BR;
2538 case Intrinsic::amdgcn_ds_gws_sema_p:
2539 return AMDGPU::DS_GWS_SEMA_P;
2540 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2541 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2542 default:
2543 llvm_unreachable("not a gws intrinsic");
2544 }
2545}
2546
2547void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2548 if (!Subtarget->hasGWS() ||
2549 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2550 !Subtarget->hasGWSSemaReleaseAll())) {
2551 // Let this error.
2552 SelectCode(N);
2553 return;
2554 }
2555
2556 // Chain, intrinsic ID, vsrc, offset
2557 const bool HasVSrc = N->getNumOperands() == 4;
2558 assert(HasVSrc || N->getNumOperands() == 3);
2559
2560 SDLoc SL(N);
2561 SDValue BaseOffset = N->getOperand(Num: HasVSrc ? 3 : 2);
2562 int ImmOffset = 0;
2563 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Val: N);
2564 MachineMemOperand *MMO = M->getMemOperand();
2565
  // Don't worry if the offset ends up in a VGPR. Only one lane's value will
  // take effect, so SIFixSGPRCopies will validly insert a readfirstlane.
2568
2569 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2570 // offset field) % 64. Some versions of the programming guide omit the m0
2571 // part, or claim it's from offset 0.
2572 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(Val&: BaseOffset)) {
2573 // If we have a constant offset, try to use the 0 in m0 as the base.
2574 // TODO: Look into changing the default m0 initialization value. If the
2575 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2576 // the immediate offset.
2577 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2578 ImmOffset = ConstOffset->getZExtValue();
2579 } else {
2580 if (CurDAG->isBaseWithConstantOffset(Op: BaseOffset)) {
2581 ImmOffset = BaseOffset.getConstantOperandVal(i: 1);
2582 BaseOffset = BaseOffset.getOperand(i: 0);
2583 }
2584
2585 // Prefer to do the shift in an SGPR since it should be possible to use m0
2586 // as the result directly. If it's already an SGPR, it will be eliminated
2587 // later.
2588 SDNode *SGPROffset
2589 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2590 BaseOffset);
2591 // Shift to offset in m0
2592 SDNode *M0Base
2593 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2594 SDValue(SGPROffset, 0),
2595 CurDAG->getTargetConstant(16, SL, MVT::i32));
2596 glueCopyToM0(N, Val: SDValue(M0Base, 0));
2597 }
2598
2599 SDValue Chain = N->getOperand(Num: 0);
2600 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2601
2602 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2603 SmallVector<SDValue, 5> Ops;
2604 if (HasVSrc)
2605 Ops.push_back(Elt: N->getOperand(Num: 2));
2606 Ops.push_back(Elt: OffsetField);
2607 Ops.push_back(Elt: Chain);
2608
2609 SDNode *Selected = CurDAG->SelectNodeTo(N, MachineOpc: Opc, VTs: N->getVTList(), Ops);
2610 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Selected), NewMemRefs: {MMO});
2611}
2612
2613void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2614 if (Subtarget->getLDSBankCount() != 16) {
2615 // This is a single instruction with a pattern.
2616 SelectCode(N);
2617 return;
2618 }
2619
2620 SDLoc DL(N);
2621
2622 // This requires 2 instructions. It is possible to write a pattern to support
2623 // this, but the generated isel emitter doesn't correctly deal with multiple
2624 // output instructions using the same physical register input. The copy to m0
2625 // is incorrectly placed before the second instruction.
2626 //
2627 // TODO: Match source modifiers.
2628 //
2629 // def : Pat <
2630 // (int_amdgcn_interp_p1_f16
2631 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2632 // (i32 timm:$attrchan), (i32 timm:$attr),
2633 // (i1 timm:$high), M0),
2634 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2635 // timm:$attrchan, 0,
2636 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2637 // let Predicates = [has16BankLDS];
2638 // }
2639
2640 // 16 bank LDS
2641 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2642 N->getOperand(5), SDValue());
2643
2644 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2645
2646 SDNode *InterpMov =
2647 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2648 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2649 N->getOperand(3), // Attr
2650 N->getOperand(2), // Attrchan
2651 ToM0.getValue(1) // In glue
2652 });
2653
2654 SDNode *InterpP1LV =
2655 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2656 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2657 N->getOperand(1), // Src0
2658 N->getOperand(3), // Attr
2659 N->getOperand(2), // Attrchan
2660 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2661 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2662 N->getOperand(4), // high
2663 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2664 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2665 SDValue(InterpMov, 1)
2666 });
2667
2668 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: SDValue(InterpP1LV, 0));
2669}
2670
2671void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2672 unsigned IntrID = N->getConstantOperandVal(Num: 1);
2673 switch (IntrID) {
2674 case Intrinsic::amdgcn_ds_append:
2675 case Intrinsic::amdgcn_ds_consume: {
2676 if (N->getValueType(0) != MVT::i32)
2677 break;
2678 SelectDSAppendConsume(N, IntrID);
2679 return;
2680 }
2681 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2682 SelectDSBvhStackIntrinsic(N);
2683 return;
2684 }
2685
2686 SelectCode(N);
2687}
2688
2689void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2690 unsigned IntrID = N->getConstantOperandVal(Num: 0);
2691 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2692 SDNode *ConvGlueNode = N->getGluedNode();
2693 if (ConvGlueNode) {
2694 // FIXME: Possibly iterate over multiple glue nodes?
2695 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2696 ConvGlueNode = ConvGlueNode->getOperand(Num: 0).getNode();
2697 ConvGlueNode =
2698 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2699 MVT::Glue, SDValue(ConvGlueNode, 0));
2700 } else {
2701 ConvGlueNode = nullptr;
2702 }
2703 switch (IntrID) {
2704 case Intrinsic::amdgcn_wqm:
2705 Opcode = AMDGPU::WQM;
2706 break;
2707 case Intrinsic::amdgcn_softwqm:
2708 Opcode = AMDGPU::SOFT_WQM;
2709 break;
2710 case Intrinsic::amdgcn_wwm:
2711 case Intrinsic::amdgcn_strict_wwm:
2712 Opcode = AMDGPU::STRICT_WWM;
2713 break;
2714 case Intrinsic::amdgcn_strict_wqm:
2715 Opcode = AMDGPU::STRICT_WQM;
2716 break;
2717 case Intrinsic::amdgcn_interp_p1_f16:
2718 SelectInterpP1F16(N);
2719 return;
2720 case Intrinsic::amdgcn_inverse_ballot:
2721 switch (N->getOperand(Num: 1).getValueSizeInBits()) {
2722 case 32:
2723 Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
2724 break;
2725 case 64:
2726 Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
2727 break;
2728 default:
2729 llvm_unreachable("Unsupported size for inverse ballot mask.");
2730 }
2731 break;
2732 default:
2733 SelectCode(N);
2734 break;
2735 }
2736
2737 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2738 SDValue Src = N->getOperand(Num: 1);
2739 CurDAG->SelectNodeTo(N, MachineOpc: Opcode, VTs: N->getVTList(), Ops: {Src});
2740 }
2741
2742 if (ConvGlueNode) {
2743 SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
2744 NewOps.push_back(Elt: SDValue(ConvGlueNode, 0));
2745 CurDAG->MorphNodeTo(N, Opc: N->getOpcode(), VTs: N->getVTList(), Ops: NewOps);
2746 }
2747}
2748
2749void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2750 unsigned IntrID = N->getConstantOperandVal(Num: 1);
2751 switch (IntrID) {
2752 case Intrinsic::amdgcn_ds_gws_init:
2753 case Intrinsic::amdgcn_ds_gws_barrier:
2754 case Intrinsic::amdgcn_ds_gws_sema_v:
2755 case Intrinsic::amdgcn_ds_gws_sema_br:
2756 case Intrinsic::amdgcn_ds_gws_sema_p:
2757 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2758 SelectDS_GWS(N, IntrID);
2759 return;
2760 default:
2761 break;
2762 }
2763
2764 SelectCode(N);
2765}
2766
2767void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2768 SDValue Log2WaveSize =
2769 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2770 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2771 {N->getOperand(0), Log2WaveSize});
2772}
2773
2774void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2775 SDValue SrcVal = N->getOperand(Num: 1);
2776 if (SrcVal.getValueType() != MVT::i32) {
2777 SelectCode(N); // Emit default error
2778 return;
2779 }
2780
2781 SDValue CopyVal;
2782 Register SP = TLI->getStackPointerRegisterToSaveRestore();
2783 SDLoc SL(N);
2784
2785 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2786 CopyVal = SrcVal.getOperand(i: 0);
2787 } else {
2788 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2789 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2790
2791 if (N->isDivergent()) {
2792 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2793 MVT::i32, SrcVal),
2794 0);
2795 }
2796
2797 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2798 {SrcVal, Log2WaveSize}),
2799 0);
2800 }
2801
2802 SDValue CopyToSP = CurDAG->getCopyToReg(Chain: N->getOperand(Num: 0), dl: SL, Reg: SP, N: CopyVal);
2803 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: CopyToSP);
2804}
2805
2806bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2807 unsigned &Mods,
2808 bool IsCanonicalizing,
2809 bool AllowAbs) const {
2810 Mods = SISrcMods::NONE;
2811 Src = In;
2812
2813 if (Src.getOpcode() == ISD::FNEG) {
2814 Mods |= SISrcMods::NEG;
2815 Src = Src.getOperand(i: 0);
2816 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2817 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2818 // denormal mode, but we're implicitly canonicalizing in a source operand.
2819 auto *LHS = dyn_cast<ConstantFPSDNode>(Val: Src.getOperand(i: 0));
2820 if (LHS && LHS->isZero()) {
2821 Mods |= SISrcMods::NEG;
2822 Src = Src.getOperand(i: 1);
2823 }
2824 }
2825
2826 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2827 Mods |= SISrcMods::ABS;
2828 Src = Src.getOperand(i: 0);
2829 }
2830
2831 return true;
2832}
2833
2834bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2835 SDValue &SrcMods) const {
2836 unsigned Mods;
2837 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2838 /*AllowAbs=*/true)) {
2839 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2840 return true;
2841 }
2842
2843 return false;
2844}
2845
2846bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2847 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2848 unsigned Mods;
2849 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2850 /*AllowAbs=*/true)) {
2851 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2852 return true;
2853 }
2854
2855 return false;
2856}
2857
2858bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2859 SDValue &SrcMods) const {
2860 unsigned Mods;
2861 if (SelectVOP3ModsImpl(In, Src, Mods,
2862 /*IsCanonicalizing=*/true,
2863 /*AllowAbs=*/false)) {
2864 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2865 return true;
2866 }
2867
2868 return false;
2869}
2870
2871bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2872 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2873 return false;
2874
2875 Src = In;
2876 return true;
2877}
2878
2879bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2880 SDValue &SrcMods,
2881 bool OpSel) const {
2882 unsigned Mods;
2883 if (SelectVOP3ModsImpl(In, Src, Mods,
2884 /*IsCanonicalizing=*/true,
2885 /*AllowAbs=*/false)) {
2886 if (OpSel)
2887 Mods |= SISrcMods::OP_SEL_0;
2888 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2889 return true;
2890 }
2891
2892 return false;
2893}
2894
2895bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2896 SDValue &SrcMods) const {
2897 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2898}
2899
2900bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2901 SDValue &SrcMods) const {
2902 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2903}
2904
2905bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2906 SDValue &SrcMods, SDValue &Clamp,
2907 SDValue &Omod) const {
2908 SDLoc DL(In);
2909 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2910 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2911
2912 return SelectVOP3Mods(In, Src, SrcMods);
2913}
2914
2915bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2916 SDValue &SrcMods, SDValue &Clamp,
2917 SDValue &Omod) const {
2918 SDLoc DL(In);
2919 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2920 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2921
2922 return SelectVOP3BMods(In, Src, SrcMods);
2923}
2924
2925bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2926 SDValue &Clamp, SDValue &Omod) const {
2927 Src = In;
2928
2929 SDLoc DL(In);
2930 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2931 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2932
2933 return true;
2934}
2935
2936bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2937 SDValue &SrcMods, bool IsDOT) const {
2938 unsigned Mods = SISrcMods::NONE;
2939 Src = In;
2940
2941 // TODO: Handle G_FSUB 0 as fneg
2942 if (Src.getOpcode() == ISD::FNEG) {
2943 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2944 Src = Src.getOperand(i: 0);
2945 }
2946
2947 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
2948 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2949 unsigned VecMods = Mods;
2950
2951 SDValue Lo = stripBitcast(Val: Src.getOperand(i: 0));
2952 SDValue Hi = stripBitcast(Val: Src.getOperand(i: 1));
2953
2954 if (Lo.getOpcode() == ISD::FNEG) {
2955 Lo = stripBitcast(Val: Lo.getOperand(i: 0));
2956 Mods ^= SISrcMods::NEG;
2957 }
2958
2959 if (Hi.getOpcode() == ISD::FNEG) {
2960 Hi = stripBitcast(Val: Hi.getOperand(i: 0));
2961 Mods ^= SISrcMods::NEG_HI;
2962 }
2963
2964 if (isExtractHiElt(In: Lo, Out&: Lo))
2965 Mods |= SISrcMods::OP_SEL_0;
2966
2967 if (isExtractHiElt(In: Hi, Out&: Hi))
2968 Mods |= SISrcMods::OP_SEL_1;
2969
2970 unsigned VecSize = Src.getValueSizeInBits();
2971 Lo = stripExtractLoElt(In: Lo);
2972 Hi = stripExtractLoElt(In: Hi);
2973
2974 if (Lo.getValueSizeInBits() > VecSize) {
2975 Lo = CurDAG->getTargetExtractSubreg(
2976 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2977 MVT::getIntegerVT(VecSize), Lo);
2978 }
2979
2980 if (Hi.getValueSizeInBits() > VecSize) {
2981 Hi = CurDAG->getTargetExtractSubreg(
2982 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2983 MVT::getIntegerVT(VecSize), Hi);
2984 }
2985
2986 assert(Lo.getValueSizeInBits() <= VecSize &&
2987 Hi.getValueSizeInBits() <= VecSize);
2988
2989 if (Lo == Hi && !isInlineImmediate(N: Lo.getNode())) {
2990 // Really a scalar input. Just select from the low half of the register to
2991 // avoid packing.
2992
2993 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2994 Src = Lo;
2995 } else {
2996 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2997
2998 SDLoc SL(In);
2999 SDValue Undef = SDValue(
3000 CurDAG->getMachineNode(Opcode: TargetOpcode::IMPLICIT_DEF, dl: SL,
3001 VT: Lo.getValueType()), 0);
3002 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3003 : AMDGPU::SReg_64RegClassID;
3004 const SDValue Ops[] = {
3005 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3006 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3007 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3008
3009 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3010 Src.getValueType(), Ops), 0);
3011 }
3012 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3013 return true;
3014 }
3015
3016 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Val: Lo)) {
3017 uint64_t Lit = cast<ConstantFPSDNode>(Val&: Lo)->getValueAPF()
3018 .bitcastToAPInt().getZExtValue();
3019 if (AMDGPU::isInlinableLiteral32(Literal: Lit, HasInv2Pi: Subtarget->hasInv2PiInlineImm())) {
3020 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3021 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3022 return true;
3023 }
3024 }
3025
3026 Mods = VecMods;
3027 }
3028
3029 // Packed instructions do not have abs modifiers.
3030 Mods |= SISrcMods::OP_SEL_1;
3031
3032 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3033 return true;
3034}
3035
3036bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3037 SDValue &SrcMods) const {
3038 return SelectVOP3PMods(In, Src, SrcMods, IsDOT: true);
3039}
3040
3041bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3042 const ConstantSDNode *C = cast<ConstantSDNode>(Val&: In);
  // A literal i1 value set in the intrinsic represents the SrcMods for the
  // next operand: 1 promotes packed values to signed, 0 treats them as
  // unsigned.
3045 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3046
3047 unsigned Mods = SISrcMods::OP_SEL_1;
3048 unsigned SrcSign = C->getZExtValue();
3049 if (SrcSign == 1)
3050 Mods ^= SISrcMods::NEG;
3051
3052 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3053 return true;
3054}
3055
3056bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3057 SDValue &Src) const {
3058 const ConstantSDNode *C = cast<ConstantSDNode>(Val&: In);
3059 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3060
3061 unsigned Mods = SISrcMods::OP_SEL_1;
3062 unsigned SrcVal = C->getZExtValue();
3063 if (SrcVal == 1)
3064 Mods |= SISrcMods::OP_SEL_0;
3065
3066 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3067 return true;
3068}
3069
3070static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3071 llvm::SelectionDAG *CurDAG,
3072 const SDLoc &DL) {
3073 unsigned DstRegClass;
3074 EVT DstTy;
3075 switch (Elts.size()) {
3076 case 8:
3077 DstRegClass = AMDGPU::VReg_256RegClassID;
3078 DstTy = MVT::v8i32;
3079 break;
3080 case 4:
3081 DstRegClass = AMDGPU::VReg_128RegClassID;
3082 DstTy = MVT::v4i32;
3083 break;
3084 case 2:
3085 DstRegClass = AMDGPU::VReg_64RegClassID;
3086 DstTy = MVT::v2i32;
3087 break;
3088 default:
3089 llvm_unreachable("unhandled Reg sequence size");
3090 }
3091
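  // REG_SEQUENCE operands are the destination register class ID followed by
  // (value, subregister index) pairs, e.g. {RC, Elt0, sub0, Elt1, sub1, ...}.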
3092 SmallVector<SDValue, 17> Ops;
3093 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3094 for (unsigned i = 0; i < Elts.size(); ++i) {
3095 Ops.push_back(Elt: Elts[i]);
3096 Ops.push_back(CurDAG->getTargetConstant(
3097 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3098 }
3099 return CurDAG->getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT: DstTy, Ops);
3100}
3101
3102static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3103 llvm::SelectionDAG *CurDAG,
3104 const SDLoc &DL) {
3105 SmallVector<SDValue, 8> PackedElts;
3106 assert("unhandled Reg sequence size" &&
3107 (Elts.size() == 8 || Elts.size() == 16));
3108
  // Pack 16-bit elements in pairs into a 32-bit register. If both elements
  // were unpacked from the same 32-bit source, reuse it; otherwise pack them
  // using v_perm.
3111 for (unsigned i = 0; i < Elts.size(); i += 2) {
3112 SDValue LoSrc = stripExtractLoElt(In: stripBitcast(Val: Elts[i]));
3113 SDValue HiSrc;
3114 if (isExtractHiElt(In: Elts[i + 1], Out&: HiSrc) && LoSrc == HiSrc) {
3115 PackedElts.push_back(Elt: HiSrc);
3116 } else {
3117 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3118 MachineSDNode *Packed =
3119 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3120 {Elts[i + 1], Elts[i], PackLoLo});
3121 PackedElts.push_back(Elt: SDValue(Packed, 0));
3122 }
3123 }
3124
3125 return buildRegSequence32(Elts&: PackedElts, CurDAG, DL);
3126}
3127
3128static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3129 llvm::SelectionDAG *CurDAG,
3130 const SDLoc &DL, unsigned ElementSize) {
3131 if (ElementSize == 16)
3132 return buildRegSequence16(Elts, CurDAG, DL);
3133 if (ElementSize == 32)
3134 return buildRegSequence32(Elts, CurDAG, DL);
3135 llvm_unreachable("Unhandled element size");
3136}
3137
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
                                 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
                                 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
                                 unsigned ElementSize) {
  if (ModOpcode == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    // Check if all elements also have abs modifier
    SmallVector<SDValue, 8> NegAbsElts;
    for (auto El : Elts) {
      if (El.getOpcode() != ISD::FABS)
        break;
      NegAbsElts.push_back(El->getOperand(0));
    }
    if (Elts.size() != NegAbsElts.size()) {
      // Neg
      Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
    } else {
      // Neg and Abs
      Mods |= SISrcMods::NEG_HI;
      Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
    }
  } else {
    assert(ModOpcode == ISD::FABS);
    // Abs
    Mods |= SISrcMods::NEG_HI;
    Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
  }
}

// Check all f16 elements for modifiers while looking through b32 and v2b16
// build vectors; stop if an element does not satisfy ModifierCheck.
static void
checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
                              std::function<bool(SDValue)> ModifierCheck) {
  for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
    if (auto *F16Pair =
            dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
      for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
        SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
        if (!ModifierCheck(ElF16))
          break;
      }
    }
  }
}

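// Select a packed f16 WMMA operand, folding a uniform fneg on all f16 (or
// v2f16) elements of the build vector into the NEG/NEG_HI source modifiers.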
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
                                              SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;

  // Mods are on f16 elements.
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsF16;

    checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
      if (Element.getOpcode() != ISD::FNEG)
        return false;
      EltsF16.push_back(Element.getOperand(0));
      return true;
    });

    // All elements have neg modifier
    if (BV->getNumOperands() * 2 == EltsF16.size()) {
      Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  // Mods are on v2f16 elements.
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsV2F16;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
      // Only the fneg modifier is folded in this variant.
      if (ElV2f16.getOpcode() != ISD::FNEG)
        break;
      EltsV2F16.push_back(ElV2f16.getOperand(0));
    }

    // All pairs of elements have neg modifier
    if (BV->getNumOperands() == EltsV2F16.size()) {
      Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

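// Select a packed f16 WMMA operand, folding a uniform fneg or fabs on all f16
// (or v2f16) elements of the build vector into the NEG/NEG_HI source
// modifiers.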
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned ModOpcode;

  // Mods are on f16 elements.
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsF16;
    checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
      // Based on the first element, decide which modifier we match, neg or
      // abs.
      if (EltsF16.empty())
        ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElF16.getOpcode() != ModOpcode)
        return false;
      EltsF16.push_back(ElF16.getOperand(0));
      return true;
    });

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() * 2 == EltsF16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
                           16);
  }

  // Mods are on v2f16 elements.
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsV2F16;

    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
      // Based on the first element, decide which modifier we match, neg or
      // abs.
      if (EltsV2F16.empty())
        ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElV2f16->getOpcode() != ModOpcode)
        break;
      EltsV2F16.push_back(ElV2f16->getOperand(0));
    }

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() == EltsV2F16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
                           32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

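// Select an f32 WMMA operand, folding a uniform fneg or fabs on all elements
// of the build vector into the NEG/NEG_HI source modifiers.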
bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<SDValue, 8> EltsF32;

  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    assert(BV->getNumOperands() > 0);
    // Based on the first element, decide which modifier we match, neg or abs.
    SDValue ElF32 = stripBitcast(BV->getOperand(0));
    unsigned ModOpcode =
        (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElF32 = stripBitcast(BV->getOperand(i));
      if (ElF32.getOpcode() != ModOpcode)
        break;
      EltsF32.push_back(ElF32.getOperand(0));
    }

    // All elements had ModOpcode modifier
    if (BV->getNumOperands() == EltsF32.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
                           32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

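// Select a WMMA operand that can be encoded as an inline immediate: either a
// 32-bit splat build vector whose splat value is an inline immediate, or a
// 16-bit (f16/bf16/i16) splat whose raw value is an inline constant.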
bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
  if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
    BitVector UndefElements;
    if (SDValue Splat = BV->getSplatValue(&UndefElements))
      if (isInlineImmediate(Splat.getNode())) {
        if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
          unsigned Imm = C->getAPIntValue().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
          unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        llvm_unreachable("unhandled Constant node");
      }
  }

  // 16-bit splat.
  SDValue SplatSrc32 = stripBitcast(In);
  if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
    if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
      SDValue SplatSrc16 = stripBitcast(Splat32);
      if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
        if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
          const SIInstrInfo *TII = Subtarget->getInstrInfo();
          std::optional<APInt> RawValue;
          if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
            RawValue = C->getValueAPF().bitcastToAPInt();
          else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
            RawValue = C->getAPIntValue();

          if (RawValue.has_value()) {
            EVT VT = In.getValueType().getScalarType();
            if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
              APFloat FloatVal(VT.getSimpleVT() == MVT::f16
                                   ? APFloatBase::IEEEhalf()
                                   : APFloatBase::BFloat(),
                               RawValue.value());
              if (TII->isInlineConstant(FloatVal)) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else if (VT.getSimpleVT() == MVT::i16) {
              if (TII->isInlineConstant(RawValue.value())) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else
              llvm_unreachable("unknown 16-bit type");
          }
        }
    }

  return false;
}

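// Select an SWMMAC index operand. If the index is taken from a 32-bit value
// with a byte-aligned right shift, use the shift source directly and encode
// the byte position in the index_key operand.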
bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
                                            SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

  if (In.getOpcode() == ISD::SRL) {
    const llvm::SDValue &ShiftSrc = In.getOperand(0);
    ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
    if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
        ShiftAmt->getZExtValue() % 8 == 0) {
      Key = ShiftAmt->getZExtValue() / 8;
      Src = ShiftSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}

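// Same as SelectSWMMACIndex8, but for 16-bit indices: a shift by 16 selects
// the high half of the 32-bit source (index_key = 1).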
bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
                                             SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

  if (In.getOpcode() == ISD::SRL) {
    const llvm::SDValue &ShiftSrc = In.getOperand(0);
    ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
    if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
        ShiftAmt->getZExtValue() == 16) {
      Key = 1;
      Src = ShiftSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}

// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the source's op_sel is set, it picks the high half of the
    // source register.

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
                                                  SDValue &SrcMods) const {
  unsigned Mods = 0;
  if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
    return false;
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

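// Produce an i32 value suitable for use as the high 16-bit half of a packed
// operand: undef stays undef, constants are shifted into the high half, and an
// explicit extract of the high element is looked through.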
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
        C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}

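// Return true if this immediate should be materialized in a VGPR: i.e. among
// the first 10 uses checked, at least one requires a VGPR operand and cannot
// be commuted into an SGPR-compatible position.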
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
  assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);

  const SIRegisterInfo *SIRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *SII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be one that needs to be an
    // SGPR, e.g. an inline asm constraint.
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
      AllUsesAcceptSReg = false;
      SDNode *User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass ||
                CommutedRC == &AMDGPU::VS_64RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If AllUsesAcceptSReg is still false, we failed to commute the current
      // user, which means at least one use strictly requires a VGPR. There is
      // no point in trying to commute the remaining users.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}

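// Return true if this load is uniform (non-divergent or backed by a uniform
// memory operand), sufficiently aligned, and either from the constant address
// space or a simple global load with no clobbering stores when scalarizing
// global loads is enabled.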
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  auto Ld = cast<LoadSDNode>(N);

  const MachineMemOperand *MMO = Ld->getMemOperand();
  if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
    return false;

  return MMO->getSize().hasValue() &&
         Ld->getAlign() >=
             Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
                            uint64_t(4))) &&
         ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           Ld->isSimple() &&
           static_cast<const SITargetLowering *>(getTargetLowering())
               ->isMemOpHasNoClobberedMemOperand(N)));
}

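// Run target-specific folds over the already-selected DAG, repeating until
// PostISelFolding makes no further changes.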
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering &Lowering =
      *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
  bool IsModified = false;
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more.
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}

char AMDGPUDAGToDAGISel::ID = 0;