1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetTransformInfo.h"
18#include "AMDGPUTargetMachine.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "SIModeRegisterDefaults.h"
21#include "llvm/Analysis/InlineCost.h"
22#include "llvm/Analysis/LoopInfo.h"
23#include "llvm/Analysis/ValueTracking.h"
24#include "llvm/CodeGen/Analysis.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/IR/PatternMatch.h"
28#include "llvm/Support/KnownBits.h"
29#include <optional>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "AMDGPUtti"
34
// Command-line knobs for the AMDGPU-specific loop-unrolling and inlining
// heuristics implemented below. All are hidden developer options.

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));
77
// Return true if \p Cond transitively (up to 10 operand levels deep) depends
// on a PHI node defined in loop \p L itself, i.e. a PHI that does not belong
// to any of L's subloops. getUnrollingPreferences uses this to find branch
// conditions that unrolling may be able to fold away together with the PHI.
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    // NOTE(review): this guard tests the loop-invariant 'I' rather than the
    // operand 'V', so it acts as a whole-walk filter instead of a per-operand
    // one — confirm whether 'V' was intended here.
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      // Only PHIs belonging directly to L count; PHIs of subloops do not.
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
                 return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}
96
// Construct the common AMDGPU TTI implementation for function \p F, caching
// the subtarget and target-lowering info derived from \p TM up front.
AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}
102
// Tune loop-unrolling parameters for AMDGPU. Beyond the base threshold
// (overridable via the "amdgpu-unroll-threshold" function attribute and the
// "amdgpu.loop.unroll.threshold" loop metadata), the threshold is boosted for
// loops whose branch conditions depend on loop-local PHIs, and for loops
// addressing private (scratch) or local (LDS/region) memory, where extra
// unrolling helps SROA and DS-offset combining respectively.
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // Conditional branch in a loop back edge needs 3 additional exec
  // manipulations in average.
  UP.BEInsns += 3;

  // We want to run unroll even for the loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size than can fit registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        // The metadata value also caps the memory-based boosts below.
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }

  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
        continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition
      // defined by a PHI belonging to the loop. This may help to eliminate
      // if region and potentially even PHI itself, saving on both divergence
      // and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          // Skip latch-like exiting branches; they do not form an "if" region
          // that unrolling could fold.
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      // Select the boost target based on which memory the GEP addresses.
      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        // Only boost for small static allocas that could be promoted to
        // registers (see MaxAlloca above).
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        // Definitions inside subloops don't count; unrolling L would not
        // eliminate them.
        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
               return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from inner loop then increase max trip
    // count to analyze for better estimation cost in unroll
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}
264
// No AMDGPU-specific peeling tuning; defer entirely to the base
// implementation.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
269
270int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
271 return 1024;
272}
273
// Subtarget features whose mismatch between caller and callee never blocks
// inlining; they are masked out of the feature-compatibility comparison.
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
293
// Construct the GCN TTI implementation for function \p F. The denormal-mode
// flags are captured up front because several cost queries below (e.g. FMUL
// fusion, FDIV) depend on them.
GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  SIModeRegisterDefaults Mode(F, *ST);
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}
304
305bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
306 return !F || !ST->isSingleLaneExecution(*F);
307}
308
309unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
310 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
311 // registers. See getRegisterClassForType for the implementation.
312 // In this case vector registers are not vector in terms of
313 // VGPRs, but those which can hold multiple values.
314
315 // This is really the number of registers to fill when vectorizing /
316 // interleaving loops, so we lie to avoid trying to use all registers.
317 return 4;
318}
319
320TypeSize
321GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
322 switch (K) {
323 case TargetTransformInfo::RGK_Scalar:
324 return TypeSize::getFixed(ExactSize: 32);
325 case TargetTransformInfo::RGK_FixedWidthVector:
326 return TypeSize::getFixed(ExactSize: ST->hasPackedFP32Ops() ? 64 : 32);
327 case TargetTransformInfo::RGK_ScalableVector:
328 return TypeSize::getScalable(MinimumSize: 0);
329 }
330 llvm_unreachable("Unsupported register kind");
331}
332
333unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
334 return 32;
335}
336
337unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
338 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
339 return 32 * 4 / ElemWidth;
340 return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
341 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
342 : 1;
343}
344
345unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
346 unsigned ChainSizeInBytes,
347 VectorType *VecTy) const {
348 unsigned VecRegBitWidth = VF * LoadSize;
349 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
350 // TODO: Support element-size less than 32bit?
351 return 128 / LoadSize;
352
353 return VF;
354}
355
356unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
357 unsigned ChainSizeInBytes,
358 VectorType *VecTy) const {
359 unsigned VecRegBitWidth = VF * StoreSize;
360 if (VecRegBitWidth > 128)
361 return 128 / StoreSize;
362
363 return VF;
364}
365
366unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
367 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
368 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
369 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
370 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
371 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
372 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
373 return 512;
374 }
375
376 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
377 return 8 * ST->getMaxPrivateElementSize();
378
379 // Common to flat, global, local and region. Assume for unknown addrspace.
380 return 128;
381}
382
383bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
384 Align Alignment,
385 unsigned AddrSpace) const {
386 // We allow vectorization of flat stores, even though we may need to decompose
387 // them later if they may access private memory. We don't have enough context
388 // here, and legalization can handle it.
389 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
390 return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
391 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
392 }
393 return true;
394}
395
// Load chains follow the same legality rules as generic memory chains.
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
401
// Store chains follow the same legality rules as generic memory chains.
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
407
408int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
409 return 1024;
410}
411
// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
//
// Choose the per-iteration access type for lowering a memcpy/memmove loop.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
    std::optional<uint32_t> AtomicElementSize) const {

  // Atomic expansion must use an integer type exactly matching the element
  // size.
  if (AtomicElementSize)
    return Type::getIntNTy(Context, *AtomicElementSize * 8);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
  // hardware into byte accesses. If you assume all alignments are equally
  // probable, it's more efficient on average to use short accesses for this
  // case.
  if (MinAlign == 2)
    return Type::getInt16Ty(Context);

  // Not all subtargets have 128-bit DS instructions, and we currently don't
  // form them by default.
  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
  }

  // Global memory works best with 16-byte accesses. Private memory will also
  // hit this, although they'll be decomposed.
  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
}
449
450void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
451 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
452 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
453 unsigned SrcAlign, unsigned DestAlign,
454 std::optional<uint32_t> AtomicCpySize) const {
455 assert(RemainingBytes < 16);
456
457 if (AtomicCpySize)
458 BaseT::getMemcpyLoopResidualLoweringType(
459 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
460 DestAlign, AtomicCpySize);
461
462 unsigned MinAlign = std::min(a: SrcAlign, b: DestAlign);
463
464 if (MinAlign != 2) {
465 Type *I64Ty = Type::getInt64Ty(C&: Context);
466 while (RemainingBytes >= 8) {
467 OpsOut.push_back(Elt: I64Ty);
468 RemainingBytes -= 8;
469 }
470
471 Type *I32Ty = Type::getInt32Ty(C&: Context);
472 while (RemainingBytes >= 4) {
473 OpsOut.push_back(Elt: I32Ty);
474 RemainingBytes -= 4;
475 }
476 }
477
478 Type *I16Ty = Type::getInt16Ty(C&: Context);
479 while (RemainingBytes >= 2) {
480 OpsOut.push_back(Elt: I16Ty);
481 RemainingBytes -= 2;
482 }
483
484 Type *I8Ty = Type::getInt8Ty(C&: Context);
485 while (RemainingBytes) {
486 OpsOut.push_back(Elt: I8Ty);
487 --RemainingBytes;
488 }
489}
490
491unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
492 // Disable unrolling if the loop is not vectorized.
493 // TODO: Enable this again.
494 if (VF.isScalar())
495 return 1;
496
497 return 8;
498}
499
// Describe the DS ordered/FP-atomic intrinsics as target memory intrinsics so
// generic passes can reason about their memory behavior. Returns false for
// unhandled intrinsics or malformed operands.
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    // Operand 2 is the atomic ordering and operand 4 the volatile flag; both
    // must be constant integers for the call to be well-formed.
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    // Reject orderings outside the AtomicOrdering enum range.
    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    // These intrinsics both read and write the pointed-to location.
    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}
528
// Model VALU throughput for arithmetic operations. Costs scale with the
// number of legalized register pieces (LT.first) and lanes (NElts); packed
// 16-bit ops (and packed FP32 where supported) cost half per element, while
// 64-bit ops, integer multiply and FP division expand to multi-instruction
// sequences. Anything unhandled falls back to the base implementation.
InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    // Packed 16-bit shifts handle two elements per instruction.
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      // 64-bit multiply expands to 4 quarter-rate and 4 full-rate
      // instructions.
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    // Packed FP32 ops handle two f32 elements per instruction.
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // Reciprocal (1.0 / x) lowers to a single rcp instruction.
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
                            TLI->getTargetMachine().Options.UnsafeFPMath)) {
      // Fast unsafe fdiv lowering:
      // f32 rcp
      // f32 fmul
      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend' estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}
687
688// Return true if there's a potential benefit from using v2f16/v2i16
689// instructions for an intrinsic, even if it requires nontrivial legalization.
690static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
691 switch (ID) {
692 case Intrinsic::fma: // TODO: fmuladd
693 // There's a small benefit to using vector ops in the legalized code.
694 case Intrinsic::round:
695 case Intrinsic::uadd_sat:
696 case Intrinsic::usub_sat:
697 case Intrinsic::sadd_sat:
698 case Intrinsic::ssub_sat:
699 return true;
700 default:
701 return false;
702 }
703}
704
// Cost intrinsics that benefit from packed v2f16/v2i16 lowering; everything
// else is delegated to the base implementation.
InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  // fabs is treated as free — NOTE(review): presumably folded into a source
  // modifier; confirm.
  if (ICA.getID() == Intrinsic::fabs)
    return 0;

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost(CostKind);

  // Packed 16-bit (and packed FP32) ops handle element pairs.
  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                   : getQuarterRateInstrCost(CostKind);
    break;
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    // Legal packed saturating-math types cost like a single element.
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      NElts = 1;
    break;
  }

  return LT.first * NElts * InstRate;
}
751
// Cost of control-flow instructions. Size-oriented cost kinds (CodeSize,
// SizeAndLatency) get small counts; otherwise slot-style estimates that
// include the average EXEC-mask manipulation overhead of divergent control
// flow.
InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::Br: {
    // Branch instruction takes about 4 slots on gfx900.
    auto BI = dyn_cast_or_null<BranchInst>(I);
    if (BI && BI->isUnconditional())
      return SCost ? 1 : 4;
    // Suppose conditional branch takes additional 3 exec manipulations
    // instructions in average.
    return CBrCost;
  }
  case Instruction::Switch: {
    auto SI = dyn_cast_or_null<SwitchInst>(I);
    // Each case (including default) takes 1 cmp + 1 cbr instructions in
    // average.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
781
// Reductions over 16-bit elements can use packed (VOP3P) instructions and are
// modeled at full rate; ordered FP reductions and everything else fall back
// to the base cost model.
InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions(which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getFullRateInstrCost();
}
799
// Min/max reductions over 16-bit elements can use packed (VOP3P)
// instructions, modeled at half rate; everything else falls back to the base
// cost model.
InstructionCost
GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                   FastMathFlags FMF,
                                   TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions(which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getHalfRateInstrCost(CostKind);
}
814
// Cost of insert/extract element: free for 32-bit-or-wider elements with a
// known index (subregister operations), free for 16-bit element 0 with
// 16-bit instructions, 2 for dynamic indices; otherwise base cost.
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      // Element 0 of a 16-bit pair is addressable directly with 16-bit insts.
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
                                       Op1);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
  }
}
842
/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getModule()->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT).second;

    // An output is uniform only when it is constrained to an SGPR class.
    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}
882
883bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
884 const IntrinsicInst *ReadReg) const {
885 Metadata *MD =
886 cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: 0))->getMetadata();
887 StringRef RegName =
888 cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: 0))->getString();
889
890 // Special case registers that look like VCC.
891 MVT VT = MVT::getVT(Ty: ReadReg->getType());
892 if (VT == MVT::i1)
893 return true;
894
895 // Special case scalar registers that start with 'v'.
896 if (RegName.starts_with(Prefix: "vcc") || RegName.empty())
897 return false;
898
899 // VGPR or AGPR is divergent. There aren't any specially named vector
900 // registers.
901 return RegName[0] == 'v' || RegName[0] == 'a';
902}
903
904/// \returns true if the result of the value could potentially be
905/// different across workitems in a wavefront.
906bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
907 if (const Argument *A = dyn_cast<Argument>(Val: V))
908 return !AMDGPU::isArgPassedInSGPR(Arg: A);
909
910 // Loads from the private and flat address spaces are divergent, because
911 // threads can execute the load instruction with the same inputs and get
912 // different results.
913 //
914 // All other loads are not divergent, because if threads issue loads with the
915 // same arguments, they will always get the same result.
916 if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V))
917 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
918 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
919
920 // Atomics are divergent because they are executed sequentially: when an
921 // atomic operation refers to the same address in each thread, then each
922 // thread after the first sees the value written by the previous thread as
923 // original value.
924 if (isa<AtomicRMWInst>(Val: V) || isa<AtomicCmpXchgInst>(Val: V))
925 return true;
926
927 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) {
928 if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
929 return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic);
930
931 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: Intrinsic->getIntrinsicID());
932 }
933
934 // Assume all function calls are a source of divergence.
935 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
936 if (CI->isInlineAsm())
937 return isInlineAsmSourceOfDivergence(CI);
938 return true;
939 }
940
941 // Assume all function calls are a source of divergence.
942 if (isa<InvokeInst>(Val: V))
943 return true;
944
945 return false;
946}
947
// \returns true when \p V is known to produce the same value in every lane
// of a wavefront, overriding the default divergence analysis.
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  // Intrinsics classified as always uniform by the target table (e.g.
  // wave-level reductions and readfirstlane-style operations).
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V))
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID());

  // Inline asm whose outputs are all SGPRs is uniform; any other call is not
  // known to be uniform.
  if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into a same wave which gives 1 and 0 after the division by 64
  // respectively.
  //
  // FIXME: limit it to 1D kernels only, although that shall be possible
  // to perform this optimization if the size of the X dimension is a power
  // of 2, we just do not currently have infrastructure to query it.
  using namespace llvm::PatternMatch;
  uint64_t C;
  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C))) ||
      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C)))) {
    const Function *F = cast<Instruction>(Val: V)->getFunction();
    // Uniform only when the shift discards the whole lane index and the
    // kernel is effectively one-dimensional (max Y and Z workitem IDs are 0).
    return C >= ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  // Similarly, masking workitem-id-x with a value whose low
  // wavefront-size-log2 bits are known zero discards the lane index.
  Value *Mask;
  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_Value(Mask)))) {
    const Function *F = cast<Instruction>(Val: V)->getFunction();
    const DataLayout &DL = F->getParent()->getDataLayout();
    return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  // The remaining patterns only concern extractvalue from a call.
  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: 0));
  if (!CI)
    return false;

  // The second result (index 1) of amdgcn.if/amdgcn.else is uniform.
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices());

  return false;
}
1018
1019bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1020 Intrinsic::ID IID) const {
1021 switch (IID) {
1022 case Intrinsic::amdgcn_ds_fadd:
1023 case Intrinsic::amdgcn_ds_fmin:
1024 case Intrinsic::amdgcn_ds_fmax:
1025 case Intrinsic::amdgcn_is_shared:
1026 case Intrinsic::amdgcn_is_private:
1027 case Intrinsic::amdgcn_flat_atomic_fadd:
1028 case Intrinsic::amdgcn_flat_atomic_fmax:
1029 case Intrinsic::amdgcn_flat_atomic_fmin:
1030 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1031 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1032 OpIndexes.push_back(Elt: 0);
1033 return true;
1034 default:
1035 return false;
1036 }
1037}
1038
// Rewrite intrinsic call \p II to use the more specific pointer \p NewV in
// place of \p OldV (an operand in a generic address space). Returns the
// replacement value, or nullptr if the rewrite is not possible.
Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    // Only the non-volatile form can be rewritten; operand 4 is the
    // isVolatile flag.
    const ConstantInt *IsVolatile = cast<ConstantInt>(Val: II->getArgOperand(i: 4));
    if (!IsVolatile->isZero())
      return nullptr;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    // Re-mangle the intrinsic for the new pointer type and swap in the new
    // pointer operand in place.
    Function *NewDecl =
        Intrinsic::getDeclaration(M, id: II->getIntrinsicID(), Tys: {DestTy, SrcTy});
    II->setArgOperand(i: 0, v: NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    // Once the address space is known, the query folds to a constant.
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(i: 1);
    Type *MaskTy = MaskOp->getType();

    // Whether the mask must be narrowed to match a 32-bit pointer.
    bool DoTruncate = false;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
    if (!TM.isNoopAddrSpaceCast(SrcAS: OldAS, DestAS: NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking only clearing the low bits will also apply in the new
      // address space.
      if (DL.getPointerSizeInBits(AS: OldAS) != 64 ||
          DL.getPointerSizeInBits(AS: NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      KnownBits Known = computeKnownBits(V: MaskOp, DL, Depth: 0, AC: nullptr, CxtI: II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(V: MaskOp, DestTy: MaskTy);
    }

    // Emit a fresh ptrmask over the specific pointer type.
    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                             {NewV, MaskOp});
  }
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    unsigned NewAS = SrcTy->getPointerAddressSpace();
    // The flat atomic forms only apply to global-like address spaces.
    if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS))
      return nullptr;
    Module *M = II->getModule();
    // Re-mangle for the new pointer type and rewrite the call in place.
    Function *NewDecl = Intrinsic::getDeclaration(M, id: II->getIntrinsicID(),
                                                  Tys: {DestTy, SrcTy, DestTy});
    II->setArgOperand(i: 0, v: NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  default:
    return nullptr;
  }
}
1125
1126InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1127 VectorType *VT, ArrayRef<int> Mask,
1128 TTI::TargetCostKind CostKind,
1129 int Index, VectorType *SubTp,
1130 ArrayRef<const Value *> Args,
1131 const Instruction *CxtI) {
1132 Kind = improveShuffleKindFromMask(Kind, Mask, Ty: VT, Index, SubTy&: SubTp);
1133 // Treat extractsubvector as single op permutation.
1134 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1135 if (IsExtractSubvector)
1136 Kind = TTI::SK_PermuteSingleSrc;
1137
1138 if (ST->hasVOP3PInsts()) {
1139 if (cast<FixedVectorType>(Val: VT)->getNumElements() == 2 &&
1140 DL.getTypeSizeInBits(Ty: VT->getElementType()) == 16) {
1141 // With op_sel VOP3P instructions freely can access the low half or high
1142 // half of a register, so any swizzle is free.
1143
1144 switch (Kind) {
1145 case TTI::SK_Broadcast:
1146 case TTI::SK_Reverse:
1147 case TTI::SK_PermuteSingleSrc:
1148 return 0;
1149 default:
1150 break;
1151 }
1152 }
1153 }
1154 // Restore optimal kind.
1155 if (IsExtractSubvector)
1156 Kind = TTI::SK_ExtractSubvector;
1157
1158 return BaseT::getShuffleCost(Kind, Tp: VT, Mask, CostKind, Index, SubTp);
1159}
1160
1161bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1162 const Function *Callee) const {
1163 const TargetMachine &TM = getTLI()->getTargetMachine();
1164 const GCNSubtarget *CallerST
1165 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1166 const GCNSubtarget *CalleeST
1167 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1168
1169 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1170 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1171
1172 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1173 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1174 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1175 return false;
1176
1177 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1178 // no way to support merge for backend defined attributes.
1179 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1180 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1181 if (!CallerMode.isInlineCompatible(CalleeMode))
1182 return false;
1183
1184 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1185 Callee->hasFnAttribute(Attribute::InlineHint))
1186 return true;
1187
1188 // Hack to make compile times reasonable.
1189 if (InlineMaxBB) {
1190 // Single BB does not increase total BB amount.
1191 if (Callee->size() == 1)
1192 return true;
1193 size_t BBSize = Caller->size() + Callee->size() - 1;
1194 return BBSize <= InlineMaxBB;
1195 }
1196
1197 return true;
1198}
1199
1200static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1201 const SITargetLowering *TLI,
1202 const GCNTTIImpl *TTIImpl) {
1203 const int NrOfSGPRUntilSpill = 26;
1204 const int NrOfVGPRUntilSpill = 32;
1205
1206 const DataLayout &DL = TTIImpl->getDataLayout();
1207
1208 unsigned adjustThreshold = 0;
1209 int SGPRsInUse = 0;
1210 int VGPRsInUse = 0;
1211 for (const Use &A : CB->args()) {
1212 SmallVector<EVT, 4> ValueVTs;
1213 ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs);
1214 for (auto ArgVT : ValueVTs) {
1215 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1216 Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT);
1217 if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A)))
1218 SGPRsInUse += CCRegNum;
1219 else
1220 VGPRsInUse += CCRegNum;
1221 }
1222 }
1223
1224 // The cost of passing function arguments through the stack:
1225 // 1 instruction to put a function argument on the stack in the caller.
1226 // 1 instruction to take a function argument from the stack in callee.
1227 // 1 instruction is explicitly take care of data dependencies in callee
1228 // function.
1229 InstructionCost ArgStackCost(1);
1230 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1231 Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
1232 AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1233 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1234 Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4),
1235 AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1236
1237 // The penalty cost is computed relative to the cost of instructions and does
1238 // not model any storage costs.
1239 adjustThreshold += std::max(a: 0, b: SGPRsInUse - NrOfSGPRUntilSpill) *
1240 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
1241 adjustThreshold += std::max(a: 0, b: VGPRsInUse - NrOfVGPRUntilSpill) *
1242 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
1243 return adjustThreshold;
1244}
1245
1246static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1247 const DataLayout &DL) {
1248 // If we have a pointer to a private array passed into a function
1249 // it will not be optimized out, leaving scratch usage.
1250 // This function calculates the total size in bytes of the memory that would
1251 // end in scratch if the call was not inlined.
1252 unsigned AllocaSize = 0;
1253 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1254 for (Value *PtrArg : CB->args()) {
1255 PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType());
1256 if (!Ty)
1257 continue;
1258
1259 unsigned AddrSpace = Ty->getAddressSpace();
1260 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1261 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1262 continue;
1263
1264 const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg));
1265 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(Ptr: AI).second)
1266 continue;
1267
1268 AllocaSize += DL.getTypeAllocSize(Ty: AI->getAllocatedType());
1269 }
1270 return AllocaSize;
1271}
1272
1273unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1274 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this);
1275
1276 // Private object passed as arguments may end up in scratch usage if the call
1277 // is not inlined. Increase the inline threshold to promote inlining.
1278 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1279 if (AllocaSize > 0)
1280 Threshold += ArgAllocaCost;
1281 return Threshold;
1282}
1283
1284unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1285 const AllocaInst *AI) const {
1286
1287 // Below the cutoff, assume that the private memory objects would be
1288 // optimized
1289 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1290 if (AllocaSize <= ArgAllocaCutoff)
1291 return 0;
1292
1293 // Above the cutoff, we give a cost to each private memory object
1294 // depending its size. If the array can be optimized by SROA this cost is not
1295 // added to the total-cost in the inliner cost analysis.
1296 //
1297 // We choose the total cost of the alloca such that their sum cancels the
1298 // bonus given in the threshold (ArgAllocaCost).
1299 //
1300 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1301 //
1302 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1303 // the single-bb bonus and the vector-bonus.
1304 //
1305 // We compensate the first two multipliers, by repeating logic from the
1306 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1307 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1308 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1309
1310 bool SingleBB = none_of(Range&: *CB->getCalledFunction(), P: [](const BasicBlock &BB) {
1311 return BB.getTerminator()->getNumSuccessors() > 1;
1312 });
1313 if (SingleBB) {
1314 Threshold += Threshold / 2;
1315 }
1316
1317 auto ArgAllocaSize = DL.getTypeAllocSize(Ty: AI->getAllocatedType());
1318
1319 // Attribute the bonus proportionally to the alloca size
1320 unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
1321
1322 return AllocaThresholdBonus;
1323}
1324
// Delegate unrolling preferences to the common AMDGPU implementation.
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}
1330
// Delegate peeling preferences to the common AMDGPU implementation.
void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}
1335
1336int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1337 return ST->hasFullRate64Ops()
1338 ? getFullRateInstrCost()
1339 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1340 : getQuarterRateInstrCost(CostKind);
1341}
1342
1343std::pair<InstructionCost, MVT>
1344GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1345 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1346 auto Size = DL.getTypeSizeInBits(Ty);
1347 // Maximum load or store can handle 8 dwords for scalar and 4 for
1348 // vector ALU. Let's assume anything above 8 dwords is expensive
1349 // even if legal.
1350 if (Size <= 256)
1351 return Cost;
1352
1353 Cost.first += (Size + 255) / 256;
1354 return Cost;
1355}
1356
1357unsigned GCNTTIImpl::getPrefetchDistance() const {
1358 return ST->hasPrefetch() ? 128 : 0;
1359}
1360
// Only flat/global-like address spaces are considered for prefetching.
bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
  return AMDGPU::isFlatGlobalAddrSpace(AS);
}
1364

// source code of llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp