AMDGPUTargetTransformInfo.cpp source code [llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp]

1	//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// \file
10	// This file implements a TargetTransformInfo analysis pass specific to the
11	// AMDGPU target machine. It uses the target's detailed information to provide
12	// more precise answers to certain TTI queries, while letting the target
13	// independent and default TTI implementations handle the rest.
14	//
15	//===----------------------------------------------------------------------===//
16
17	#include "AMDGPUTargetTransformInfo.h"
18	#include "AMDGPUTargetMachine.h"
19	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20	#include "SIModeRegisterDefaults.h"
21	#include "llvm/Analysis/InlineCost.h"
22	#include "llvm/Analysis/LoopInfo.h"
23	#include "llvm/Analysis/ValueTracking.h"
24	#include "llvm/CodeGen/Analysis.h"
25	#include "llvm/IR/IRBuilder.h"
26	#include "llvm/IR/IntrinsicsAMDGPU.h"
27	#include "llvm/IR/PatternMatch.h"
28	#include "llvm/Support/KnownBits.h"
29	#include <optional>
30
31	using namespace llvm;
32
33	#define DEBUG_TYPE "AMDGPUtti"
34
35	static cl::opt<unsigned> UnrollThresholdPrivate(
36	"amdgpu-unroll-threshold-private",
37	cl::desc ("Unroll threshold for AMDGPU if private memory used in a loop"),
38	cl::init(Val: `2700`), cl::Hidden);
39
40	static cl::opt<unsigned> UnrollThresholdLocal(
41	"amdgpu-unroll-threshold-local",
42	cl::desc ("Unroll threshold for AMDGPU if local memory used in a loop"),
43	cl::init(Val: `1000`), cl::Hidden);
44
45	static cl::opt<unsigned> UnrollThresholdIf(
46	"amdgpu-unroll-threshold-if",
47	cl::desc ("Unroll threshold increment for AMDGPU for each if statement inside loop"),
48	cl::init(Val: `200`), cl::Hidden);
49
50	static cl::opt<bool> UnrollRuntimeLocal(
51	"amdgpu-unroll-runtime-local",
52	cl::desc ("Allow runtime unroll for AMDGPU if local memory used in a loop"),
53	cl::init(Val: true), cl::Hidden);
54
55	static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
56	"amdgpu-unroll-max-block-to-analyze",
57	cl::desc ("Inner loop block size threshold to analyze in unroll for AMDGPU"),
58	cl::init(Val: `32`), cl::Hidden);
59
60	static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
61	cl::Hidden, cl::init(Val: `4000`),
62	cl::desc ("Cost of alloca argument"));
63
64	// If the amount of scratch memory to eliminate exceeds our ability to allocate
65	// it into registers we gain nothing by aggressively inlining functions for that
66	// heuristic.
67	static cl::opt<unsigned>
68	ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
69	cl::init(Val: `256`),
70	cl::desc ("Maximum alloca size to use for inline cost"));
71
72	// Inliner constraint to achieve reasonable compilation time.
73	static cl::opt<size_t> InlineMaxBB(
74	"amdgpu-inline-max-bb", cl::Hidden, cl::init(Val: `1100`),
75	cl::desc ("Maximum number of BBs allowed in a function after inlining"
76	" (compile time constraint)"));
77
78	static bool dependsOnLocalPhi(const Loop L, const* Value *Cond,
79	unsigned Depth = `0`) {
80	const Instruction *I = dyn_cast<Instruction>(Val: Cond);
81	if (!I)
82	return false;
83
84	for (const Value *V : I->operand_values()) {
85	if (!L->contains(Inst: I))
86	continue;
87	if (const PHINode *PHI = dyn_cast<PHINode>(Val: V)) {
88	if (llvm::none_of(Range: L->getSubLoops(), P: [PHI](const Loop* SubLoop) {
89	return SubLoop->contains(Inst: PHI); }))
90	return true;
91	} else if (Depth < `10` && dependsOnLocalPhi(L, Cond: V, Depth: Depth+`1`))
92	return true;
93	}
94	return false;
95	}
96
97	AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine TM, const* Function &F)
98	: BaseT (TM, F.getParent()->getDataLayout()),
99	TargetTriple (TM->getTargetTriple()),
100	ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
101	TLI(ST->getTargetLowering()) {}
102
103	void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
104	TTI::UnrollingPreferences &UP,
105	OptimizationRemarkEmitter *ORE) {
106	const Function &F = *L->getHeader()->getParent();
107	UP.Threshold =
108	F.getFnAttributeAsParsedInteger(Kind: "amdgpu-unroll-threshold", Default: `300`);
109	UP.MaxCount = std::numeric_limits<unsigned>::max();
110	UP.Partial = true;
111
112	// Conditional branch in a loop back edge needs 3 additional exec
113	// manipulations in average.
114	UP.BEInsns += `3`;
115
116	// We want to run unroll even for the loops which have been vectorized.
117	UP.UnrollVectorizedLoop = true;
118
119	// TODO: Do we want runtime unrolling?
120
121	// Maximum alloca size than can fit registers. Reserve 16 registers.
122	const unsigned MaxAlloca = (`256` - `16`) * `4`;
123	unsigned ThresholdPrivate = UnrollThresholdPrivate;
124	unsigned ThresholdLocal = UnrollThresholdLocal;
125
126	// If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
127	// provided threshold value as the default for Threshold
128	if (MDNode *LoopUnrollThreshold =
129	findOptionMDForLoop(TheLoop: L, Name: "amdgpu.loop.unroll.threshold")) {
130	if (LoopUnrollThreshold->getNumOperands() == `2`) {
131	ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
132	MD: LoopUnrollThreshold->getOperand(I: `1`));
133	if (MetaThresholdValue) {
134	// We will also use the supplied value for PartialThreshold for now.
135	// We may introduce additional metadata if it becomes necessary in the
136	// future.
137	UP.Threshold = MetaThresholdValue->getSExtValue();
138	UP.PartialThreshold = UP.Threshold;
139	ThresholdPrivate = std::min(a: ThresholdPrivate, b: UP.Threshold);
140	ThresholdLocal = std::min(a: ThresholdLocal, b: UP.Threshold);
141	}
142	}
143	}
144
145	unsigned MaxBoost = std::max(a: ThresholdPrivate, b: ThresholdLocal);
146	for (const BasicBlock *BB : L->getBlocks()) {
147	const DataLayout &DL = BB->getModule()->getDataLayout();
148	unsigned LocalGEPsSeen = `0`;
149
150	if (llvm::any_of(Range: L->getSubLoops(), P: [BB](const Loop* SubLoop) {
151	return SubLoop->contains(BB); }))
152	continue; // Block belongs to an inner loop.
153
154	for (const Instruction &I : *BB) {
155	// Unroll a loop which contains an "if" statement whose condition
156	// defined by a PHI belonging to the loop. This may help to eliminate
157	// if region and potentially even PHI itself, saving on both divergence
158	// and registers used for the PHI.
159	// Add a small bonus for each of such "if" statements.
160	if (const BranchInst *Br = dyn_cast<BranchInst>(Val: &I)) {
161	if (UP.Threshold < MaxBoost && Br->isConditional()) {
162	BasicBlock *Succ0 = Br->getSuccessor(i: `0`);
163	BasicBlock *Succ1 = Br->getSuccessor(i: `1`);
164	if ((L->contains(BB: Succ0) && L->isLoopExiting(BB: Succ0)) \|\|
165	(L->contains(BB: Succ1) && L->isLoopExiting(BB: Succ1)))
166	continue;
167	if (dependsOnLocalPhi(L, Cond: Br->getCondition())) {
168	UP.Threshold += UnrollThresholdIf;
169	LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
170	<< " for loop:\n"
171	<< L << " due to " << Br << `'\n'`);
172	if (UP.Threshold >= MaxBoost)
173	return;
174	}
175	}
176	continue;
177	}
178
179	const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I);
180	if (!GEP)
181	continue;
182
183	unsigned AS = GEP->getAddressSpace();
184	unsigned Threshold = `0`;
185	if (AS == AMDGPUAS::PRIVATE_ADDRESS)
186	Threshold = ThresholdPrivate;
187	else if (AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS)
188	Threshold = ThresholdLocal;
189	else
190	continue;
191
192	if (UP.Threshold >= Threshold)
193	continue;
194
195	if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
196	const Value *Ptr = GEP->getPointerOperand();
197	const AllocaInst *Alloca =
198	dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: Ptr));
199	if (!Alloca \|\| !Alloca->isStaticAlloca())
200	continue;
201	Type *Ty = Alloca->getAllocatedType();
202	unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : `0`;
203	if (AllocaSize > MaxAlloca)
204	continue;
205	} else if (AS == AMDGPUAS::LOCAL_ADDRESS \|\|
206	AS == AMDGPUAS::REGION_ADDRESS) {
207	LocalGEPsSeen++;
208	// Inhibit unroll for local memory if we have seen addressing not to
209	// a variable, most likely we will be unable to combine it.
210	// Do not unroll too deep inner loops for local memory to give a chance
211	// to unroll an outer loop for a more important reason.
212	if (LocalGEPsSeen > `1` \|\| L->getLoopDepth() > `2` \|\|
213	(!isa<GlobalVariable>(Val: GEP->getPointerOperand()) &&
214	!isa<Argument>(Val: GEP->getPointerOperand())))
215	continue;
216	LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
217	<< *L << " due to LDS use.\n");
218	UP.Runtime = UnrollRuntimeLocal;
219	}
220
221	// Check if GEP depends on a value defined by this loop itself.
222	bool HasLoopDef = false;
223	for (const Value *Op : GEP->operands()) {
224	const Instruction *Inst = dyn_cast<Instruction>(Val: Op);
225	if (!Inst \|\| L->isLoopInvariant(V: Op))
226	continue;
227
228	if (llvm::any_of(Range: L->getSubLoops(), P: [Inst](const Loop* SubLoop) {
229	return SubLoop->contains(Inst); }))
230	continue;
231	HasLoopDef = true;
232	break;
233	}
234	if (!HasLoopDef)
235	continue;
236
237	// We want to do whatever we can to limit the number of alloca
238	// instructions that make it through to the code generator. allocas
239	// require us to use indirect addressing, which is slow and prone to
240	// compiler bugs. If this loop does an address calculation on an
241	// alloca ptr, then we want to use a higher than normal loop unroll
242	// threshold. This will give SROA a better chance to eliminate these
243	// allocas.
244	//
245	// We also want to have more unrolling for local memory to let ds
246	// instructions with different offsets combine.
247	//
248	// Don't use the maximum allowed value here as it will make some
249	// programs way too big.
250	UP.Threshold = Threshold;
251	LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
252	<< " for loop:\n"
253	<< L << " due to " << GEP << `'\n'`);
254	if (UP.Threshold >= MaxBoost)
255	return;
256	}
257
258	// If we got a GEP in a small BB from inner loop then increase max trip
259	// count to analyze for better estimation cost in unroll
260	if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
261	UP.MaxIterationsCountToAnalyze = `32`;
262	}
263	}
264
265	void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
266	TTI::PeelingPreferences &PP) {
267	BaseT::getPeelingPreferences(L, SE, PP);
268	}
269
270	int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
271	return `1024`;
272	}
273
274	const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
275	// Codegen control options which don't matter.
276	AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
277	AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
278	AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
279	AMDGPU::FeatureUnalignedAccessMode,
280
281	AMDGPU::FeatureAutoWaitcntBeforeBarrier,
282
283	// Property of the kernel/environment which can't actually differ.
284	AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
285	AMDGPU::FeatureTrapHandler,
286
287	// The default assumption needs to be ecc is enabled, but no directly
288	// exposed operations depend on it, so it can be safely inlined.
289	AMDGPU::FeatureSRAMECC,
290
291	// Perf-tuning features
292	AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
293
294	GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine TM, const* Function &F)
295	: BaseT (TM, F.getParent()->getDataLayout()),
296	ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
297	TLI(ST->getTargetLowering()), CommonTTI (TM, F),
298	IsGraphics(AMDGPU::isGraphics(CC: F.getCallingConv())) {
299	SIModeRegisterDefaults Mode(F, *ST);
300	HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
301	HasFP64FP16Denormals =
302	Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
303	}
304
305	bool GCNTTIImpl::hasBranchDivergence(const Function F) const* {
306	return !F \|\| !ST->isSingleLaneExecution(*F);
307	}
308
309	unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
310	// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
311	// registers. See getRegisterClassForType for the implementation.
312	// In this case vector registers are not vector in terms of
313	// VGPRs, but those which can hold multiple values.
314
315	// This is really the number of registers to fill when vectorizing /
316	// interleaving loops, so we lie to avoid trying to use all registers.
317	return `4`;
318	}
319
320	TypeSize
321	GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
322	switch (K) {
323	case TargetTransformInfo::RGK_Scalar:
324	return TypeSize::getFixed(ExactSize: `32`);
325	case TargetTransformInfo::RGK_FixedWidthVector:
326	return TypeSize::getFixed(ExactSize: ST->hasPackedFP32Ops() ? `64` : `32`);
327	case TargetTransformInfo::RGK_ScalableVector:
328	return TypeSize::getScalable(MinimumSize: `0`);
329	}
330	llvm_unreachable("Unsupported register kind");
331	}
332
333	unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
334	return `32`;
335	}
336
337	unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
338	if (Opcode == Instruction::Load \|\| Opcode == Instruction::Store)
339	return `32` * `4` / ElemWidth;
340	return (ElemWidth == `16` && ST->has16BitInsts()) ? `2`
341	: (ElemWidth == `32` && ST->hasPackedFP32Ops()) ? `2`
342	: `1`;
343	}
344
345	unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
346	unsigned ChainSizeInBytes,
347	VectorType VecTy) const* {
348	unsigned VecRegBitWidth = VF * LoadSize;
349	if (VecRegBitWidth > `128` && VecTy->getScalarSizeInBits() < `32`)
350	// TODO: Support element-size less than 32bit?
351	return `128` / LoadSize;
352
353	return VF;
354	}
355
356	unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
357	unsigned ChainSizeInBytes,
358	VectorType VecTy) const* {
359	unsigned VecRegBitWidth = VF * StoreSize;
360	if (VecRegBitWidth > `128`)
361	return `128` / StoreSize;
362
363	return VF;
364	}
365
366	unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
367	if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS \|\|
368	AddrSpace == AMDGPUAS::CONSTANT_ADDRESS \|\|
369	AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT \|\|
370	AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER \|\|
371	AddrSpace == AMDGPUAS::BUFFER_RESOURCE \|\|
372	AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
373	return `512`;
374	}
375
376	if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
377	return `8` * ST->getMaxPrivateElementSize();
378
379	// Common to flat, global, local and region. Assume for unknown addrspace.
380	return `128`;
381	}
382
383	bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
384	Align Alignment,
385	unsigned AddrSpace) const {
386	// We allow vectorization of flat stores, even though we may need to decompose
387	// them later if they may access private memory. We don't have enough context
388	// here, and legalization can handle it.
389	if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
390	return (Alignment >= `4` \|\| ST->hasUnalignedScratchAccess()) &&
391	ChainSizeInBytes <= ST->getMaxPrivateElementSize();
392	}
393	return true;
394	}
395
396	bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
397	Align Alignment,
398	unsigned AddrSpace) const {
399	return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
400	}
401
402	bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
403	Align Alignment,
404	unsigned AddrSpace) const {
405	return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
406	}
407
408	int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
409	return `1024`;
410	}
411
412	// FIXME: Really we would like to issue multiple 128-bit loads and stores per
413	// iteration. Should we report a larger size and let it legalize?
414	//
415	// FIXME: Should we use narrower types for local/region, or account for when
416	// unaligned access is legal?
417	//
418	// FIXME: This could use fine tuning and microbenchmarks.
419	Type *GCNTTIImpl::getMemcpyLoopLoweringType(
420	LLVMContext &Context, Value Length, unsigned* SrcAddrSpace,
421	unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
422	std::optional<uint32_t> AtomicElementSize) const {
423
424	if (AtomicElementSize)
425	return Type::getIntNTy(C&: Context, N: AtomicElementSize `8`);
426
427	unsigned MinAlign = std::min(a: SrcAlign, b: DestAlign);
428
429	// A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
430	// hardware into byte accesses. If you assume all alignments are equally
431	// probable, it's more efficient on average to use short accesses for this
432	// case.
433	if (MinAlign == `2`)
434	return Type::getInt16Ty(C&: Context);
435
436	// Not all subtargets have 128-bit DS instructions, and we currently don't
437	// form them by default.
438	if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS \|\|
439	SrcAddrSpace == AMDGPUAS::REGION_ADDRESS \|\|
440	DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS \|\|
441	DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
442	return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: `2`);
443	}
444
445	// Global memory works best with 16-byte accesses. Private memory will also
446	// hit this, although they'll be decomposed.
447	return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: `4`);
448	}
449
450	void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
451	SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
452	unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
453	unsigned SrcAlign, unsigned DestAlign,
454	std::optional<uint32_t> AtomicCpySize) const {
455	assert(RemainingBytes < `16`);
456
457	if (AtomicCpySize)
458	BaseT::getMemcpyLoopResidualLoweringType(
459	OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
460	DestAlign, AtomicCpySize);
461
462	unsigned MinAlign = std::min(a: SrcAlign, b: DestAlign);
463
464	if (MinAlign != `2`) {
465	Type *I64Ty = Type::getInt64Ty(C&: Context);
466	while (RemainingBytes >= `8`) {
467	OpsOut.push_back(Elt: I64Ty);
468	RemainingBytes -= `8`;
469	}
470
471	Type *I32Ty = Type::getInt32Ty(C&: Context);
472	while (RemainingBytes >= `4`) {
473	OpsOut.push_back(Elt: I32Ty);
474	RemainingBytes -= `4`;
475	}
476	}
477
478	Type *I16Ty = Type::getInt16Ty(C&: Context);
479	while (RemainingBytes >= `2`) {
480	OpsOut.push_back(Elt: I16Ty);
481	RemainingBytes -= `2`;
482	}
483
484	Type *I8Ty = Type::getInt8Ty(C&: Context);
485	while (RemainingBytes) {
486	OpsOut.push_back(Elt: I8Ty);
487	--RemainingBytes;
488	}
489	}
490
491	unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
492	// Disable unrolling if the loop is not vectorized.
493	// TODO: Enable this again.
494	if (VF.isScalar())
495	return `1`;
496
497	return `8`;
498	}
499
500	bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
501	MemIntrinsicInfo &Info) const {
502	switch (Inst->getIntrinsicID()) {
503	case Intrinsic::amdgcn_ds_ordered_add:
504	case Intrinsic::amdgcn_ds_ordered_swap:
505	case Intrinsic::amdgcn_ds_fadd:
506	case Intrinsic::amdgcn_ds_fmin:
507	case Intrinsic::amdgcn_ds_fmax: {
508	auto *Ordering = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: `2`));
509	auto *Volatile = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: `4`));
510	if (!Ordering \|\| !Volatile)
511	return false; // Invalid.
512
513	unsigned OrderingVal = Ordering->getZExtValue();
514	if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
515	return false;
516
517	Info.PtrVal = Inst->getArgOperand(i: `0`);
518	Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
519	Info.ReadMem = true;
520	Info.WriteMem = true;
521	Info.IsVolatile = !Volatile->isZero();
522	return true;
523	}
524	default:
525	return false;
526	}
527	}
528
529	InstructionCost GCNTTIImpl::getArithmeticInstrCost(
530	unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
531	TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
532	ArrayRef<const Value *> Args,
533	const Instruction *CxtI) {
534
535	// Legalize the type.
536	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
537	int ISD = TLI->InstructionOpcodeToISD(Opcode);
538
539	// Because we don't have any legal vector operations, but the legal types, we
540	// need to account for split vectors.
541	unsigned NElts = LT.second.isVector() ?
542	LT.second.getVectorNumElements() : `1`;
543
544	MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
545
546	switch (ISD) {
547	case ISD::SHL:
548	case ISD::SRL:
549	case ISD::SRA:
550	if (SLT == MVT::i64)
551	return get64BitInstrCost(CostKind) * LT.first * NElts;
552
553	if (ST->has16BitInsts() && SLT == MVT::i16)
554	NElts = (NElts + `1`) / `2`;
555
556	// i32
557	return getFullRateInstrCost() * LT.first * NElts;
558	case ISD::ADD:
559	case ISD::SUB:
560	case ISD::AND:
561	case ISD::OR:
562	case ISD::XOR:
563	if (SLT == MVT::i64) {
564	// and, or and xor are typically split into 2 VALU instructions.
565	return `2` * getFullRateInstrCost() * LT.first * NElts;
566	}
567
568	if (ST->has16BitInsts() && SLT == MVT::i16)
569	NElts = (NElts + `1`) / `2`;
570
571	return LT.first * NElts * getFullRateInstrCost();
572	case ISD::MUL: {
573	const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
574	if (SLT == MVT::i64) {
575	const int FullRateCost = getFullRateInstrCost();
576	return (`4` * QuarterRateCost + (`2` * `2`) * FullRateCost) * LT.first * NElts;
577	}
578
579	if (ST->has16BitInsts() && SLT == MVT::i16)
580	NElts = (NElts + `1`) / `2`;
581
582	// i32
583	return QuarterRateCost * NElts * LT.first;
584	}
585	case ISD::FMUL:
586	// Check possible fuse {fadd\|fsub}(a,fmul(b,c)) and return zero cost for
587	// fmul(b,c) supposing the fadd\|fsub will get estimated cost for the whole
588	// fused operation.
589	if (CxtI && CxtI->hasOneUse())
590	if (const auto FAdd = dyn_cast<BinaryOperator>(Val: CxtI->user_begin())) {
591	const int OPC = TLI->InstructionOpcodeToISD(Opcode: FAdd->getOpcode());
592	if (OPC == ISD::FADD \|\| OPC == ISD::FSUB) {
593	if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
594	return TargetTransformInfo::TCC_Free;
595	if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
596	return TargetTransformInfo::TCC_Free;
597
598	// Estimate all types may be fused with contract/unsafe flags
599	const TargetOptions &Options = TLI->getTargetMachine().Options;
600	if (Options.AllowFPOpFusion == FPOpFusion::Fast \|\|
601	Options.UnsafeFPMath \|\|
602	(FAdd->hasAllowContract() && CxtI->hasAllowContract()))
603	return TargetTransformInfo::TCC_Free;
604	}
605	}
606	[[fallthrough]];
607	case ISD::FADD:
608	case ISD::FSUB:
609	if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
610	NElts = (NElts + `1`) / `2`;
611	if (SLT == MVT::f64)
612	return LT.first * NElts * get64BitInstrCost(CostKind);
613
614	if (ST->has16BitInsts() && SLT == MVT::f16)
615	NElts = (NElts + `1`) / `2`;
616
617	if (SLT == MVT::f32 \|\| SLT == MVT::f16)
618	return LT.first * NElts * getFullRateInstrCost();
619	break;
620	case ISD::FDIV:
621	case ISD::FREM:
622	// FIXME: frem should be handled separately. The fdiv in it is most of it,
623	// but the current lowering is also not entirely correct.
624	if (SLT == MVT::f64) {
625	int Cost = `7` * get64BitInstrCost(CostKind) +
626	getQuarterRateInstrCost(CostKind) +
627	`3` * getHalfRateInstrCost(CostKind);
628	// Add cost of workaround.
629	if (!ST->hasUsableDivScaleConditionOutput())
630	Cost += `3` * getFullRateInstrCost();
631
632	return LT.first * Cost * NElts;
633	}
634
635	if (!Args.empty() && match(V: Args [`0`], P: PatternMatch::m_FPOne())) {
636	// TODO: This is more complicated, unsafe flags etc.
637	if ((SLT == MVT::f32 && !HasFP32Denormals) \|\|
638	(SLT == MVT::f16 && ST->has16BitInsts())) {
639	return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
640	}
641	}
642
643	if (SLT == MVT::f16 && ST->has16BitInsts()) {
644	// 2 x v_cvt_f32_f16
645	// f32 rcp
646	// f32 fmul
647	// v_cvt_f16_f32
648	// f16 div_fixup
649	int Cost =
650	`4` * getFullRateInstrCost() + `2` * getQuarterRateInstrCost(CostKind);
651	return LT.first * Cost * NElts;
652	}
653
654	if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) \|\|
655	TLI->getTargetMachine().Options.UnsafeFPMath)) {
656	// Fast unsafe fdiv lowering:
657	// f32 rcp
658	// f32 fmul
659	int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
660	return LT.first * Cost * NElts;
661	}
662
663	if (SLT == MVT::f32 \|\| SLT == MVT::f16) {
664	// 4 more v_cvt_ insts without f16 insts support*
665	int Cost = (SLT == MVT::f16 ? `14` : `10`) * getFullRateInstrCost() +
666	`1` * getQuarterRateInstrCost(CostKind);
667
668	if (!HasFP32Denormals) {
669	// FP mode switches.
670	Cost += `2` * getFullRateInstrCost();
671	}
672
673	return LT.first * NElts * Cost;
674	}
675	break;
676	case ISD::FNEG:
677	// Use the backend' estimation. If fneg is not free each element will cost
678	// one additional instruction.
679	return TLI->isFNegFree(VT: SLT) ? `0` : NElts;
680	default:
681	break;
682	}
683
684	return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
685	Args, CxtI);
686	}
687
688	// Return true if there's a potential benefit from using v2f16/v2i16
689	// instructions for an intrinsic, even if it requires nontrivial legalization.
690	static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
691	switch (ID) {
692	case Intrinsic::fma: // TODO: fmuladd
693	// There's a small benefit to using vector ops in the legalized code.
694	case Intrinsic::round:
695	case Intrinsic::uadd_sat:
696	case Intrinsic::usub_sat:
697	case Intrinsic::sadd_sat:
698	case Intrinsic::ssub_sat:
699	return true;
700	default:
701	return false;
702	}
703	}
704
705	InstructionCost
706	GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
707	TTI::TargetCostKind CostKind) {
708	if (ICA.getID() == Intrinsic::fabs)
709	return `0`;
710
711	if (!intrinsicHasPackedVectorBenefit(ID: ICA.getID()))
712	return BaseT::getIntrinsicInstrCost(ICA, CostKind);
713
714	Type *RetTy = ICA.getReturnType();
715
716	// Legalize the type.
717	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
718
719	unsigned NElts = LT.second.isVector() ?
720	LT.second.getVectorNumElements() : `1`;
721
722	MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
723
724	if (SLT == MVT::f64)
725	return LT.first * NElts * get64BitInstrCost(CostKind);
726
727	if ((ST->has16BitInsts() && SLT == MVT::f16) \|\|
728	(ST->hasPackedFP32Ops() && SLT == MVT::f32))
729	NElts = (NElts + `1`) / `2`;
730
731	// TODO: Get more refined intrinsic costs?
732	unsigned InstRate = getQuarterRateInstrCost(CostKind);
733
734	switch (ICA.getID()) {
735	case Intrinsic::fma:
736	InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
737	: getQuarterRateInstrCost(CostKind);
738	break;
739	case Intrinsic::uadd_sat:
740	case Intrinsic::usub_sat:
741	case Intrinsic::sadd_sat:
742	case Intrinsic::ssub_sat:
743	static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
744	if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
745	NElts = `1`;
746	break;
747	}
748
749	return LT.first * NElts * InstRate;
750	}
751
752	InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
753	TTI::TargetCostKind CostKind,
754	const Instruction *I) {
755	assert((I == nullptr \|\| I->getOpcode() == Opcode) &&
756	"Opcode should reflect passed instruction.");
757	const bool SCost =
758	(CostKind == TTI::TCK_CodeSize \|\| CostKind == TTI::TCK_SizeAndLatency);
759	const int CBrCost = SCost ? `5` : `7`;
760	switch (Opcode) {
761	case Instruction::Br: {
762	// Branch instruction takes about 4 slots on gfx900.
763	auto BI = dyn_cast_or_null<BranchInst>(Val: I);
764	if (BI && BI->isUnconditional())
765	return SCost ? `1` : `4`;
766	// Suppose conditional branch takes additional 3 exec manipulations
767	// instructions in average.
768	return CBrCost;
769	}
770	case Instruction::Switch: {
771	auto SI = dyn_cast_or_null<SwitchInst>(Val: I);
772	// Each case (including default) takes 1 cmp + 1 cbr instructions in
773	// average.
774	return (SI ? (SI->getNumCases() + `1`) : `4`) * (CBrCost + `1`);
775	}
776	case Instruction::Ret:
777	return SCost ? `1` : `10`;
778	}
779	return BaseT::getCFInstrCost(Opcode, CostKind, I);
780	}
781
782	InstructionCost
783	GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
784	std::optional<FastMathFlags> FMF,
785	TTI::TargetCostKind CostKind) {
786	if (TTI::requiresOrderedReduction(FMF))
787	return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
788
789	EVT OrigTy = TLI->getValueType(DL, Ty);
790
791	// Computes cost on targets that have packed math instructions(which support
792	// 16-bit types only).
793	if (!ST->hasVOP3PInsts() \|\| OrigTy.getScalarSizeInBits() != `16`)
794	return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
795
796	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
797	return LT.first * getFullRateInstrCost();
798	}
799
800	InstructionCost
801	GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
802	FastMathFlags FMF,
803	TTI::TargetCostKind CostKind) {
804	EVT OrigTy = TLI->getValueType(DL, Ty);
805
806	// Computes cost on targets that have packed math instructions(which support
807	// 16-bit types only).
808	if (!ST->hasVOP3PInsts() \|\| OrigTy.getScalarSizeInBits() != `16`)
809	return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
810
811	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
812	return LT.first * getHalfRateInstrCost(CostKind);
813	}
814
815	InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
816	TTI::TargetCostKind CostKind,
817	unsigned Index, Value *Op0,
818	Value *Op1) {
819	switch (Opcode) {
820	case Instruction::ExtractElement:
821	case Instruction::InsertElement: {
822	unsigned EltSize
823	= DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: ValTy)->getElementType());
824	if (EltSize < `32`) {
825	if (EltSize == `16` && Index == `0` && ST->has16BitInsts())
826	return `0`;
827	return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0,
828	Op1);
829	}
830
831	// Extracts are just reads of a subregister, so are free. Inserts are
832	// considered free because we don't want to have any cost for scalarizing
833	// operations, and we don't have to copy into a different register class.
834
835	// Dynamic indexing isn't free and is best avoided.
836	return Index == ~`0u` ? `2` : `0`;
837	}
838	default:
839	return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1);
840	}
841	}
842
843	/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
844	/// this is analyzing the collective result of all output registers. Otherwise,
845	/// this is only querying a specific result index if this returns multiple
846	/// registers in a struct.
847	bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
848	const CallInst CI, ArrayRef<unsigned> Indices) const* {
849	// TODO: Handle complex extract indices
850	if (Indices.size() > `1`)
851	return true;
852
853	const DataLayout &DL = CI->getModule()->getDataLayout();
854	const SIRegisterInfo *TRI = ST->getRegisterInfo();
855	TargetLowering::AsmOperandInfoVector TargetConstraints =
856	TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
857
858	const int TargetOutputIdx = Indices.empty() ? -`1` : Indices [`0`];
859
860	int OutputIdx = `0`;
861	for (auto &TC : TargetConstraints) {
862	if (TC.Type != InlineAsm::isOutput)
863	continue;
864
865	// Skip outputs we don't care about.
866	if (TargetOutputIdx != -`1` && TargetOutputIdx != OutputIdx++)
867	continue;
868
869	TLI->ComputeConstraintToUse(TC, SDValue());
870
871	const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
872	TRI, TC.ConstraintCode, TC.ConstraintVT).second;
873
874	// For AGPR constraints null is returned on subtargets without AGPRs, so
875	// assume divergent for null.
876	if (!RC \|\| !TRI->isSGPRClass(RC))
877	return true;
878	}
879
880	return false;
881	}
882
883	bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
884	const IntrinsicInst ReadReg) const* {
885	Metadata *MD =
886	cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: `0`))->getMetadata();
887	StringRef RegName =
888	cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: `0`))->getString();
889
890	// Special case registers that look like VCC.
891	MVT VT = MVT::getVT(Ty: ReadReg->getType());
892	if (VT == MVT::i1)
893	return true;
894
895	// Special case scalar registers that start with 'v'.
896	if (RegName.starts_with(Prefix: "vcc") \|\| RegName.empty())
897	return false;
898
899	// VGPR or AGPR is divergent. There aren't any specially named vector
900	// registers.
901	return RegName [`0`] == `'v'` \|\| RegName [`0`] == `'a'`;
902	}
903
904	/// \returns true if the result of the value could potentially be
905	/// different across workitems in a wavefront.
906	bool GCNTTIImpl::isSourceOfDivergence(const Value V) const* {
907	if (const Argument *A = dyn_cast<Argument>(Val: V))
908	return !AMDGPU::isArgPassedInSGPR(Arg: A);
909
910	// Loads from the private and flat address spaces are divergent, because
911	// threads can execute the load instruction with the same inputs and get
912	// different results.
913	//
914	// All other loads are not divergent, because if threads issue loads with the
915	// same arguments, they will always get the same result.
916	if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V))
917	return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS \|\|
918	Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
919
920	// Atomics are divergent because they are executed sequentially: when an
921	// atomic operation refers to the same address in each thread, then each
922	// thread after the first sees the value written by the previous thread as
923	// original value.
924	if (isa<AtomicRMWInst>(Val: V) \|\| isa<AtomicCmpXchgInst>(Val: V))
925	return true;
926
927	if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) {
928	if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
929	return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic);
930
931	return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: Intrinsic->getIntrinsicID());
932	}
933
934	// Assume all function calls are a source of divergence.
935	if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
936	if (CI->isInlineAsm())
937	return isInlineAsmSourceOfDivergence(CI);
938	return true;
939	}
940
941	// Assume all function calls are a source of divergence.
942	if (isa<InvokeInst>(Val: V))
943	return true;
944
945	return false;
946	}
947
948	bool GCNTTIImpl::isAlwaysUniform(const Value V) const* {
949	if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V))
950	return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID());
951
952	if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
953	if (CI->isInlineAsm())
954	return !isInlineAsmSourceOfDivergence(CI);
955	return false;
956	}
957
958	// In most cases TID / wavefrontsize is uniform.
959	//
960	// However, if a kernel has uneven dimesions we can have a value of
961	// workitem-id-x divided by the wavefrontsize non-uniform. For example
962	// dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
963	// packed into a same wave which gives 1 and 0 after the division by 64
964	// respectively.
965	//
966	// FIXME: limit it to 1D kernels only, although that shall be possible
967	// to perform this optimization is the size of the X dimension is a power
968	// of 2, we just do not currently have infrastructure to query it.
969	using namespace llvm::PatternMatch;
970	uint64_t C;
971	if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
972	m_ConstantInt(C))) \|\|
973	match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
974	m_ConstantInt(C)))) {
975	const Function *F = cast<Instruction>(Val: V)->getFunction();
976	return C >= ST->getWavefrontSizeLog2() &&
977	ST->getMaxWorkitemID(F, `1`) == `0` && ST->getMaxWorkitemID(F, `2`) == `0`;
978	}
979
980	Value *Mask;
981	if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
982	m_Value(Mask)))) {
983	const Function *F = cast<Instruction>(Val: V)->getFunction();
984	const DataLayout &DL = F->getParent()->getDataLayout();
985	return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >=
986	ST->getWavefrontSizeLog2() &&
987	ST->getMaxWorkitemID(F, `1`) == `0` && ST->getMaxWorkitemID(F, `2`) == `0`;
988	}
989
990	const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V);
991	if (!ExtValue)
992	return false;
993
994	const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: `0`));
995	if (!CI)
996	return false;
997
998	if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) {
999	switch (Intrinsic->getIntrinsicID()) {
1000	default:
1001	return false;
1002	case Intrinsic::amdgcn_if:
1003	case Intrinsic::amdgcn_else: {
1004	ArrayRef<unsigned> Indices = ExtValue->getIndices();
1005	return Indices.size() == `1` && Indices [`0`] == `1`;
1006	}
1007	}
1008	}
1009
1010	// If we have inline asm returning mixed SGPR and VGPR results, we inferred
1011	// divergent for the overall struct return. We need to override it in the
1012	// case we're extracting an SGPR component here.
1013	if (CI->isInlineAsm())
1014	return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices());
1015
1016	return false;
1017	}
1018
1019	bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1020	Intrinsic::ID IID) const {
1021	switch (IID) {
1022	case Intrinsic::amdgcn_ds_fadd:
1023	case Intrinsic::amdgcn_ds_fmin:
1024	case Intrinsic::amdgcn_ds_fmax:
1025	case Intrinsic::amdgcn_is_shared:
1026	case Intrinsic::amdgcn_is_private:
1027	case Intrinsic::amdgcn_flat_atomic_fadd:
1028	case Intrinsic::amdgcn_flat_atomic_fmax:
1029	case Intrinsic::amdgcn_flat_atomic_fmin:
1030	case Intrinsic::amdgcn_flat_atomic_fmax_num:
1031	case Intrinsic::amdgcn_flat_atomic_fmin_num:
1032	OpIndexes.push_back(Elt: `0`);
1033	return true;
1034	default:
1035	return false;
1036	}
1037	}
1038
1039	Value GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst II,
1040	Value *OldV,
1041	Value NewV) const* {
1042	auto IntrID = II->getIntrinsicID();
1043	switch (IntrID) {
1044	case Intrinsic::amdgcn_ds_fadd:
1045	case Intrinsic::amdgcn_ds_fmin:
1046	case Intrinsic::amdgcn_ds_fmax: {
1047	const ConstantInt *IsVolatile = cast<ConstantInt>(Val: II->getArgOperand(i: `4`));
1048	if (!IsVolatile->isZero())
1049	return nullptr;
1050	Module *M = II->getParent()->getParent()->getParent();
1051	Type *DestTy = II->getType();
1052	Type *SrcTy = NewV->getType();
1053	Function *NewDecl =
1054	Intrinsic::getDeclaration(M, id: II->getIntrinsicID(), Tys: {DestTy, SrcTy});
1055	II->setArgOperand(i: `0`, v: NewV);
1056	II->setCalledFunction(NewDecl);
1057	return II;
1058	}
1059	case Intrinsic::amdgcn_is_shared:
1060	case Intrinsic::amdgcn_is_private: {
1061	unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1062	AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1063	unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1064	LLVMContext &Ctx = NewV->getType()->getContext();
1065	ConstantInt *NewVal = (TrueAS == NewAS) ?
1066	ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx);
1067	return NewVal;
1068	}
1069	case Intrinsic::ptrmask: {
1070	unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1071	unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1072	Value *MaskOp = II->getArgOperand(i: `1`);
1073	Type *MaskTy = MaskOp->getType();
1074
1075	bool DoTruncate = false;
1076
1077	const GCNTargetMachine &TM =
1078	static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1079	if (!TM.isNoopAddrSpaceCast(SrcAS: OldAS, DestAS: NewAS)) {
1080	// All valid 64-bit to 32-bit casts work by chopping off the high
1081	// bits. Any masking only clearing the low bits will also apply in the new
1082	// address space.
1083	if (DL.getPointerSizeInBits(AS: OldAS) != `64` \|\|
1084	DL.getPointerSizeInBits(AS: NewAS) != `32`)
1085	return nullptr;
1086
1087	// TODO: Do we need to thread more context in here?
1088	KnownBits Known = computeKnownBits(V: MaskOp, DL, Depth: `0`, AC: nullptr, CxtI: II);
1089	if (Known.countMinLeadingOnes() < `32`)
1090	return nullptr;
1091
1092	DoTruncate = true;
1093	}
1094
1095	IRBuilder<> B(II);
1096	if (DoTruncate) {
1097	MaskTy = B.getInt32Ty();
1098	MaskOp = B.CreateTrunc(V: MaskOp, DestTy: MaskTy);
1099	}
1100
1101	return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1102	{NewV, MaskOp});
1103	}
1104	case Intrinsic::amdgcn_flat_atomic_fadd:
1105	case Intrinsic::amdgcn_flat_atomic_fmax:
1106	case Intrinsic::amdgcn_flat_atomic_fmin:
1107	case Intrinsic::amdgcn_flat_atomic_fmax_num:
1108	case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1109	Type *DestTy = II->getType();
1110	Type *SrcTy = NewV->getType();
1111	unsigned NewAS = SrcTy->getPointerAddressSpace();
1112	if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS))
1113	return nullptr;
1114	Module *M = II->getModule();
1115	Function *NewDecl = Intrinsic::getDeclaration(M, id: II->getIntrinsicID(),
1116	Tys: {DestTy, SrcTy, DestTy});
1117	II->setArgOperand(i: `0`, v: NewV);
1118	II->setCalledFunction(NewDecl);
1119	return II;
1120	}
1121	default:
1122	return nullptr;
1123	}
1124	}
1125
1126	InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1127	VectorType VT, ArrayRef<int*> Mask,
1128	TTI::TargetCostKind CostKind,
1129	int Index, VectorType *SubTp,
1130	ArrayRef<const Value *> Args,
1131	const Instruction *CxtI) {
1132	Kind = improveShuffleKindFromMask(Kind, Mask, Ty: VT, Index, SubTy&: SubTp);
1133	// Treat extractsubvector as single op permutation.
1134	bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1135	if (IsExtractSubvector)
1136	Kind = TTI::SK_PermuteSingleSrc;
1137
1138	if (ST->hasVOP3PInsts()) {
1139	if (cast<FixedVectorType>(Val: VT)->getNumElements() == `2` &&
1140	DL.getTypeSizeInBits(Ty: VT->getElementType()) == `16`) {
1141	// With op_sel VOP3P instructions freely can access the low half or high
1142	// half of a register, so any swizzle is free.
1143
1144	switch (Kind) {
1145	case TTI::SK_Broadcast:
1146	case TTI::SK_Reverse:
1147	case TTI::SK_PermuteSingleSrc:
1148	return `0`;
1149	default:
1150	break;
1151	}
1152	}
1153	}
1154	// Restore optimal kind.
1155	if (IsExtractSubvector)
1156	Kind = TTI::SK_ExtractSubvector;
1157
1158	return BaseT::getShuffleCost(Kind, Tp: VT, Mask, CostKind, Index, SubTp);
1159	}
1160
1161	bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1162	const Function Callee) const* {
1163	const TargetMachine &TM = getTLI()->getTargetMachine();
1164	const GCNSubtarget *CallerST
1165	= static_cast<const GCNSubtarget >(TM.getSubtargetImpl(Caller));
1166	const GCNSubtarget *CalleeST
1167	= static_cast<const GCNSubtarget >(TM.getSubtargetImpl(Callee));
1168
1169	const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1170	const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1171
1172	FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1173	FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1174	if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1175	return false;
1176
1177	// FIXME: dx10_clamp can just take the caller setting, but there seems to be
1178	// no way to support merge for backend defined attributes.
1179	SIModeRegisterDefaults CallerMode(Caller, CallerST);
1180	SIModeRegisterDefaults CalleeMode(Callee, CalleeST);
1181	if (!CallerMode.isInlineCompatible(CalleeMode))
1182	return false;
1183
1184	if (Callee->hasFnAttribute(Attribute::AlwaysInline) \|\|
1185	Callee->hasFnAttribute(Attribute::InlineHint))
1186	return true;
1187
1188	// Hack to make compile times reasonable.
1189	if (InlineMaxBB) {
1190	// Single BB does not increase total BB amount.
1191	if (Callee->size() == `1`)
1192	return true;
1193	size_t BBSize = Caller->size() + Callee->size() - `1`;
1194	return BBSize <= InlineMaxBB;
1195	}
1196
1197	return true;
1198	}
1199
1200	static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1201	const SITargetLowering *TLI,
1202	const GCNTTIImpl *TTIImpl) {
1203	const int NrOfSGPRUntilSpill = `26`;
1204	const int NrOfVGPRUntilSpill = `32`;
1205
1206	const DataLayout &DL = TTIImpl->getDataLayout();
1207
1208	unsigned adjustThreshold = `0`;
1209	int SGPRsInUse = `0`;
1210	int VGPRsInUse = `0`;
1211	for (const Use &A : CB->args()) {
1212	SmallVector<EVT, `4`> ValueVTs;
1213	ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs);
1214	for (auto ArgVT : ValueVTs) {
1215	unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1216	Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT);
1217	if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A)))
1218	SGPRsInUse += CCRegNum;
1219	else
1220	VGPRsInUse += CCRegNum;
1221	}
1222	}
1223
1224	// The cost of passing function arguments through the stack:
1225	// 1 instruction to put a function argument on the stack in the caller.
1226	// 1 instruction to take a function argument from the stack in callee.
1227	// 1 instruction is explicitly take care of data dependencies in callee
1228	// function.
1229	InstructionCost ArgStackCost(`1`);
1230	ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1231	Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align (`4`),
1232	AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1233	ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1234	Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align (`4`),
1235	AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency);
1236
1237	// The penalty cost is computed relative to the cost of instructions and does
1238	// not model any storage costs.
1239	adjustThreshold += std::max(a: `0`, b: SGPRsInUse - NrOfSGPRUntilSpill) *
1240	ArgStackCost.getValue() InlineConstants::getInstrCost();
1241	adjustThreshold += std::max(a: `0`, b: VGPRsInUse - NrOfVGPRUntilSpill) *
1242	ArgStackCost.getValue() InlineConstants::getInstrCost();
1243	return adjustThreshold;
1244	}
1245
1246	static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1247	const DataLayout &DL) {
1248	// If we have a pointer to a private array passed into a function
1249	// it will not be optimized out, leaving scratch usage.
1250	// This function calculates the total size in bytes of the memory that would
1251	// end in scratch if the call was not inlined.
1252	unsigned AllocaSize = `0`;
1253	SmallPtrSet<const AllocaInst *, `8`> AIVisited;
1254	for (Value *PtrArg : CB->args()) {
1255	PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType());
1256	if (!Ty)
1257	continue;
1258
1259	unsigned AddrSpace = Ty->getAddressSpace();
1260	if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1261	AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1262	continue;
1263
1264	const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg));
1265	if (!AI \|\| !AI->isStaticAlloca() \|\| !AIVisited.insert(Ptr: AI).second)
1266	continue;
1267
1268	AllocaSize += DL.getTypeAllocSize(Ty: AI->getAllocatedType());
1269	}
1270	return AllocaSize;
1271	}
1272
1273	unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase CB) const* {
1274	unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this);
1275
1276	// Private object passed as arguments may end up in scratch usage if the call
1277	// is not inlined. Increase the inline threshold to promote inlining.
1278	unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1279	if (AllocaSize > `0`)
1280	Threshold += ArgAllocaCost;
1281	return Threshold;
1282	}
1283
1284	unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1285	const AllocaInst AI) const* {
1286
1287	// Below the cutoff, assume that the private memory objects would be
1288	// optimized
1289	auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1290	if (AllocaSize <= ArgAllocaCutoff)
1291	return `0`;
1292
1293	// Above the cutoff, we give a cost to each private memory object
1294	// depending its size. If the array can be optimized by SROA this cost is not
1295	// added to the total-cost in the inliner cost analysis.
1296	//
1297	// We choose the total cost of the alloca such that their sum cancels the
1298	// bonus given in the threshold (ArgAllocaCost).
1299	//
1300	// Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1301	//
1302	// Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1303	// the single-bb bonus and the vector-bonus.
1304	//
1305	// We compensate the first two multipliers, by repeating logic from the
1306	// inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1307	static_assert(InlinerVectorBonusPercent == `0`, "vector bonus assumed to be 0");
1308	unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1309
1310	bool SingleBB = none_of(Range&: CB->getCalledFunction(), P: [](const* BasicBlock &BB) {
1311	return BB.getTerminator()->getNumSuccessors() > `1`;
1312	});
1313	if (SingleBB) {
1314	Threshold += Threshold / `2`;
1315	}
1316
1317	auto ArgAllocaSize = DL.getTypeAllocSize(Ty: AI->getAllocatedType());
1318
1319	// Attribute the bonus proportionally to the alloca size
1320	unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
1321
1322	return AllocaThresholdBonus;
1323	}
1324
1325	void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1326	TTI::UnrollingPreferences &UP,
1327	OptimizationRemarkEmitter *ORE) {
1328	CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1329	}
1330
1331	void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1332	TTI::PeelingPreferences &PP) {
1333	CommonTTI.getPeelingPreferences(L, SE, PP);
1334	}
1335
1336	int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1337	return ST->hasFullRate64Ops()
1338	? getFullRateInstrCost()
1339	: ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1340	: getQuarterRateInstrCost(CostKind);
1341	}
1342
1343	std::pair<InstructionCost, MVT>
1344	GCNTTIImpl::getTypeLegalizationCost(Type Ty) const* {
1345	std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1346	auto Size = DL.getTypeSizeInBits(Ty);
1347	// Maximum load or store can handle 8 dwords for scalar and 4 for
1348	// vector ALU. Let's assume anything above 8 dwords is expensive
1349	// even if legal.
1350	if (Size <= `256`)
1351	return Cost;
1352
1353	Cost.first += (Size + `255`) / `256`;
1354	return Cost;
1355	}
1356
1357	unsigned GCNTTIImpl::getPrefetchDistance() const {
1358	return ST->hasPrefetch() ? `128` : `0`;
1359	}
1360
1361	bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
1362	return AMDGPU::isFlatGlobalAddrSpace(AS);
1363	}
1364

source code of llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp