1 | //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // \file |
10 | // This file implements a TargetTransformInfo analysis pass specific to the |
11 | // AMDGPU target machine. It uses the target's detailed information to provide |
12 | // more precise answers to certain TTI queries, while letting the target |
13 | // independent and default TTI implementations handle the rest. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #include "AMDGPUTargetTransformInfo.h" |
18 | #include "AMDGPUTargetMachine.h" |
19 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
20 | #include "SIModeRegisterDefaults.h" |
21 | #include "llvm/Analysis/InlineCost.h" |
22 | #include "llvm/Analysis/LoopInfo.h" |
23 | #include "llvm/Analysis/ValueTracking.h" |
24 | #include "llvm/CodeGen/Analysis.h" |
25 | #include "llvm/IR/IRBuilder.h" |
26 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
27 | #include "llvm/IR/PatternMatch.h" |
28 | #include "llvm/Support/KnownBits.h" |
29 | #include <optional> |
30 | |
31 | using namespace llvm; |
32 | |
33 | #define DEBUG_TYPE "AMDGPUtti" |
34 | |
35 | static cl::opt<unsigned> UnrollThresholdPrivate( |
36 | "amdgpu-unroll-threshold-private" , |
37 | cl::desc("Unroll threshold for AMDGPU if private memory used in a loop" ), |
38 | cl::init(Val: 2700), cl::Hidden); |
39 | |
40 | static cl::opt<unsigned> UnrollThresholdLocal( |
41 | "amdgpu-unroll-threshold-local" , |
42 | cl::desc("Unroll threshold for AMDGPU if local memory used in a loop" ), |
43 | cl::init(Val: 1000), cl::Hidden); |
44 | |
45 | static cl::opt<unsigned> UnrollThresholdIf( |
46 | "amdgpu-unroll-threshold-if" , |
47 | cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop" ), |
48 | cl::init(Val: 200), cl::Hidden); |
49 | |
50 | static cl::opt<bool> UnrollRuntimeLocal( |
51 | "amdgpu-unroll-runtime-local" , |
52 | cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop" ), |
53 | cl::init(Val: true), cl::Hidden); |
54 | |
55 | static cl::opt<unsigned> UnrollMaxBlockToAnalyze( |
56 | "amdgpu-unroll-max-block-to-analyze" , |
57 | cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU" ), |
58 | cl::init(Val: 32), cl::Hidden); |
59 | |
60 | static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost" , |
61 | cl::Hidden, cl::init(Val: 4000), |
62 | cl::desc("Cost of alloca argument" )); |
63 | |
64 | // If the amount of scratch memory to eliminate exceeds our ability to allocate |
65 | // it into registers we gain nothing by aggressively inlining functions for that |
66 | // heuristic. |
67 | static cl::opt<unsigned> |
68 | ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff" , cl::Hidden, |
69 | cl::init(Val: 256), |
70 | cl::desc("Maximum alloca size to use for inline cost" )); |
71 | |
72 | // Inliner constraint to achieve reasonable compilation time. |
73 | static cl::opt<size_t> InlineMaxBB( |
74 | "amdgpu-inline-max-bb" , cl::Hidden, cl::init(Val: 1100), |
75 | cl::desc("Maximum number of BBs allowed in a function after inlining" |
76 | " (compile time constraint)" )); |
77 | |
78 | static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, |
79 | unsigned Depth = 0) { |
80 | const Instruction *I = dyn_cast<Instruction>(Val: Cond); |
81 | if (!I) |
82 | return false; |
83 | |
84 | for (const Value *V : I->operand_values()) { |
85 | if (!L->contains(Inst: I)) |
86 | continue; |
87 | if (const PHINode *PHI = dyn_cast<PHINode>(Val: V)) { |
88 | if (llvm::none_of(Range: L->getSubLoops(), P: [PHI](const Loop* SubLoop) { |
89 | return SubLoop->contains(Inst: PHI); })) |
90 | return true; |
91 | } else if (Depth < 10 && dependsOnLocalPhi(L, Cond: V, Depth: Depth+1)) |
92 | return true; |
93 | } |
94 | return false; |
95 | } |
96 | |
97 | AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) |
98 | : BaseT(TM, F.getParent()->getDataLayout()), |
99 | TargetTriple(TM->getTargetTriple()), |
100 | ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), |
101 | TLI(ST->getTargetLowering()) {} |
102 | |
103 | void AMDGPUTTIImpl::(Loop *L, ScalarEvolution &SE, |
104 | TTI::UnrollingPreferences &UP, |
105 | OptimizationRemarkEmitter *ORE) { |
106 | const Function &F = *L->getHeader()->getParent(); |
107 | UP.Threshold = |
108 | F.getFnAttributeAsParsedInteger(Kind: "amdgpu-unroll-threshold" , Default: 300); |
109 | UP.MaxCount = std::numeric_limits<unsigned>::max(); |
110 | UP.Partial = true; |
111 | |
112 | // Conditional branch in a loop back edge needs 3 additional exec |
113 | // manipulations in average. |
114 | UP.BEInsns += 3; |
115 | |
116 | // We want to run unroll even for the loops which have been vectorized. |
117 | UP.UnrollVectorizedLoop = true; |
118 | |
119 | // TODO: Do we want runtime unrolling? |
120 | |
121 | // Maximum alloca size than can fit registers. Reserve 16 registers. |
122 | const unsigned MaxAlloca = (256 - 16) * 4; |
123 | unsigned ThresholdPrivate = UnrollThresholdPrivate; |
124 | unsigned ThresholdLocal = UnrollThresholdLocal; |
125 | |
126 | // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the |
127 | // provided threshold value as the default for Threshold |
128 | if (MDNode *LoopUnrollThreshold = |
129 | findOptionMDForLoop(TheLoop: L, Name: "amdgpu.loop.unroll.threshold" )) { |
130 | if (LoopUnrollThreshold->getNumOperands() == 2) { |
131 | ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>( |
132 | MD: LoopUnrollThreshold->getOperand(I: 1)); |
133 | if (MetaThresholdValue) { |
134 | // We will also use the supplied value for PartialThreshold for now. |
135 | // We may introduce additional metadata if it becomes necessary in the |
136 | // future. |
137 | UP.Threshold = MetaThresholdValue->getSExtValue(); |
138 | UP.PartialThreshold = UP.Threshold; |
139 | ThresholdPrivate = std::min(a: ThresholdPrivate, b: UP.Threshold); |
140 | ThresholdLocal = std::min(a: ThresholdLocal, b: UP.Threshold); |
141 | } |
142 | } |
143 | } |
144 | |
145 | unsigned MaxBoost = std::max(a: ThresholdPrivate, b: ThresholdLocal); |
146 | for (const BasicBlock *BB : L->getBlocks()) { |
147 | const DataLayout &DL = BB->getModule()->getDataLayout(); |
148 | unsigned LocalGEPsSeen = 0; |
149 | |
150 | if (llvm::any_of(Range: L->getSubLoops(), P: [BB](const Loop* SubLoop) { |
151 | return SubLoop->contains(BB); })) |
152 | continue; // Block belongs to an inner loop. |
153 | |
154 | for (const Instruction &I : *BB) { |
155 | // Unroll a loop which contains an "if" statement whose condition |
156 | // defined by a PHI belonging to the loop. This may help to eliminate |
157 | // if region and potentially even PHI itself, saving on both divergence |
158 | // and registers used for the PHI. |
159 | // Add a small bonus for each of such "if" statements. |
160 | if (const BranchInst *Br = dyn_cast<BranchInst>(Val: &I)) { |
161 | if (UP.Threshold < MaxBoost && Br->isConditional()) { |
162 | BasicBlock *Succ0 = Br->getSuccessor(i: 0); |
163 | BasicBlock *Succ1 = Br->getSuccessor(i: 1); |
164 | if ((L->contains(BB: Succ0) && L->isLoopExiting(BB: Succ0)) || |
165 | (L->contains(BB: Succ1) && L->isLoopExiting(BB: Succ1))) |
166 | continue; |
167 | if (dependsOnLocalPhi(L, Cond: Br->getCondition())) { |
168 | UP.Threshold += UnrollThresholdIf; |
169 | LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold |
170 | << " for loop:\n" |
171 | << *L << " due to " << *Br << '\n'); |
172 | if (UP.Threshold >= MaxBoost) |
173 | return; |
174 | } |
175 | } |
176 | continue; |
177 | } |
178 | |
179 | const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I); |
180 | if (!GEP) |
181 | continue; |
182 | |
183 | unsigned AS = GEP->getAddressSpace(); |
184 | unsigned Threshold = 0; |
185 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) |
186 | Threshold = ThresholdPrivate; |
187 | else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) |
188 | Threshold = ThresholdLocal; |
189 | else |
190 | continue; |
191 | |
192 | if (UP.Threshold >= Threshold) |
193 | continue; |
194 | |
195 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) { |
196 | const Value *Ptr = GEP->getPointerOperand(); |
197 | const AllocaInst *Alloca = |
198 | dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: Ptr)); |
199 | if (!Alloca || !Alloca->isStaticAlloca()) |
200 | continue; |
201 | Type *Ty = Alloca->getAllocatedType(); |
202 | unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; |
203 | if (AllocaSize > MaxAlloca) |
204 | continue; |
205 | } else if (AS == AMDGPUAS::LOCAL_ADDRESS || |
206 | AS == AMDGPUAS::REGION_ADDRESS) { |
207 | LocalGEPsSeen++; |
208 | // Inhibit unroll for local memory if we have seen addressing not to |
209 | // a variable, most likely we will be unable to combine it. |
210 | // Do not unroll too deep inner loops for local memory to give a chance |
211 | // to unroll an outer loop for a more important reason. |
212 | if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 || |
213 | (!isa<GlobalVariable>(Val: GEP->getPointerOperand()) && |
214 | !isa<Argument>(Val: GEP->getPointerOperand()))) |
215 | continue; |
216 | LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n" |
217 | << *L << " due to LDS use.\n" ); |
218 | UP.Runtime = UnrollRuntimeLocal; |
219 | } |
220 | |
221 | // Check if GEP depends on a value defined by this loop itself. |
222 | bool HasLoopDef = false; |
223 | for (const Value *Op : GEP->operands()) { |
224 | const Instruction *Inst = dyn_cast<Instruction>(Val: Op); |
225 | if (!Inst || L->isLoopInvariant(V: Op)) |
226 | continue; |
227 | |
228 | if (llvm::any_of(Range: L->getSubLoops(), P: [Inst](const Loop* SubLoop) { |
229 | return SubLoop->contains(Inst); })) |
230 | continue; |
231 | HasLoopDef = true; |
232 | break; |
233 | } |
234 | if (!HasLoopDef) |
235 | continue; |
236 | |
237 | // We want to do whatever we can to limit the number of alloca |
238 | // instructions that make it through to the code generator. allocas |
239 | // require us to use indirect addressing, which is slow and prone to |
240 | // compiler bugs. If this loop does an address calculation on an |
241 | // alloca ptr, then we want to use a higher than normal loop unroll |
242 | // threshold. This will give SROA a better chance to eliminate these |
243 | // allocas. |
244 | // |
245 | // We also want to have more unrolling for local memory to let ds |
246 | // instructions with different offsets combine. |
247 | // |
248 | // Don't use the maximum allowed value here as it will make some |
249 | // programs way too big. |
250 | UP.Threshold = Threshold; |
251 | LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold |
252 | << " for loop:\n" |
253 | << *L << " due to " << *GEP << '\n'); |
254 | if (UP.Threshold >= MaxBoost) |
255 | return; |
256 | } |
257 | |
258 | // If we got a GEP in a small BB from inner loop then increase max trip |
259 | // count to analyze for better estimation cost in unroll |
260 | if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze) |
261 | UP.MaxIterationsCountToAnalyze = 32; |
262 | } |
263 | } |
264 | |
265 | void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
266 | TTI::PeelingPreferences &PP) { |
267 | BaseT::getPeelingPreferences(L, SE, PP); |
268 | } |
269 | |
270 | int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { |
271 | return 1024; |
272 | } |
273 | |
274 | const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = { |
275 | // Codegen control options which don't matter. |
276 | AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler, |
277 | AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal, |
278 | AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess, |
279 | AMDGPU::FeatureUnalignedAccessMode, |
280 | |
281 | AMDGPU::FeatureAutoWaitcntBeforeBarrier, |
282 | |
283 | // Property of the kernel/environment which can't actually differ. |
284 | AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK, |
285 | AMDGPU::FeatureTrapHandler, |
286 | |
287 | // The default assumption needs to be ecc is enabled, but no directly |
288 | // exposed operations depend on it, so it can be safely inlined. |
289 | AMDGPU::FeatureSRAMECC, |
290 | |
291 | // Perf-tuning features |
292 | AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops}; |
293 | |
294 | GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) |
295 | : BaseT(TM, F.getParent()->getDataLayout()), |
296 | ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), |
297 | TLI(ST->getTargetLowering()), CommonTTI(TM, F), |
298 | IsGraphics(AMDGPU::isGraphics(CC: F.getCallingConv())) { |
299 | SIModeRegisterDefaults Mode(F, *ST); |
300 | HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign(); |
301 | HasFP64FP16Denormals = |
302 | Mode.FP64FP16Denormals != DenormalMode::getPreserveSign(); |
303 | } |
304 | |
305 | bool GCNTTIImpl::hasBranchDivergence(const Function *F) const { |
306 | return !F || !ST->isSingleLaneExecution(*F); |
307 | } |
308 | |
309 | unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { |
310 | // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector |
311 | // registers. See getRegisterClassForType for the implementation. |
312 | // In this case vector registers are not vector in terms of |
313 | // VGPRs, but those which can hold multiple values. |
314 | |
315 | // This is really the number of registers to fill when vectorizing / |
316 | // interleaving loops, so we lie to avoid trying to use all registers. |
317 | return 4; |
318 | } |
319 | |
320 | TypeSize |
321 | GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
322 | switch (K) { |
323 | case TargetTransformInfo::RGK_Scalar: |
324 | return TypeSize::getFixed(ExactSize: 32); |
325 | case TargetTransformInfo::RGK_FixedWidthVector: |
326 | return TypeSize::getFixed(ExactSize: ST->hasPackedFP32Ops() ? 64 : 32); |
327 | case TargetTransformInfo::RGK_ScalableVector: |
328 | return TypeSize::getScalable(MinimumSize: 0); |
329 | } |
330 | llvm_unreachable("Unsupported register kind" ); |
331 | } |
332 | |
333 | unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { |
334 | return 32; |
335 | } |
336 | |
337 | unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { |
338 | if (Opcode == Instruction::Load || Opcode == Instruction::Store) |
339 | return 32 * 4 / ElemWidth; |
340 | return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 |
341 | : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2 |
342 | : 1; |
343 | } |
344 | |
345 | unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, |
346 | unsigned ChainSizeInBytes, |
347 | VectorType *VecTy) const { |
348 | unsigned VecRegBitWidth = VF * LoadSize; |
349 | if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32) |
350 | // TODO: Support element-size less than 32bit? |
351 | return 128 / LoadSize; |
352 | |
353 | return VF; |
354 | } |
355 | |
356 | unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize, |
357 | unsigned ChainSizeInBytes, |
358 | VectorType *VecTy) const { |
359 | unsigned VecRegBitWidth = VF * StoreSize; |
360 | if (VecRegBitWidth > 128) |
361 | return 128 / StoreSize; |
362 | |
363 | return VF; |
364 | } |
365 | |
366 | unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { |
367 | if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || |
368 | AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || |
369 | AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT || |
370 | AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER || |
371 | AddrSpace == AMDGPUAS::BUFFER_RESOURCE || |
372 | AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) { |
373 | return 512; |
374 | } |
375 | |
376 | if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) |
377 | return 8 * ST->getMaxPrivateElementSize(); |
378 | |
379 | // Common to flat, global, local and region. Assume for unknown addrspace. |
380 | return 128; |
381 | } |
382 | |
383 | bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, |
384 | Align Alignment, |
385 | unsigned AddrSpace) const { |
386 | // We allow vectorization of flat stores, even though we may need to decompose |
387 | // them later if they may access private memory. We don't have enough context |
388 | // here, and legalization can handle it. |
389 | if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { |
390 | return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && |
391 | ChainSizeInBytes <= ST->getMaxPrivateElementSize(); |
392 | } |
393 | return true; |
394 | } |
395 | |
396 | bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, |
397 | Align Alignment, |
398 | unsigned AddrSpace) const { |
399 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
400 | } |
401 | |
402 | bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, |
403 | Align Alignment, |
404 | unsigned AddrSpace) const { |
405 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
406 | } |
407 | |
408 | int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { |
409 | return 1024; |
410 | } |
411 | |
412 | // FIXME: Really we would like to issue multiple 128-bit loads and stores per |
413 | // iteration. Should we report a larger size and let it legalize? |
414 | // |
415 | // FIXME: Should we use narrower types for local/region, or account for when |
416 | // unaligned access is legal? |
417 | // |
418 | // FIXME: This could use fine tuning and microbenchmarks. |
419 | Type *GCNTTIImpl::getMemcpyLoopLoweringType( |
420 | LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, |
421 | unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, |
422 | std::optional<uint32_t> AtomicElementSize) const { |
423 | |
424 | if (AtomicElementSize) |
425 | return Type::getIntNTy(C&: Context, N: *AtomicElementSize * 8); |
426 | |
427 | unsigned MinAlign = std::min(a: SrcAlign, b: DestAlign); |
428 | |
429 | // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the |
430 | // hardware into byte accesses. If you assume all alignments are equally |
431 | // probable, it's more efficient on average to use short accesses for this |
432 | // case. |
433 | if (MinAlign == 2) |
434 | return Type::getInt16Ty(C&: Context); |
435 | |
436 | // Not all subtargets have 128-bit DS instructions, and we currently don't |
437 | // form them by default. |
438 | if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
439 | SrcAddrSpace == AMDGPUAS::REGION_ADDRESS || |
440 | DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
441 | DestAddrSpace == AMDGPUAS::REGION_ADDRESS) { |
442 | return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 2); |
443 | } |
444 | |
445 | // Global memory works best with 16-byte accesses. Private memory will also |
446 | // hit this, although they'll be decomposed. |
447 | return FixedVectorType::get(ElementType: Type::getInt32Ty(C&: Context), NumElts: 4); |
448 | } |
449 | |
450 | void GCNTTIImpl::getMemcpyLoopResidualLoweringType( |
451 | SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, |
452 | unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, |
453 | unsigned SrcAlign, unsigned DestAlign, |
454 | std::optional<uint32_t> AtomicCpySize) const { |
455 | assert(RemainingBytes < 16); |
456 | |
457 | if (AtomicCpySize) |
458 | BaseT::getMemcpyLoopResidualLoweringType( |
459 | OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign, |
460 | DestAlign, AtomicCpySize); |
461 | |
462 | unsigned MinAlign = std::min(a: SrcAlign, b: DestAlign); |
463 | |
464 | if (MinAlign != 2) { |
465 | Type *I64Ty = Type::getInt64Ty(C&: Context); |
466 | while (RemainingBytes >= 8) { |
467 | OpsOut.push_back(Elt: I64Ty); |
468 | RemainingBytes -= 8; |
469 | } |
470 | |
471 | Type *I32Ty = Type::getInt32Ty(C&: Context); |
472 | while (RemainingBytes >= 4) { |
473 | OpsOut.push_back(Elt: I32Ty); |
474 | RemainingBytes -= 4; |
475 | } |
476 | } |
477 | |
478 | Type *I16Ty = Type::getInt16Ty(C&: Context); |
479 | while (RemainingBytes >= 2) { |
480 | OpsOut.push_back(Elt: I16Ty); |
481 | RemainingBytes -= 2; |
482 | } |
483 | |
484 | Type *I8Ty = Type::getInt8Ty(C&: Context); |
485 | while (RemainingBytes) { |
486 | OpsOut.push_back(Elt: I8Ty); |
487 | --RemainingBytes; |
488 | } |
489 | } |
490 | |
491 | unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) { |
492 | // Disable unrolling if the loop is not vectorized. |
493 | // TODO: Enable this again. |
494 | if (VF.isScalar()) |
495 | return 1; |
496 | |
497 | return 8; |
498 | } |
499 | |
500 | bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, |
501 | MemIntrinsicInfo &Info) const { |
502 | switch (Inst->getIntrinsicID()) { |
503 | case Intrinsic::amdgcn_ds_ordered_add: |
504 | case Intrinsic::amdgcn_ds_ordered_swap: |
505 | case Intrinsic::amdgcn_ds_fadd: |
506 | case Intrinsic::amdgcn_ds_fmin: |
507 | case Intrinsic::amdgcn_ds_fmax: { |
508 | auto *Ordering = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 2)); |
509 | auto *Volatile = dyn_cast<ConstantInt>(Val: Inst->getArgOperand(i: 4)); |
510 | if (!Ordering || !Volatile) |
511 | return false; // Invalid. |
512 | |
513 | unsigned OrderingVal = Ordering->getZExtValue(); |
514 | if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent)) |
515 | return false; |
516 | |
517 | Info.PtrVal = Inst->getArgOperand(i: 0); |
518 | Info.Ordering = static_cast<AtomicOrdering>(OrderingVal); |
519 | Info.ReadMem = true; |
520 | Info.WriteMem = true; |
521 | Info.IsVolatile = !Volatile->isZero(); |
522 | return true; |
523 | } |
524 | default: |
525 | return false; |
526 | } |
527 | } |
528 | |
529 | InstructionCost GCNTTIImpl::getArithmeticInstrCost( |
530 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
531 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
532 | ArrayRef<const Value *> Args, |
533 | const Instruction *CxtI) { |
534 | |
535 | // Legalize the type. |
536 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
537 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
538 | |
539 | // Because we don't have any legal vector operations, but the legal types, we |
540 | // need to account for split vectors. |
541 | unsigned NElts = LT.second.isVector() ? |
542 | LT.second.getVectorNumElements() : 1; |
543 | |
544 | MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; |
545 | |
546 | switch (ISD) { |
547 | case ISD::SHL: |
548 | case ISD::SRL: |
549 | case ISD::SRA: |
550 | if (SLT == MVT::i64) |
551 | return get64BitInstrCost(CostKind) * LT.first * NElts; |
552 | |
553 | if (ST->has16BitInsts() && SLT == MVT::i16) |
554 | NElts = (NElts + 1) / 2; |
555 | |
556 | // i32 |
557 | return getFullRateInstrCost() * LT.first * NElts; |
558 | case ISD::ADD: |
559 | case ISD::SUB: |
560 | case ISD::AND: |
561 | case ISD::OR: |
562 | case ISD::XOR: |
563 | if (SLT == MVT::i64) { |
564 | // and, or and xor are typically split into 2 VALU instructions. |
565 | return 2 * getFullRateInstrCost() * LT.first * NElts; |
566 | } |
567 | |
568 | if (ST->has16BitInsts() && SLT == MVT::i16) |
569 | NElts = (NElts + 1) / 2; |
570 | |
571 | return LT.first * NElts * getFullRateInstrCost(); |
572 | case ISD::MUL: { |
573 | const int QuarterRateCost = getQuarterRateInstrCost(CostKind); |
574 | if (SLT == MVT::i64) { |
575 | const int FullRateCost = getFullRateInstrCost(); |
576 | return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts; |
577 | } |
578 | |
579 | if (ST->has16BitInsts() && SLT == MVT::i16) |
580 | NElts = (NElts + 1) / 2; |
581 | |
582 | // i32 |
583 | return QuarterRateCost * NElts * LT.first; |
584 | } |
585 | case ISD::FMUL: |
586 | // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for |
587 | // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole |
588 | // fused operation. |
589 | if (CxtI && CxtI->hasOneUse()) |
590 | if (const auto *FAdd = dyn_cast<BinaryOperator>(Val: *CxtI->user_begin())) { |
591 | const int OPC = TLI->InstructionOpcodeToISD(Opcode: FAdd->getOpcode()); |
592 | if (OPC == ISD::FADD || OPC == ISD::FSUB) { |
593 | if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals) |
594 | return TargetTransformInfo::TCC_Free; |
595 | if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals) |
596 | return TargetTransformInfo::TCC_Free; |
597 | |
598 | // Estimate all types may be fused with contract/unsafe flags |
599 | const TargetOptions &Options = TLI->getTargetMachine().Options; |
600 | if (Options.AllowFPOpFusion == FPOpFusion::Fast || |
601 | Options.UnsafeFPMath || |
602 | (FAdd->hasAllowContract() && CxtI->hasAllowContract())) |
603 | return TargetTransformInfo::TCC_Free; |
604 | } |
605 | } |
606 | [[fallthrough]]; |
607 | case ISD::FADD: |
608 | case ISD::FSUB: |
609 | if (ST->hasPackedFP32Ops() && SLT == MVT::f32) |
610 | NElts = (NElts + 1) / 2; |
611 | if (SLT == MVT::f64) |
612 | return LT.first * NElts * get64BitInstrCost(CostKind); |
613 | |
614 | if (ST->has16BitInsts() && SLT == MVT::f16) |
615 | NElts = (NElts + 1) / 2; |
616 | |
617 | if (SLT == MVT::f32 || SLT == MVT::f16) |
618 | return LT.first * NElts * getFullRateInstrCost(); |
619 | break; |
620 | case ISD::FDIV: |
621 | case ISD::FREM: |
622 | // FIXME: frem should be handled separately. The fdiv in it is most of it, |
623 | // but the current lowering is also not entirely correct. |
624 | if (SLT == MVT::f64) { |
625 | int Cost = 7 * get64BitInstrCost(CostKind) + |
626 | getQuarterRateInstrCost(CostKind) + |
627 | 3 * getHalfRateInstrCost(CostKind); |
628 | // Add cost of workaround. |
629 | if (!ST->hasUsableDivScaleConditionOutput()) |
630 | Cost += 3 * getFullRateInstrCost(); |
631 | |
632 | return LT.first * Cost * NElts; |
633 | } |
634 | |
635 | if (!Args.empty() && match(V: Args[0], P: PatternMatch::m_FPOne())) { |
636 | // TODO: This is more complicated, unsafe flags etc. |
637 | if ((SLT == MVT::f32 && !HasFP32Denormals) || |
638 | (SLT == MVT::f16 && ST->has16BitInsts())) { |
639 | return LT.first * getQuarterRateInstrCost(CostKind) * NElts; |
640 | } |
641 | } |
642 | |
643 | if (SLT == MVT::f16 && ST->has16BitInsts()) { |
644 | // 2 x v_cvt_f32_f16 |
645 | // f32 rcp |
646 | // f32 fmul |
647 | // v_cvt_f16_f32 |
648 | // f16 div_fixup |
649 | int Cost = |
650 | 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind); |
651 | return LT.first * Cost * NElts; |
652 | } |
653 | |
654 | if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) || |
655 | TLI->getTargetMachine().Options.UnsafeFPMath)) { |
656 | // Fast unsafe fdiv lowering: |
657 | // f32 rcp |
658 | // f32 fmul |
659 | int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost(); |
660 | return LT.first * Cost * NElts; |
661 | } |
662 | |
663 | if (SLT == MVT::f32 || SLT == MVT::f16) { |
664 | // 4 more v_cvt_* insts without f16 insts support |
665 | int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() + |
666 | 1 * getQuarterRateInstrCost(CostKind); |
667 | |
668 | if (!HasFP32Denormals) { |
669 | // FP mode switches. |
670 | Cost += 2 * getFullRateInstrCost(); |
671 | } |
672 | |
673 | return LT.first * NElts * Cost; |
674 | } |
675 | break; |
676 | case ISD::FNEG: |
677 | // Use the backend' estimation. If fneg is not free each element will cost |
678 | // one additional instruction. |
679 | return TLI->isFNegFree(VT: SLT) ? 0 : NElts; |
680 | default: |
681 | break; |
682 | } |
683 | |
684 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
685 | Args, CxtI); |
686 | } |
687 | |
688 | // Return true if there's a potential benefit from using v2f16/v2i16 |
689 | // instructions for an intrinsic, even if it requires nontrivial legalization. |
690 | static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) { |
691 | switch (ID) { |
692 | case Intrinsic::fma: // TODO: fmuladd |
693 | // There's a small benefit to using vector ops in the legalized code. |
694 | case Intrinsic::round: |
695 | case Intrinsic::uadd_sat: |
696 | case Intrinsic::usub_sat: |
697 | case Intrinsic::sadd_sat: |
698 | case Intrinsic::ssub_sat: |
699 | return true; |
700 | default: |
701 | return false; |
702 | } |
703 | } |
704 | |
705 | InstructionCost |
706 | GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
707 | TTI::TargetCostKind CostKind) { |
708 | if (ICA.getID() == Intrinsic::fabs) |
709 | return 0; |
710 | |
711 | if (!intrinsicHasPackedVectorBenefit(ID: ICA.getID())) |
712 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
713 | |
714 | Type *RetTy = ICA.getReturnType(); |
715 | |
716 | // Legalize the type. |
717 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy); |
718 | |
719 | unsigned NElts = LT.second.isVector() ? |
720 | LT.second.getVectorNumElements() : 1; |
721 | |
722 | MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; |
723 | |
724 | if (SLT == MVT::f64) |
725 | return LT.first * NElts * get64BitInstrCost(CostKind); |
726 | |
727 | if ((ST->has16BitInsts() && SLT == MVT::f16) || |
728 | (ST->hasPackedFP32Ops() && SLT == MVT::f32)) |
729 | NElts = (NElts + 1) / 2; |
730 | |
731 | // TODO: Get more refined intrinsic costs? |
732 | unsigned InstRate = getQuarterRateInstrCost(CostKind); |
733 | |
734 | switch (ICA.getID()) { |
735 | case Intrinsic::fma: |
736 | InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind) |
737 | : getQuarterRateInstrCost(CostKind); |
738 | break; |
739 | case Intrinsic::uadd_sat: |
740 | case Intrinsic::usub_sat: |
741 | case Intrinsic::sadd_sat: |
742 | case Intrinsic::ssub_sat: |
743 | static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16}; |
744 | if (any_of(ValidSatTys, [<](MVT M) { return M == LT.second; })) |
745 | NElts = 1; |
746 | break; |
747 | } |
748 | |
749 | return LT.first * NElts * InstRate; |
750 | } |
751 | |
752 | InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode, |
753 | TTI::TargetCostKind CostKind, |
754 | const Instruction *I) { |
755 | assert((I == nullptr || I->getOpcode() == Opcode) && |
756 | "Opcode should reflect passed instruction." ); |
757 | const bool SCost = |
758 | (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency); |
759 | const int CBrCost = SCost ? 5 : 7; |
760 | switch (Opcode) { |
761 | case Instruction::Br: { |
762 | // Branch instruction takes about 4 slots on gfx900. |
763 | auto BI = dyn_cast_or_null<BranchInst>(Val: I); |
764 | if (BI && BI->isUnconditional()) |
765 | return SCost ? 1 : 4; |
766 | // Suppose conditional branch takes additional 3 exec manipulations |
767 | // instructions in average. |
768 | return CBrCost; |
769 | } |
770 | case Instruction::Switch: { |
771 | auto SI = dyn_cast_or_null<SwitchInst>(Val: I); |
772 | // Each case (including default) takes 1 cmp + 1 cbr instructions in |
773 | // average. |
774 | return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1); |
775 | } |
776 | case Instruction::Ret: |
777 | return SCost ? 1 : 10; |
778 | } |
779 | return BaseT::getCFInstrCost(Opcode, CostKind, I); |
780 | } |
781 | |
782 | InstructionCost |
783 | GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, |
784 | std::optional<FastMathFlags> FMF, |
785 | TTI::TargetCostKind CostKind) { |
786 | if (TTI::requiresOrderedReduction(FMF)) |
787 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
788 | |
789 | EVT OrigTy = TLI->getValueType(DL, Ty); |
790 | |
791 | // Computes cost on targets that have packed math instructions(which support |
792 | // 16-bit types only). |
793 | if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) |
794 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
795 | |
796 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
797 | return LT.first * getFullRateInstrCost(); |
798 | } |
799 | |
800 | InstructionCost |
801 | GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
802 | FastMathFlags FMF, |
803 | TTI::TargetCostKind CostKind) { |
804 | EVT OrigTy = TLI->getValueType(DL, Ty); |
805 | |
806 | // Computes cost on targets that have packed math instructions(which support |
807 | // 16-bit types only). |
808 | if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) |
809 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
810 | |
811 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
812 | return LT.first * getHalfRateInstrCost(CostKind); |
813 | } |
814 | |
815 | InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, |
816 | TTI::TargetCostKind CostKind, |
817 | unsigned Index, Value *Op0, |
818 | Value *Op1) { |
819 | switch (Opcode) { |
820 | case Instruction::ExtractElement: |
821 | case Instruction::InsertElement: { |
822 | unsigned EltSize |
823 | = DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: ValTy)->getElementType()); |
824 | if (EltSize < 32) { |
825 | if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) |
826 | return 0; |
827 | return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, |
828 | Op1); |
829 | } |
830 | |
831 | // Extracts are just reads of a subregister, so are free. Inserts are |
832 | // considered free because we don't want to have any cost for scalarizing |
833 | // operations, and we don't have to copy into a different register class. |
834 | |
835 | // Dynamic indexing isn't free and is best avoided. |
836 | return Index == ~0u ? 2 : 0; |
837 | } |
838 | default: |
839 | return BaseT::getVectorInstrCost(Opcode, Val: ValTy, CostKind, Index, Op0, Op1); |
840 | } |
841 | } |
842 | |
843 | /// Analyze if the results of inline asm are divergent. If \p Indices is empty, |
844 | /// this is analyzing the collective result of all output registers. Otherwise, |
845 | /// this is only querying a specific result index if this returns multiple |
846 | /// registers in a struct. |
847 | bool GCNTTIImpl::isInlineAsmSourceOfDivergence( |
848 | const CallInst *CI, ArrayRef<unsigned> Indices) const { |
849 | // TODO: Handle complex extract indices |
850 | if (Indices.size() > 1) |
851 | return true; |
852 | |
853 | const DataLayout &DL = CI->getModule()->getDataLayout(); |
854 | const SIRegisterInfo *TRI = ST->getRegisterInfo(); |
855 | TargetLowering::AsmOperandInfoVector TargetConstraints = |
856 | TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI); |
857 | |
858 | const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0]; |
859 | |
860 | int OutputIdx = 0; |
861 | for (auto &TC : TargetConstraints) { |
862 | if (TC.Type != InlineAsm::isOutput) |
863 | continue; |
864 | |
865 | // Skip outputs we don't care about. |
866 | if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++) |
867 | continue; |
868 | |
869 | TLI->ComputeConstraintToUse(TC, SDValue()); |
870 | |
871 | const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint( |
872 | TRI, TC.ConstraintCode, TC.ConstraintVT).second; |
873 | |
874 | // For AGPR constraints null is returned on subtargets without AGPRs, so |
875 | // assume divergent for null. |
876 | if (!RC || !TRI->isSGPRClass(RC)) |
877 | return true; |
878 | } |
879 | |
880 | return false; |
881 | } |
882 | |
883 | bool GCNTTIImpl::isReadRegisterSourceOfDivergence( |
884 | const IntrinsicInst *ReadReg) const { |
885 | Metadata *MD = |
886 | cast<MetadataAsValue>(Val: ReadReg->getArgOperand(i: 0))->getMetadata(); |
887 | StringRef RegName = |
888 | cast<MDString>(Val: cast<MDNode>(Val: MD)->getOperand(I: 0))->getString(); |
889 | |
890 | // Special case registers that look like VCC. |
891 | MVT VT = MVT::getVT(Ty: ReadReg->getType()); |
892 | if (VT == MVT::i1) |
893 | return true; |
894 | |
895 | // Special case scalar registers that start with 'v'. |
896 | if (RegName.starts_with(Prefix: "vcc" ) || RegName.empty()) |
897 | return false; |
898 | |
899 | // VGPR or AGPR is divergent. There aren't any specially named vector |
900 | // registers. |
901 | return RegName[0] == 'v' || RegName[0] == 'a'; |
902 | } |
903 | |
904 | /// \returns true if the result of the value could potentially be |
905 | /// different across workitems in a wavefront. |
906 | bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { |
907 | if (const Argument *A = dyn_cast<Argument>(Val: V)) |
908 | return !AMDGPU::isArgPassedInSGPR(Arg: A); |
909 | |
910 | // Loads from the private and flat address spaces are divergent, because |
911 | // threads can execute the load instruction with the same inputs and get |
912 | // different results. |
913 | // |
914 | // All other loads are not divergent, because if threads issue loads with the |
915 | // same arguments, they will always get the same result. |
916 | if (const LoadInst *Load = dyn_cast<LoadInst>(Val: V)) |
917 | return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS || |
918 | Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS; |
919 | |
920 | // Atomics are divergent because they are executed sequentially: when an |
921 | // atomic operation refers to the same address in each thread, then each |
922 | // thread after the first sees the value written by the previous thread as |
923 | // original value. |
924 | if (isa<AtomicRMWInst>(Val: V) || isa<AtomicCmpXchgInst>(Val: V)) |
925 | return true; |
926 | |
927 | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) { |
928 | if (Intrinsic->getIntrinsicID() == Intrinsic::read_register) |
929 | return isReadRegisterSourceOfDivergence(ReadReg: Intrinsic); |
930 | |
931 | return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: Intrinsic->getIntrinsicID()); |
932 | } |
933 | |
934 | // Assume all function calls are a source of divergence. |
935 | if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) { |
936 | if (CI->isInlineAsm()) |
937 | return isInlineAsmSourceOfDivergence(CI); |
938 | return true; |
939 | } |
940 | |
941 | // Assume all function calls are a source of divergence. |
942 | if (isa<InvokeInst>(Val: V)) |
943 | return true; |
944 | |
945 | return false; |
946 | } |
947 | |
948 | bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { |
949 | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: V)) |
950 | return AMDGPU::isIntrinsicAlwaysUniform(IntrID: Intrinsic->getIntrinsicID()); |
951 | |
952 | if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) { |
953 | if (CI->isInlineAsm()) |
954 | return !isInlineAsmSourceOfDivergence(CI); |
955 | return false; |
956 | } |
957 | |
958 | // In most cases TID / wavefrontsize is uniform. |
959 | // |
960 | // However, if a kernel has uneven dimesions we can have a value of |
961 | // workitem-id-x divided by the wavefrontsize non-uniform. For example |
962 | // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1) |
963 | // packed into a same wave which gives 1 and 0 after the division by 64 |
964 | // respectively. |
965 | // |
966 | // FIXME: limit it to 1D kernels only, although that shall be possible |
967 | // to perform this optimization is the size of the X dimension is a power |
968 | // of 2, we just do not currently have infrastructure to query it. |
969 | using namespace llvm::PatternMatch; |
970 | uint64_t C; |
971 | if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), |
972 | m_ConstantInt(C))) || |
973 | match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), |
974 | m_ConstantInt(C)))) { |
975 | const Function *F = cast<Instruction>(Val: V)->getFunction(); |
976 | return C >= ST->getWavefrontSizeLog2() && |
977 | ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0; |
978 | } |
979 | |
980 | Value *Mask; |
981 | if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(), |
982 | m_Value(Mask)))) { |
983 | const Function *F = cast<Instruction>(Val: V)->getFunction(); |
984 | const DataLayout &DL = F->getParent()->getDataLayout(); |
985 | return computeKnownBits(V: Mask, DL).countMinTrailingZeros() >= |
986 | ST->getWavefrontSizeLog2() && |
987 | ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0; |
988 | } |
989 | |
990 | const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(Val: V); |
991 | if (!ExtValue) |
992 | return false; |
993 | |
994 | const CallInst *CI = dyn_cast<CallInst>(Val: ExtValue->getOperand(i_nocapture: 0)); |
995 | if (!CI) |
996 | return false; |
997 | |
998 | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: CI)) { |
999 | switch (Intrinsic->getIntrinsicID()) { |
1000 | default: |
1001 | return false; |
1002 | case Intrinsic::amdgcn_if: |
1003 | case Intrinsic::amdgcn_else: { |
1004 | ArrayRef<unsigned> Indices = ExtValue->getIndices(); |
1005 | return Indices.size() == 1 && Indices[0] == 1; |
1006 | } |
1007 | } |
1008 | } |
1009 | |
1010 | // If we have inline asm returning mixed SGPR and VGPR results, we inferred |
1011 | // divergent for the overall struct return. We need to override it in the |
1012 | // case we're extracting an SGPR component here. |
1013 | if (CI->isInlineAsm()) |
1014 | return !isInlineAsmSourceOfDivergence(CI, Indices: ExtValue->getIndices()); |
1015 | |
1016 | return false; |
1017 | } |
1018 | |
1019 | bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
1020 | Intrinsic::ID IID) const { |
1021 | switch (IID) { |
1022 | case Intrinsic::amdgcn_ds_fadd: |
1023 | case Intrinsic::amdgcn_ds_fmin: |
1024 | case Intrinsic::amdgcn_ds_fmax: |
1025 | case Intrinsic::amdgcn_is_shared: |
1026 | case Intrinsic::amdgcn_is_private: |
1027 | case Intrinsic::amdgcn_flat_atomic_fadd: |
1028 | case Intrinsic::amdgcn_flat_atomic_fmax: |
1029 | case Intrinsic::amdgcn_flat_atomic_fmin: |
1030 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
1031 | case Intrinsic::amdgcn_flat_atomic_fmin_num: |
1032 | OpIndexes.push_back(Elt: 0); |
1033 | return true; |
1034 | default: |
1035 | return false; |
1036 | } |
1037 | } |
1038 | |
1039 | Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, |
1040 | Value *OldV, |
1041 | Value *NewV) const { |
1042 | auto IntrID = II->getIntrinsicID(); |
1043 | switch (IntrID) { |
1044 | case Intrinsic::amdgcn_ds_fadd: |
1045 | case Intrinsic::amdgcn_ds_fmin: |
1046 | case Intrinsic::amdgcn_ds_fmax: { |
1047 | const ConstantInt *IsVolatile = cast<ConstantInt>(Val: II->getArgOperand(i: 4)); |
1048 | if (!IsVolatile->isZero()) |
1049 | return nullptr; |
1050 | Module *M = II->getParent()->getParent()->getParent(); |
1051 | Type *DestTy = II->getType(); |
1052 | Type *SrcTy = NewV->getType(); |
1053 | Function *NewDecl = |
1054 | Intrinsic::getDeclaration(M, id: II->getIntrinsicID(), Tys: {DestTy, SrcTy}); |
1055 | II->setArgOperand(i: 0, v: NewV); |
1056 | II->setCalledFunction(NewDecl); |
1057 | return II; |
1058 | } |
1059 | case Intrinsic::amdgcn_is_shared: |
1060 | case Intrinsic::amdgcn_is_private: { |
1061 | unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ? |
1062 | AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; |
1063 | unsigned NewAS = NewV->getType()->getPointerAddressSpace(); |
1064 | LLVMContext &Ctx = NewV->getType()->getContext(); |
1065 | ConstantInt *NewVal = (TrueAS == NewAS) ? |
1066 | ConstantInt::getTrue(Context&: Ctx) : ConstantInt::getFalse(Context&: Ctx); |
1067 | return NewVal; |
1068 | } |
1069 | case Intrinsic::ptrmask: { |
1070 | unsigned OldAS = OldV->getType()->getPointerAddressSpace(); |
1071 | unsigned NewAS = NewV->getType()->getPointerAddressSpace(); |
1072 | Value *MaskOp = II->getArgOperand(i: 1); |
1073 | Type *MaskTy = MaskOp->getType(); |
1074 | |
1075 | bool DoTruncate = false; |
1076 | |
1077 | const GCNTargetMachine &TM = |
1078 | static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine()); |
1079 | if (!TM.isNoopAddrSpaceCast(SrcAS: OldAS, DestAS: NewAS)) { |
1080 | // All valid 64-bit to 32-bit casts work by chopping off the high |
1081 | // bits. Any masking only clearing the low bits will also apply in the new |
1082 | // address space. |
1083 | if (DL.getPointerSizeInBits(AS: OldAS) != 64 || |
1084 | DL.getPointerSizeInBits(AS: NewAS) != 32) |
1085 | return nullptr; |
1086 | |
1087 | // TODO: Do we need to thread more context in here? |
1088 | KnownBits Known = computeKnownBits(V: MaskOp, DL, Depth: 0, AC: nullptr, CxtI: II); |
1089 | if (Known.countMinLeadingOnes() < 32) |
1090 | return nullptr; |
1091 | |
1092 | DoTruncate = true; |
1093 | } |
1094 | |
1095 | IRBuilder<> B(II); |
1096 | if (DoTruncate) { |
1097 | MaskTy = B.getInt32Ty(); |
1098 | MaskOp = B.CreateTrunc(V: MaskOp, DestTy: MaskTy); |
1099 | } |
1100 | |
1101 | return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy}, |
1102 | {NewV, MaskOp}); |
1103 | } |
1104 | case Intrinsic::amdgcn_flat_atomic_fadd: |
1105 | case Intrinsic::amdgcn_flat_atomic_fmax: |
1106 | case Intrinsic::amdgcn_flat_atomic_fmin: |
1107 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
1108 | case Intrinsic::amdgcn_flat_atomic_fmin_num: { |
1109 | Type *DestTy = II->getType(); |
1110 | Type *SrcTy = NewV->getType(); |
1111 | unsigned NewAS = SrcTy->getPointerAddressSpace(); |
1112 | if (!AMDGPU::isExtendedGlobalAddrSpace(AS: NewAS)) |
1113 | return nullptr; |
1114 | Module *M = II->getModule(); |
1115 | Function *NewDecl = Intrinsic::getDeclaration(M, id: II->getIntrinsicID(), |
1116 | Tys: {DestTy, SrcTy, DestTy}); |
1117 | II->setArgOperand(i: 0, v: NewV); |
1118 | II->setCalledFunction(NewDecl); |
1119 | return II; |
1120 | } |
1121 | default: |
1122 | return nullptr; |
1123 | } |
1124 | } |
1125 | |
1126 | InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, |
1127 | VectorType *VT, ArrayRef<int> Mask, |
1128 | TTI::TargetCostKind CostKind, |
1129 | int Index, VectorType *SubTp, |
1130 | ArrayRef<const Value *> Args, |
1131 | const Instruction *CxtI) { |
1132 | Kind = improveShuffleKindFromMask(Kind, Mask, Ty: VT, Index, SubTy&: SubTp); |
1133 | // Treat extractsubvector as single op permutation. |
1134 | bool = Kind == TTI::SK_ExtractSubvector; |
1135 | if (IsExtractSubvector) |
1136 | Kind = TTI::SK_PermuteSingleSrc; |
1137 | |
1138 | if (ST->hasVOP3PInsts()) { |
1139 | if (cast<FixedVectorType>(Val: VT)->getNumElements() == 2 && |
1140 | DL.getTypeSizeInBits(Ty: VT->getElementType()) == 16) { |
1141 | // With op_sel VOP3P instructions freely can access the low half or high |
1142 | // half of a register, so any swizzle is free. |
1143 | |
1144 | switch (Kind) { |
1145 | case TTI::SK_Broadcast: |
1146 | case TTI::SK_Reverse: |
1147 | case TTI::SK_PermuteSingleSrc: |
1148 | return 0; |
1149 | default: |
1150 | break; |
1151 | } |
1152 | } |
1153 | } |
1154 | // Restore optimal kind. |
1155 | if (IsExtractSubvector) |
1156 | Kind = TTI::SK_ExtractSubvector; |
1157 | |
1158 | return BaseT::getShuffleCost(Kind, Tp: VT, Mask, CostKind, Index, SubTp); |
1159 | } |
1160 | |
1161 | bool GCNTTIImpl::areInlineCompatible(const Function *Caller, |
1162 | const Function *Callee) const { |
1163 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
1164 | const GCNSubtarget *CallerST |
1165 | = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller)); |
1166 | const GCNSubtarget *CalleeST |
1167 | = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee)); |
1168 | |
1169 | const FeatureBitset &CallerBits = CallerST->getFeatureBits(); |
1170 | const FeatureBitset &CalleeBits = CalleeST->getFeatureBits(); |
1171 | |
1172 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; |
1173 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; |
1174 | if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) |
1175 | return false; |
1176 | |
1177 | // FIXME: dx10_clamp can just take the caller setting, but there seems to be |
1178 | // no way to support merge for backend defined attributes. |
1179 | SIModeRegisterDefaults CallerMode(*Caller, *CallerST); |
1180 | SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST); |
1181 | if (!CallerMode.isInlineCompatible(CalleeMode)) |
1182 | return false; |
1183 | |
1184 | if (Callee->hasFnAttribute(Attribute::AlwaysInline) || |
1185 | Callee->hasFnAttribute(Attribute::InlineHint)) |
1186 | return true; |
1187 | |
1188 | // Hack to make compile times reasonable. |
1189 | if (InlineMaxBB) { |
1190 | // Single BB does not increase total BB amount. |
1191 | if (Callee->size() == 1) |
1192 | return true; |
1193 | size_t BBSize = Caller->size() + Callee->size() - 1; |
1194 | return BBSize <= InlineMaxBB; |
1195 | } |
1196 | |
1197 | return true; |
1198 | } |
1199 | |
1200 | static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, |
1201 | const SITargetLowering *TLI, |
1202 | const GCNTTIImpl *TTIImpl) { |
1203 | const int NrOfSGPRUntilSpill = 26; |
1204 | const int NrOfVGPRUntilSpill = 32; |
1205 | |
1206 | const DataLayout &DL = TTIImpl->getDataLayout(); |
1207 | |
1208 | unsigned adjustThreshold = 0; |
1209 | int SGPRsInUse = 0; |
1210 | int VGPRsInUse = 0; |
1211 | for (const Use &A : CB->args()) { |
1212 | SmallVector<EVT, 4> ValueVTs; |
1213 | ComputeValueVTs(TLI: *TLI, DL, Ty: A.get()->getType(), ValueVTs); |
1214 | for (auto ArgVT : ValueVTs) { |
1215 | unsigned CCRegNum = TLI->getNumRegistersForCallingConv( |
1216 | Context&: CB->getContext(), CC: CB->getCallingConv(), VT: ArgVT); |
1217 | if (AMDGPU::isArgPassedInSGPR(CB, ArgNo: CB->getArgOperandNo(U: &A))) |
1218 | SGPRsInUse += CCRegNum; |
1219 | else |
1220 | VGPRsInUse += CCRegNum; |
1221 | } |
1222 | } |
1223 | |
1224 | // The cost of passing function arguments through the stack: |
1225 | // 1 instruction to put a function argument on the stack in the caller. |
1226 | // 1 instruction to take a function argument from the stack in callee. |
1227 | // 1 instruction is explicitly take care of data dependencies in callee |
1228 | // function. |
1229 | InstructionCost ArgStackCost(1); |
1230 | ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost( |
1231 | Opcode: Instruction::Store, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4), |
1232 | AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency); |
1233 | ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost( |
1234 | Opcode: Instruction::Load, Src: Type::getInt32Ty(C&: CB->getContext()), Alignment: Align(4), |
1235 | AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, CostKind: TTI::TCK_SizeAndLatency); |
1236 | |
1237 | // The penalty cost is computed relative to the cost of instructions and does |
1238 | // not model any storage costs. |
1239 | adjustThreshold += std::max(a: 0, b: SGPRsInUse - NrOfSGPRUntilSpill) * |
1240 | *ArgStackCost.getValue() * InlineConstants::getInstrCost(); |
1241 | adjustThreshold += std::max(a: 0, b: VGPRsInUse - NrOfVGPRUntilSpill) * |
1242 | *ArgStackCost.getValue() * InlineConstants::getInstrCost(); |
1243 | return adjustThreshold; |
1244 | } |
1245 | |
1246 | static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, |
1247 | const DataLayout &DL) { |
1248 | // If we have a pointer to a private array passed into a function |
1249 | // it will not be optimized out, leaving scratch usage. |
1250 | // This function calculates the total size in bytes of the memory that would |
1251 | // end in scratch if the call was not inlined. |
1252 | unsigned AllocaSize = 0; |
1253 | SmallPtrSet<const AllocaInst *, 8> AIVisited; |
1254 | for (Value *PtrArg : CB->args()) { |
1255 | PointerType *Ty = dyn_cast<PointerType>(Val: PtrArg->getType()); |
1256 | if (!Ty) |
1257 | continue; |
1258 | |
1259 | unsigned AddrSpace = Ty->getAddressSpace(); |
1260 | if (AddrSpace != AMDGPUAS::FLAT_ADDRESS && |
1261 | AddrSpace != AMDGPUAS::PRIVATE_ADDRESS) |
1262 | continue; |
1263 | |
1264 | const AllocaInst *AI = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V: PtrArg)); |
1265 | if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(Ptr: AI).second) |
1266 | continue; |
1267 | |
1268 | AllocaSize += DL.getTypeAllocSize(Ty: AI->getAllocatedType()); |
1269 | } |
1270 | return AllocaSize; |
1271 | } |
1272 | |
1273 | unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const { |
1274 | unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, TTIImpl: this); |
1275 | |
1276 | // Private object passed as arguments may end up in scratch usage if the call |
1277 | // is not inlined. Increase the inline threshold to promote inlining. |
1278 | unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL); |
1279 | if (AllocaSize > 0) |
1280 | Threshold += ArgAllocaCost; |
1281 | return Threshold; |
1282 | } |
1283 | |
1284 | unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB, |
1285 | const AllocaInst *AI) const { |
1286 | |
1287 | // Below the cutoff, assume that the private memory objects would be |
1288 | // optimized |
1289 | auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL); |
1290 | if (AllocaSize <= ArgAllocaCutoff) |
1291 | return 0; |
1292 | |
1293 | // Above the cutoff, we give a cost to each private memory object |
1294 | // depending its size. If the array can be optimized by SROA this cost is not |
1295 | // added to the total-cost in the inliner cost analysis. |
1296 | // |
1297 | // We choose the total cost of the alloca such that their sum cancels the |
1298 | // bonus given in the threshold (ArgAllocaCost). |
1299 | // |
1300 | // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost |
1301 | // |
1302 | // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier, |
1303 | // the single-bb bonus and the vector-bonus. |
1304 | // |
1305 | // We compensate the first two multipliers, by repeating logic from the |
1306 | // inliner-cost in here. The vector-bonus is 0 on AMDGPU. |
1307 | static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0" ); |
1308 | unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier(); |
1309 | |
1310 | bool SingleBB = none_of(Range&: *CB->getCalledFunction(), P: [](const BasicBlock &BB) { |
1311 | return BB.getTerminator()->getNumSuccessors() > 1; |
1312 | }); |
1313 | if (SingleBB) { |
1314 | Threshold += Threshold / 2; |
1315 | } |
1316 | |
1317 | auto ArgAllocaSize = DL.getTypeAllocSize(Ty: AI->getAllocatedType()); |
1318 | |
1319 | // Attribute the bonus proportionally to the alloca size |
1320 | unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize; |
1321 | |
1322 | return AllocaThresholdBonus; |
1323 | } |
1324 | |
1325 | void GCNTTIImpl::(Loop *L, ScalarEvolution &SE, |
1326 | TTI::UnrollingPreferences &UP, |
1327 | OptimizationRemarkEmitter *ORE) { |
1328 | CommonTTI.getUnrollingPreferences(L, SE, UP, ORE); |
1329 | } |
1330 | |
1331 | void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
1332 | TTI::PeelingPreferences &PP) { |
1333 | CommonTTI.getPeelingPreferences(L, SE, PP); |
1334 | } |
1335 | |
1336 | int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const { |
1337 | return ST->hasFullRate64Ops() |
1338 | ? getFullRateInstrCost() |
1339 | : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind) |
1340 | : getQuarterRateInstrCost(CostKind); |
1341 | } |
1342 | |
1343 | std::pair<InstructionCost, MVT> |
1344 | GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const { |
1345 | std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty); |
1346 | auto Size = DL.getTypeSizeInBits(Ty); |
1347 | // Maximum load or store can handle 8 dwords for scalar and 4 for |
1348 | // vector ALU. Let's assume anything above 8 dwords is expensive |
1349 | // even if legal. |
1350 | if (Size <= 256) |
1351 | return Cost; |
1352 | |
1353 | Cost.first += (Size + 255) / 256; |
1354 | return Cost; |
1355 | } |
1356 | |
1357 | unsigned GCNTTIImpl::getPrefetchDistance() const { |
1358 | return ST->hasPrefetch() ? 128 : 0; |
1359 | } |
1360 | |
1361 | bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { |
1362 | return AMDGPU::isFlatGlobalAddrSpace(AS); |
1363 | } |
1364 | |