1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/OptimizationRemarkEmitter.h"
26#include "llvm/Analysis/TargetTransformInfo.h"
27#include "llvm/Analysis/TargetTransformInfoImpl.h"
28#include "llvm/CodeGen/ISDOpcodes.h"
29#include "llvm/CodeGen/TargetLowering.h"
30#include "llvm/CodeGen/TargetSubtargetInfo.h"
31#include "llvm/CodeGen/ValueTypes.h"
32#include "llvm/CodeGenTypes/MachineValueType.h"
33#include "llvm/IR/BasicBlock.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DataLayout.h"
37#include "llvm/IR/DerivedTypes.h"
38#include "llvm/IR/InstrTypes.h"
39#include "llvm/IR/Instruction.h"
40#include "llvm/IR/Instructions.h"
41#include "llvm/IR/Intrinsics.h"
42#include "llvm/IR/Operator.h"
43#include "llvm/IR/Type.h"
44#include "llvm/IR/Value.h"
45#include "llvm/Support/Casting.h"
46#include "llvm/Support/CommandLine.h"
47#include "llvm/Support/ErrorHandling.h"
48#include "llvm/Support/MathExtras.h"
49#include "llvm/Target/TargetMachine.h"
50#include "llvm/Target/TargetOptions.h"
51#include <algorithm>
52#include <cassert>
53#include <cstdint>
54#include <limits>
55#include <optional>
56#include <utility>
57
58namespace llvm {
59
60class Function;
61class GlobalValue;
62class LLVMContext;
63class ScalarEvolution;
64class SCEV;
65class TargetMachine;
66
67extern cl::opt<unsigned> PartialUnrollingThreshold;
68
69/// Base class which can be used to help build a TTI implementation.
70///
71/// This class provides as much implementation of the TTI interface as is
72/// possible using the target independent parts of the code generator.
73///
74/// In order to subclass it, your class must implement a getST() method to
75/// return the subtarget, and a getTLI() method to return the target lowering.
76/// We need these methods implemented in the derived class so that this class
77/// doesn't have to duplicate storage for them.
78template <typename T>
79class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
80private:
81 using BaseT = TargetTransformInfoImplCRTPBase<T>;
82 using TTI = TargetTransformInfo;
83
84 /// Helper function to access this as a T.
85 T *thisT() { return static_cast<T *>(this); }
86
87 /// Estimate a cost of Broadcast as an extract and sequence of insert
88 /// operations.
89 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
90 TTI::TargetCostKind CostKind) {
91 InstructionCost Cost = 0;
92 // Broadcast cost is equal to the cost of extracting the zero'th element
93 // plus the cost of inserting it into every element of the result vector.
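    // For illustration, assuming a hypothetical target where each element
    // insert/extract costs 1, broadcasting a <4 x float> is estimated as
    // 1 extract + 4 inserts = 5.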
94 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
95 CostKind, 0, nullptr, nullptr);
96
97 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
98 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
99 CostKind, i, nullptr, nullptr);
100 }
101 return Cost;
102 }
103
104 /// Estimate a cost of shuffle as a sequence of extract and insert
105 /// operations.
106 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
107 TTI::TargetCostKind CostKind) {
108 InstructionCost Cost = 0;
    // Shuffle cost is equal to the cost of extracting each element from its
    // source vector plus the cost of inserting it into the result vector.

    // E.g. a <4 x float> shuffle with mask <0,5,2,7> extracts index 0 of the
    // first vector, index 1 of the second vector, index 2 of the first
    // vector and index 3 of the second vector, and inserts them at indices
    // <0,1,2,3> of the result vector.
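    // Assuming hypothetical unit insert/extract costs, that <4 x float>
    // example is estimated as 4 extracts + 4 inserts = 8.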
116 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
117 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
118 CostKind, i, nullptr, nullptr);
119 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
120 CostKind, i, nullptr, nullptr);
121 }
122 return Cost;
123 }
124
125 /// Estimate a cost of subvector extraction as a sequence of extract and
126 /// insert operations.
127 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
128 TTI::TargetCostKind CostKind,
129 int Index,
130 FixedVectorType *SubVTy) {
131 assert(VTy && SubVTy &&
132 "Can only extract subvectors from vectors");
133 int NumSubElts = SubVTy->getNumElements();
134 assert((!isa<FixedVectorType>(VTy) ||
135 (Index + NumSubElts) <=
136 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
137 "SK_ExtractSubvector index out of range");
138
139 InstructionCost Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting each
    // element from the source vector plus the cost of inserting it into the
    // result vector type.
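    // E.g. extracting a <2 x i32> subvector at Index 2 from an <8 x i32>
    // source is modeled as extracts of elements 2 and 3 plus inserts into
    // elements 0 and 1 of the result (4 operations under hypothetical unit
    // element costs).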
143 for (int i = 0; i != NumSubElts; ++i) {
144 Cost +=
145 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
146 CostKind, i + Index, nullptr, nullptr);
147 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
148 CostKind, i, nullptr, nullptr);
149 }
150 return Cost;
151 }
152
153 /// Estimate a cost of subvector insertion as a sequence of extract and
154 /// insert operations.
155 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
156 TTI::TargetCostKind CostKind,
157 int Index,
158 FixedVectorType *SubVTy) {
159 assert(VTy && SubVTy &&
160 "Can only insert subvectors into vectors");
161 int NumSubElts = SubVTy->getNumElements();
162 assert((!isa<FixedVectorType>(VTy) ||
163 (Index + NumSubElts) <=
164 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
165 "SK_InsertSubvector index out of range");
166
167 InstructionCost Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting each
    // element from the subvector plus the cost of inserting it into the
    // result vector type.
171 for (int i = 0; i != NumSubElts; ++i) {
172 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
173 CostKind, i, nullptr, nullptr);
174 Cost +=
175 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
176 i + Index, nullptr, nullptr);
177 }
178 return Cost;
179 }
180
181 /// Local query method delegates up to T which *must* implement this!
182 const TargetSubtargetInfo *getST() const {
183 return static_cast<const T *>(this)->getST();
184 }
185
186 /// Local query method delegates up to T which *must* implement this!
187 const TargetLoweringBase *getTLI() const {
188 return static_cast<const T *>(this)->getTLI();
189 }
190
191 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
192 switch (M) {
193 case TTI::MIM_Unindexed:
194 return ISD::UNINDEXED;
195 case TTI::MIM_PreInc:
196 return ISD::PRE_INC;
197 case TTI::MIM_PreDec:
198 return ISD::PRE_DEC;
199 case TTI::MIM_PostInc:
200 return ISD::POST_INC;
201 case TTI::MIM_PostDec:
202 return ISD::POST_DEC;
203 }
204 llvm_unreachable("Unexpected MemIndexedMode");
205 }
206
207 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
208 Align Alignment,
209 bool VariableMask,
210 bool IsGatherScatter,
211 TTI::TargetCostKind CostKind) {
212 // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(DataTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(DataTy);
217 // Assume the target does not have support for gather/scatter operations
218 // and provide a rough estimate.
219 //
220 // First, compute the cost of the individual memory operations.
    InstructionCost AddrExtractCost =
        IsGatherScatter
            ? getVectorInstrCost(Instruction::ExtractElement,
                                 FixedVectorType::get(
                                     PointerType::get(VT->getElementType(), 0),
                                     VT->getNumElements()),
                                 CostKind, -1, nullptr, nullptr)
            : 0;
    InstructionCost LoadCost =
        VT->getNumElements() *
        (AddrExtractCost +
         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
233
234 // Next, compute the cost of packing the result in a vector.
235 InstructionCost PackingCost =
236 getScalarizationOverhead(VT, Opcode != Instruction::Store,
237 Opcode == Instruction::Store, CostKind);
238
239 InstructionCost ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // the branches, and the PHIs needed to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
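      // For illustration, with hypothetical unit costs this adds
      // 4 * (extract + br + phi) = 12 for a masked operation on <4 x i32>,
      // on top of the load and packing costs above.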
      ConditionalCost =
          VT->getNumElements() *
          (getVectorInstrCost(
               Instruction::ExtractElement,
               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                    VT->getNumElements()),
               CostKind, -1, nullptr, nullptr) +
           getCFInstrCost(Instruction::Br, CostKind) +
           getCFInstrCost(Instruction::PHI, CostKind));
256 }
257
258 return LoadCost + PackingCost + ConditionalCost;
259 }
260
261protected:
262 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
263 : BaseT(DL) {}
264 virtual ~BasicTTIImplBase() = default;
265
266 using TargetTransformInfoImplBase::DL;
267
268public:
269 /// \name Scalar TTI Implementations
270 /// @{
271 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
272 unsigned AddressSpace, Align Alignment,
273 unsigned *Fast) const {
274 EVT E = EVT::getIntegerVT(Context, BitWidth);
275 return getTLI()->allowsMisalignedMemoryAccesses(
276 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
277 }
278
279 bool hasBranchDivergence(const Function *F = nullptr) { return false; }
280
281 bool isSourceOfDivergence(const Value *V) { return false; }
282
283 bool isAlwaysUniform(const Value *V) { return false; }
284
285 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
286 return false;
287 }
288
289 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
290 return true;
291 }
292
293 unsigned getFlatAddressSpace() {
294 // Return an invalid address space.
295 return -1;
296 }
297
298 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
299 Intrinsic::ID IID) const {
300 return false;
301 }
302
303 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
304 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
305 }
306
307 unsigned getAssumedAddrSpace(const Value *V) const {
308 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
309 }
310
311 bool isSingleThreaded() const {
312 return getTLI()->getTargetMachine().Options.ThreadModel ==
313 ThreadModel::Single;
314 }
315
316 std::pair<const Value *, unsigned>
317 getPredicatedAddrSpace(const Value *V) const {
318 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
319 }
320
321 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
322 Value *NewV) const {
323 return nullptr;
324 }
325
326 bool isLegalAddImmediate(int64_t imm) {
327 return getTLI()->isLegalAddImmediate(imm);
328 }
329
330 bool isLegalICmpImmediate(int64_t imm) {
331 return getTLI()->isLegalICmpImmediate(imm);
332 }
333
334 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
335 bool HasBaseReg, int64_t Scale,
336 unsigned AddrSpace, Instruction *I = nullptr) {
337 TargetLoweringBase::AddrMode AM;
338 AM.BaseGV = BaseGV;
339 AM.BaseOffs = BaseOffset;
340 AM.HasBaseReg = HasBaseReg;
341 AM.Scale = Scale;
342 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
343 }
344
345 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
346 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
347 }
348
349 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
350 Type *ScalarValTy) const {
351 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
      auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
      EVT VT = getTLI()->getValueType(DL, SrcTy);
      if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
          getTLI()->isOperationCustom(ISD::STORE, VT))
        return true;

      EVT ValVT =
          getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
360 EVT LegalizedVT =
361 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
362 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
363 };
364 while (VF > 2 && IsSupportedByTarget(VF))
365 VF /= 2;
366 return VF;
367 }
368
369 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
370 const DataLayout &DL) const {
371 EVT VT = getTLI()->getValueType(DL, Ty);
372 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
373 }
374
375 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
376 const DataLayout &DL) const {
377 EVT VT = getTLI()->getValueType(DL, Ty);
378 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
379 }
380
381 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
382 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
383 }
384
385 bool isNumRegsMajorCostOfLSR() {
386 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
387 }
388
389 bool shouldFoldTerminatingConditionAfterLSR() const {
390 return TargetTransformInfoImplBase::
391 shouldFoldTerminatingConditionAfterLSR();
392 }
393
394 bool isProfitableLSRChainElement(Instruction *I) {
395 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
396 }
397
398 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
399 int64_t BaseOffset, bool HasBaseReg,
400 int64_t Scale, unsigned AddrSpace) {
401 TargetLoweringBase::AddrMode AM;
402 AM.BaseGV = BaseGV;
403 AM.BaseOffs = BaseOffset;
404 AM.HasBaseReg = HasBaseReg;
405 AM.Scale = Scale;
406 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
407 return 0;
408 return -1;
409 }
410
411 bool isTruncateFree(Type *Ty1, Type *Ty2) {
412 return getTLI()->isTruncateFree(Ty1, Ty2);
413 }
414
415 bool isProfitableToHoist(Instruction *I) {
416 return getTLI()->isProfitableToHoist(I);
417 }
418
419 bool useAA() const { return getST()->useAA(); }
420
421 bool isTypeLegal(Type *Ty) {
422 EVT VT = getTLI()->getValueType(DL, Ty);
423 return getTLI()->isTypeLegal(VT);
424 }
425
426 unsigned getRegUsageForType(Type *Ty) {
427 EVT ETy = getTLI()->getValueType(DL, Ty);
428 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
429 }
430
431 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
432 ArrayRef<const Value *> Operands, Type *AccessType,
433 TTI::TargetCostKind CostKind) {
434 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
435 }
436
437 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
438 unsigned &JumpTableSize,
439 ProfileSummaryInfo *PSI,
440 BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. It was initially
    /// intended to be used when estimating the cost of a switch in the inline
    /// cost heuristic, but it is a generic cost model usable in other places
    /// (e.g., in loop unrolling).
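    ///
    /// For illustration (subject to the target hooks queried below): a switch
    /// the target accepts as a single bit test or jump table is reported as
    /// one cluster (with JumpTableSize set to the case range in the
    /// jump-table case), while a switch suiting neither is reported as N
    /// clusters.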
448 unsigned N = SI.getNumCases();
449 const TargetLoweringBase *TLI = getTLI();
450 const DataLayout &DL = this->getDataLayout();
451
452 JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
457 return N;
458
459 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
460 APInt MinCaseVal = MaxCaseVal;
461 for (auto CI : SI.cases()) {
462 const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
466 MinCaseVal = CaseVal;
467 }
468
469 // Check if suitable for a bit test
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
477 return 1;
478 }
479
480 // Check if suitable for a jump table.
481 if (IsJTAllowed) {
482 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
483 return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
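      // E.g. cases {5, 7, 100} give Range = 100 - 5 + 1 = 96.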
487 // Check whether a range of clusters is dense enough for a jump table
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
489 JumpTableSize = Range;
490 return 1;
491 }
492 }
493 return N;
494 }
495
496 bool shouldBuildLookupTables() {
497 const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
500 }
501
502 bool shouldBuildRelLookupTables() const {
503 const TargetMachine &TM = getTLI()->getTargetMachine();
504 // If non-PIC mode, do not generate a relative lookup table.
505 if (!TM.isPositionIndependent())
506 return false;
507
508 /// Relative lookup table entries consist of 32-bit offsets.
509 /// Do not generate relative lookup tables for large code models
    /// in 64-bit architectures where 32-bit offsets might not be enough.
511 if (TM.getCodeModel() == CodeModel::Medium ||
512 TM.getCodeModel() == CodeModel::Large)
513 return false;
514
515 Triple TargetTriple = TM.getTargetTriple();
516 if (!TargetTriple.isArch64Bit())
517 return false;
518
519 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
520 // there.
521 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
522 return false;
523
524 return true;
525 }
526
527 bool haveFastSqrt(Type *Ty) {
528 const TargetLoweringBase *TLI = getTLI();
529 EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
532 }
533
534 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
535 return true;
536 }
537
538 InstructionCost getFPOpCost(Type *Ty) {
539 // Check whether FADD is available, as a proxy for floating-point in
540 // general.
541 const TargetLoweringBase *TLI = getTLI();
542 EVT VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
544 return TargetTransformInfo::TCC_Basic;
545 return TargetTransformInfo::TCC_Expensive;
546 }
547
548 bool preferToKeepConstantsAttached(const Instruction &Inst,
549 const Function &Fn) const {
550 switch (Inst.getOpcode()) {
551 default:
552 break;
553 case Instruction::SDiv:
554 case Instruction::SRem:
555 case Instruction::UDiv:
556 case Instruction::URem: {
      if (!isa<ConstantInt>(Inst.getOperand(1)))
558 return false;
559 EVT VT = getTLI()->getValueType(DL, Inst.getType());
560 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
561 }
562 };
563
564 return false;
565 }
566
567 unsigned getInliningThresholdMultiplier() const { return 1; }
568 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
569 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
570 return 0;
571 }
572
573 int getInlinerVectorBonusPercent() const { return 150; }
574
575 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
576 TTI::UnrollingPreferences &UP,
577 OptimizationRemarkEmitter *ORE) {
578 // This unrolling functionality is target independent, but to provide some
579 // motivation for its intended use, for x86:
580
581 // According to the Intel 64 and IA-32 Architectures Optimization Reference
582 // Manual, Intel Core models and later have a loop stream detector (and
583 // associated uop queue) that can benefit from partial unrolling.
584 // The relevant requirements are:
585 // - The loop must have no more than 4 (8 for Nehalem and later) branches
586 // taken, and none of them may be calls.
587 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
588
589 // According to the Software Optimization Guide for AMD Family 15h
590 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
591 // and loop buffer which can benefit from partial unrolling.
592 // The relevant requirements are:
593 // - The loop must have fewer than 16 branches
594 // - The loop must have less than 40 uops in all executed loop branches
595
596 // The number of taken branches in a loop is hard to estimate here, and
597 // benchmarking has revealed that it is better not to be conservative when
598 // estimating the branch count. As a result, we'll ignore the branch limits
599 // until someone finds a case where it matters in practice.
600
601 unsigned MaxOps;
602 const TargetSubtargetInfo *ST = getST();
603 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
604 MaxOps = PartialUnrollingThreshold;
605 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
606 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
607 else
608 return;
609
610 // Scan the loop: don't unroll loops with calls.
611 for (BasicBlock *BB : L->blocks()) {
612 for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
615 if (!thisT()->isLoweredToCall(F))
616 continue;
617 }
618
619 if (ORE) {
620 ORE->emit([&]() {
621 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
622 L->getHeader())
623 << "advising against unrolling the loop because it "
624 "contains a "
625 << ore::NV("Call", &I);
626 });
627 }
628 return;
629 }
630 }
631 }
632
633 // Enable runtime and partial unrolling up to the specified size.
634 // Enable using trip count upper bound to unroll loops.
635 UP.Partial = UP.Runtime = UP.UpperBound = true;
636 UP.PartialThreshold = MaxOps;
637
638 // Avoid unrolling when optimizing for size.
639 UP.OptSizeThreshold = 0;
640 UP.PartialOptSizeThreshold = 0;
641
642 // Set number of instructions optimized when "back edge"
643 // becomes "fall through" to default value of 2.
644 UP.BEInsns = 2;
645 }
646
647 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
648 TTI::PeelingPreferences &PP) {
649 PP.PeelCount = 0;
650 PP.AllowPeeling = true;
651 PP.AllowLoopNestsPeeling = false;
652 PP.PeelProfiledIterations = true;
653 }
654
655 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
656 AssumptionCache &AC,
657 TargetLibraryInfo *LibInfo,
658 HardwareLoopInfo &HWLoopInfo) {
659 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
660 }
661
662 bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
663 return BaseT::preferPredicateOverEpilogue(TFI);
664 }
665
666 TailFoldingStyle
667 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
668 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
669 }
670
671 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
672 IntrinsicInst &II) {
673 return BaseT::instCombineIntrinsic(IC, II);
674 }
675
676 std::optional<Value *>
677 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
678 APInt DemandedMask, KnownBits &Known,
679 bool &KnownBitsComputed) {
680 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
681 KnownBitsComputed);
682 }
683
684 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
685 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
686 APInt &UndefElts2, APInt &UndefElts3,
687 std::function<void(Instruction *, unsigned, APInt, APInt &)>
688 SimplifyAndSetOp) {
689 return BaseT::simplifyDemandedVectorEltsIntrinsic(
690 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
691 SimplifyAndSetOp);
692 }
693
694 virtual std::optional<unsigned>
695 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
696 return std::optional<unsigned>(
697 getST()->getCacheSize(static_cast<unsigned>(Level)));
698 }
699
700 virtual std::optional<unsigned>
701 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
702 std::optional<unsigned> TargetResult =
703 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
704
705 if (TargetResult)
706 return TargetResult;
707
708 return BaseT::getCacheAssociativity(Level);
709 }
710
711 virtual unsigned getCacheLineSize() const {
712 return getST()->getCacheLineSize();
713 }
714
715 virtual unsigned getPrefetchDistance() const {
716 return getST()->getPrefetchDistance();
717 }
718
719 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
720 unsigned NumStridedMemAccesses,
721 unsigned NumPrefetches,
722 bool HasCall) const {
723 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
724 NumPrefetches, HasCall);
725 }
726
727 virtual unsigned getMaxPrefetchIterationsAhead() const {
728 return getST()->getMaxPrefetchIterationsAhead();
729 }
730
731 virtual bool enableWritePrefetching() const {
732 return getST()->enableWritePrefetching();
733 }
734
735 virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
736 return getST()->shouldPrefetchAddressSpace(AS);
737 }
738
739 /// @}
740
741 /// \name Vector TTI Implementations
742 /// @{
743
744 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
    return TypeSize::getFixed(32);
746 }
747
748 std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
749 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
750 bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
751
752 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
753 /// are set if the demanded result elements need to be inserted and/or
754 /// extracted from vectors.
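  ///
  /// For example, for InTy = <4 x i32> with DemandedElts = 0b0110, Insert =
  /// true and Extract = false, only the costs of inserting elements 1 and 2
  /// are summed.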
755 InstructionCost getScalarizationOverhead(VectorType *InTy,
756 const APInt &DemandedElts,
757 bool Insert, bool Extract,
758 TTI::TargetCostKind CostKind) {
759 /// FIXME: a bitfield is not a reasonable abstraction for talking about
760 /// which elements are needed from a scalable vector
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);
764
765 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
766 "Vector size mismatch");
767
768 InstructionCost Cost = 0;
769
770 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
771 if (!DemandedElts[i])
772 continue;
773 if (Insert)
774 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
775 CostKind, i, nullptr, nullptr);
776 if (Extract)
777 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
778 CostKind, i, nullptr, nullptr);
779 }
780
781 return Cost;
782 }
783
784 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
785 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
786 bool Extract,
787 TTI::TargetCostKind CostKind) {
    if (isa<ScalableVectorType>(InTy))
      return InstructionCost::getInvalid();
    auto *Ty = cast<FixedVectorType>(InTy);

    APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
793 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
794 CostKind);
795 }
796
  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The (potentially vector) types to use for each
  /// argument are passed via Tys.
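  ///
  /// For example (illustrative names), for Args = {%x, %x, 7} where %x has a
  /// vector type, only the single unique non-constant operand %x contributes
  /// an extraction cost; the repeated use and the constant are skipped.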
800 InstructionCost
801 getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
802 ArrayRef<Type *> Tys,
803 TTI::TargetCostKind CostKind) {
804 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
805
806 InstructionCost Cost = 0;
807 SmallPtrSet<const Value*, 4> UniqueOperands;
808 for (int I = 0, E = Args.size(); I != E; I++) {
809 // Disregard things like metadata arguments.
810 const Value *A = Args[I];
811 Type *Ty = Tys[I];
812 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
813 !Ty->isPtrOrPtrVectorTy())
814 continue;
815
      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        if (auto *VecTy = dyn_cast<VectorType>(Ty))
          Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
819 /*Extract*/ true, CostKind);
820 }
821 }
822
823 return Cost;
824 }
825
826 /// Estimate the overhead of scalarizing the inputs and outputs of an
827 /// instruction, with return type RetTy and arguments Args of type Tys. If
828 /// Args are unknown (empty), then the cost associated with one argument is
829 /// added as a heuristic.
830 InstructionCost getScalarizationOverhead(VectorType *RetTy,
831 ArrayRef<const Value *> Args,
832 ArrayRef<Type *> Tys,
833 TTI::TargetCostKind CostKind) {
834 InstructionCost Cost = getScalarizationOverhead(
835 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
836 if (!Args.empty())
837 Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
838 else
839 // When no information on arguments is provided, we add the cost
840 // associated with one argument as a heuristic.
841 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
842 /*Extract*/ true, CostKind);
843
844 return Cost;
845 }
846
847 /// Estimate the cost of type-legalization and the legalized type.
848 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
849 LLVMContext &C = Ty->getContext();
850 EVT MTy = getTLI()->getValueType(DL, Ty);
851
852 InstructionCost Cost = 1;
853 // We keep legalizing the type until we find a legal kind. We assume that
854 // the only operation that costs anything is the split. After splitting
855 // we need to handle two types.
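    // For illustration, on a hypothetical target whose widest legal vector is
    // 128 bits, a 512-bit vector is split twice, so the returned cost factor
    // is 1 * 2 * 2 = 4 alongside the legal 128-bit MVT.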
856 while (true) {
857 TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);
858
859 if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
860 // Ensure we return a sensible simple VT here, since many callers of
861 // this function require it.
        MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
        return std::make_pair(InstructionCost::getInvalid(), VT);
      }

      if (LK.first == TargetLoweringBase::TypeLegal)
        return std::make_pair(Cost, MTy.getSimpleVT());
868
869 if (LK.first == TargetLoweringBase::TypeSplitVector ||
870 LK.first == TargetLoweringBase::TypeExpandInteger)
871 Cost *= 2;
872
873 // Do not loop with f128 type.
      if (MTy == LK.second)
        return std::make_pair(Cost, MTy.getSimpleVT());
876
877 // Keep legalizing the type.
878 MTy = LK.second;
879 }
880 }
881
882 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
883
884 InstructionCost getArithmeticInstrCost(
885 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
888 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
889 const Instruction *CxtI = nullptr) {
890 // Check if any of the operands are vector operands.
891 const TargetLoweringBase *TLI = getTLI();
892 int ISD = TLI->InstructionOpcodeToISD(Opcode);
893 assert(ISD && "Invalid opcode");
894
895 // TODO: Handle more cost kinds.
896 if (CostKind != TTI::TCK_RecipThroughput)
897 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
898 Opd1Info, Opd2Info,
899 Args, CxtI);
900
901 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
902
903 bool IsFloat = Ty->isFPOrFPVectorTy();
904 // Assume that floating point arithmetic operations cost twice as much as
905 // integer operations.
906 InstructionCost OpCost = (IsFloat ? 2 : 1);
907
    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
909 // The operation is legal. Assume it costs 1.
910 // TODO: Once we have extract/insert subvector cost we need to use them.
911 return LT.first * OpCost;
912 }
913
    if (!TLI->isOperationExpand(ISD, LT.second)) {
915 // If the operation is custom lowered, then assume that the code is twice
916 // as expensive.
917 return LT.first * 2 * OpCost;
918 }
919
920 // An 'Expand' of URem and SRem is special because it may default
921 // to expanding the operation into a sequence of sub-operations
922 // i.e. X % Y -> X-(X/Y)*Y.
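    // For example, on a target with no legal UREM but a legal UDIV, an i32
    // urem is estimated below as the sum of the udiv, mul and sub costs.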
923 if (ISD == ISD::UREM || ISD == ISD::SREM) {
924 bool IsSigned = ISD == ISD::SREM;
      if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
                                        LT.second) ||
          TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
                                        LT.second)) {
929 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
930 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
931 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
932 InstructionCost MulCost =
933 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
934 InstructionCost SubCost =
935 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
936 return DivCost + MulCost + SubCost;
937 }
938 }
939
940 // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(Ty))
942 return InstructionCost::getInvalid();
943
944 // Else, assume that we need to scalarize this op.
945 // TODO: If one of the types get legalized by splitting, handle this
946 // similarly to what getCastInstrCost() does.
    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
948 InstructionCost Cost = thisT()->getArithmeticInstrCost(
949 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
950 Args, CxtI);
951 // Return the cost of multiple scalar invocation plus the cost of
952 // inserting and extracting the values.
953 SmallVector<Type *> Tys(Args.size(), Ty);
954 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
955 VTy->getNumElements() * Cost;
956 }
957
958 // We don't know anything about this scalar instruction.
959 return OpCost;
960 }
961
962 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
963 ArrayRef<int> Mask,
964 VectorType *Ty, int &Index,
965 VectorType *&SubTy) const {
966 if (Mask.empty())
967 return Kind;
968 int NumSrcElts = Ty->getElementCount().getKnownMinValue();
969 switch (Kind) {
970 case TTI::SK_PermuteSingleSrc:
971 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
972 return TTI::SK_Reverse;
973 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
974 return TTI::SK_Broadcast;
975 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
976 (Index + Mask.size()) <= (size_t)NumSrcElts) {
        SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
978 return TTI::SK_ExtractSubvector;
979 }
980 break;
981 case TTI::SK_PermuteTwoSrc: {
982 int NumSubElts;
983 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
984 Mask, NumSrcElts, NumSubElts, Index)) {
985 if (Index + NumSubElts > NumSrcElts)
986 return Kind;
        SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
988 return TTI::SK_InsertSubvector;
989 }
990 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
991 return TTI::SK_Select;
992 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
993 return TTI::SK_Transpose;
994 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
995 return TTI::SK_Splice;
996 break;
997 }
998 case TTI::SK_Select:
999 case TTI::SK_Reverse:
1000 case TTI::SK_Broadcast:
1001 case TTI::SK_Transpose:
1002 case TTI::SK_InsertSubvector:
1003 case TTI::SK_ExtractSubvector:
1004 case TTI::SK_Splice:
1005 break;
1006 }
1007 return Kind;
1008 }
1009
1010 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
1011 ArrayRef<int> Mask,
1012 TTI::TargetCostKind CostKind, int Index,
1013 VectorType *SubTp,
1014 ArrayRef<const Value *> Args = std::nullopt) {
    switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
    case TTI::SK_Broadcast:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getBroadcastShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_Select:
    case TTI::SK_Splice:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
        return getPermuteShuffleOverhead(FVT, CostKind);
      return InstructionCost::getInvalid();
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, CostKind, Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, CostKind, Index,
                                        cast<FixedVectorType>(SubTp));
1035 }
1036 llvm_unreachable("Unknown TTI::ShuffleKind");
1037 }
1038
1039 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1040 TTI::CastContextHint CCH,
1041 TTI::TargetCostKind CostKind,
1042 const Instruction *I = nullptr) {
1043 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1044 return 0;
1045
1046 const TargetLoweringBase *TLI = getTLI();
1047 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1048 assert(ISD && "Invalid opcode");
    std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
    std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1051
1052 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1053 TypeSize DstSize = DstLT.second.getSizeInBits();
1054 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1055 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1056
1057 switch (Opcode) {
1058 default:
1059 break;
1060 case Instruction::Trunc:
1061 // Check for NOOP conversions.
      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1063 return 0;
1064 [[fallthrough]];
1065 case Instruction::BitCast:
1066 // Bitcast between types that are legalized to the same type are free and
1067 // assume int to/from ptr of the same size is also free.
1068 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1069 SrcSize == DstSize)
1070 return 0;
1071 break;
1072 case Instruction::FPExt:
1073 if (I && getTLI()->isExtFree(I))
1074 return 0;
1075 break;
1076 case Instruction::ZExt:
      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1078 return 0;
1079 [[fallthrough]];
1080 case Instruction::SExt:
1081 if (I && getTLI()->isExtFree(I))
1082 return 0;
1083
1084 // If this is a zext/sext of a load, return 0 if the corresponding
1085 // extending load exists on target and the result type is legal.
1086 if (CCH == TTI::CastContextHint::Normal) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
            ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (DstLT.first == SrcLT.first &&
            TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1093 return 0;
1094 }
1095 break;
1096 case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                   Dst->getPointerAddressSpace()))
1099 return 0;
1100 break;
1101 }
1102
    auto *SrcVTy = dyn_cast<VectorType>(Src);
    auto *DstVTy = dyn_cast<VectorType>(Dst);
1105
1106 // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1109 return SrcLT.first;
1110
1111 // Handle scalar conversions.
1112 if (!SrcVTy && !DstVTy) {
      // Just check the op cost. If the operation is legal then assume it
      // costs 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;
1117
1118 // Assume that illegal scalar instruction are expensive.
1119 return 4;
1120 }
1121
1122 // Check vector-to-vector casts.
1123 if (DstVTy && SrcVTy) {
1124 // If the cast is between same-sized registers, then the check is simple.
1125 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1126
1127 // Assume that Zext is done using AND.
1128 if (Opcode == Instruction::ZExt)
1129 return SrcLT.first;
1130
1131 // Assume that sext is done using SHL and SRA.
1132 if (Opcode == Instruction::SExt)
1133 return SrcLT.first * 2;
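        // E.g. if the source legalizes into two registers (SrcLT.first == 2),
        // the zext above is costed as 2 and this sext as 4.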
1134
        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
1140 }
1141
1142 // If we are legalizing by splitting, query the concrete TTI for the cost
1143 // of casting the original vector twice. We also need to factor in the
1144 // cost of the split itself. Count that as 1, to be consistent with
1145 // getTypeLegalizationCost().
      bool SplitSrc =
          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
          TargetLowering::TypeSplitVector;
      bool SplitDst =
          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
          TargetLowering::TypeSplitVector;
      if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
          DstVTy->getElementCount().isVector()) {
        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1156 T *TTI = static_cast<T *>(this);
1157 // If both types need to be split then the split is free.
1158 InstructionCost SplitCost =
1159 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1160 return SplitCost +
1161 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1162 CostKind, I));
1163 }
1164
1165 // Scalarization cost is Invalid, can't assume any num elements.
      if (isa<ScalableVectorType>(DstVTy))
1167 return InstructionCost::getInvalid();
1168
1169 // In other cases where the source or destination are illegal, assume
1170 // the operation will get scalarized.
      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1172 InstructionCost Cost = thisT()->getCastInstrCost(
1173 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1174
1175 // Return the cost of multiple scalar invocation plus the cost of
1176 // inserting and extracting the values.
1177 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1178 CostKind) +
1179 Num * Cost;
1180 }
1181
    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcast between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
1186 if (Opcode == Instruction::BitCast) {
1187 // Illegal bitcasts are done by storing and loading from a stack slot.
1188 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1189 /*Extract*/ true, CostKind)
1190 : 0) +
1191 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1192 /*Extract*/ false, CostKind)
1193 : 0);
1194 }
1195
1196 llvm_unreachable("Unhandled cast");
1197 }
1198
1199 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1200 VectorType *VecTy, unsigned Index) {
1201 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1202 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1203 CostKind, Index, nullptr, nullptr) +
1204 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1205 TTI::CastContextHint::None, CostKind);
1206 }
1207
1208 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1209 const Instruction *I = nullptr) {
1210 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1211 }
1212
1213 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1214 CmpInst::Predicate VecPred,
1215 TTI::TargetCostKind CostKind,
1216 const Instruction *I = nullptr) {
1217 const TargetLoweringBase *TLI = getTLI();
1218 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1219 assert(ISD && "Invalid opcode");
1220
1221 // TODO: Handle other cost kinds.
1222 if (CostKind != TTI::TCK_RecipThroughput)
1223 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1224 I);
1225
1226 // Selects on vectors are actually vector selects.
1227 if (ISD == ISD::SELECT) {
1228 assert(CondTy && "CondTy must exist");
1229 if (CondTy->isVectorTy())
1230 ISD = ISD::VSELECT;
1231 }
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
1236 // The operation is legal. Assume it costs 1. Multiply
1237 // by the type-legalization overhead.
1238 return LT.first * 1;
1239 }
1240
1241 // Otherwise, assume that the cast is scalarized.
1242 // TODO: If one of the types get legalized by splitting, handle this
1243 // similarly to what getCastInstrCost() does.
    if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
      if (isa<ScalableVectorType>(ValTy))
        return InstructionCost::getInvalid();

      unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1249 if (CondTy)
1250 CondTy = CondTy->getScalarType();
1251 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1252 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1253
1254 // Return the cost of multiple scalar invocation plus the cost of
1255 // inserting and extracting the values.
1256 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1257 /*Extract*/ false, CostKind) +
1258 Num * Cost;
1259 }
1260
1261 // Unknown scalar opcode.
1262 return 1;
1263 }
1264
1265 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1266 TTI::TargetCostKind CostKind,
1267 unsigned Index, Value *Op0, Value *Op1) {
    return getRegUsageForType(Val->getScalarType());
1269 }
1270
1271 InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
1272 TTI::TargetCostKind CostKind,
1273 unsigned Index) {
1274 Value *Op0 = nullptr;
1275 Value *Op1 = nullptr;
    if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
      Op0 = IE->getOperand(0);
      Op1 = IE->getOperand(1);
1279 }
1280 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1281 Op1);
1282 }
1283
1284 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1285 int VF,
1286 const APInt &DemandedDstElts,
1287 TTI::TargetCostKind CostKind) {
1288 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1289 "Unexpected size of DemandedDstElts.");
1290
1291 InstructionCost Cost;
1292
    auto *SrcVT = FixedVectorType::get(EltTy, VF);
    auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1295
1296 // The Mask shuffling cost is extract all the elements of the Mask
1297 // and insert each of them Factor times into the wide vector:
1298 //
1299 // E.g. an interleaved group with factor 3:
1300 // %mask = icmp ult <8 x i32> %vec1, %vec2
1301 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1302 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1303 // The cost is estimated as extract all mask elements from the <8xi1> mask
1304 // vector and insert them factor times into the <24xi1> shuffled mask
1305 // vector.
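    // With hypothetical unit element costs and all destination elements
    // demanded, that example amounts to 8 extracts + 24 inserts = 32.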
    APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1307 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1308 /*Insert*/ false,
1309 /*Extract*/ true, CostKind);
1310 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1311 /*Insert*/ true,
1312 /*Extract*/ false, CostKind);
1313
1314 return Cost;
1315 }
1316
1317 InstructionCost
1318 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1319 unsigned AddressSpace, TTI::TargetCostKind CostKind,
                  TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
1321 const Instruction *I = nullptr) {
1322 assert(!Src->isVoidTy() && "Invalid type");
1323 // Assume types, such as structs, are expensive.
1324 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1325 return 4;
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1327
1328 // Assuming that all loads of legal types cost 1.
1329 InstructionCost Cost = LT.first;
1330 if (CostKind != TTI::TCK_RecipThroughput)
1331 return Cost;
1332
1333 const DataLayout &DL = this->getDataLayout();
1334 if (Src->isVectorTy() &&
1335 // In practice it's not currently possible to have a change in lane
1336 // length for extending loads or truncating stores so both types should
1337 // have the same scalable property.
        TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
                            LT.second.getSizeInBits())) {
1340 // This is a vector load that legalizes to a larger type than the vector
1341 // itself. Unless the corresponding extending load or truncating store is
1342 // legal, then this will scalarize.
1343 TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1344 EVT MemVT = getTLI()->getValueType(DL, Src);
1345 if (Opcode == Instruction::Store)
1346 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1347 else
1348 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1349
1350 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1351 // This is a vector load/store for some illegal type that is scalarized.
1352 // We must account for the cost of building or decomposing the vector.
1353 Cost += getScalarizationOverhead(
          cast<VectorType>(Src), Opcode != Instruction::Store,
1355 Opcode == Instruction::Store, CostKind);
1356 }
1357 }
1358
1359 return Cost;
1360 }
1361
1362 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1363 Align Alignment, unsigned AddressSpace,
1364 TTI::TargetCostKind CostKind) {
    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment,
                                       /*VariableMask=*/true,
                                       /*IsGatherScatter=*/false, CostKind);
1367 }
1368
1369 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1370 const Value *Ptr, bool VariableMask,
1371 Align Alignment,
1372 TTI::TargetCostKind CostKind,
1373 const Instruction *I = nullptr) {
    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
                                       /*IsGatherScatter=*/true, CostKind);
1376 }
1377
1378 InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
1379 const Value *Ptr, bool VariableMask,
1380 Align Alignment,
1381 TTI::TargetCostKind CostKind,
1382 const Instruction *I) {
1383 // For a target without strided memory operations (or for an illegal
1384 // operation type on one which does), assume we lower to a gather/scatter
1385 // operation. (Which may in turn be scalarized.)
1386 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1387 Alignment, CostKind, I);
1388 }
1389
1390 InstructionCost getInterleavedMemoryOpCost(
1391 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1392 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1393 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1394
1395 // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(VecTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(VecTy);
1400
1401 unsigned NumElts = VT->getNumElements();
1402 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1403
1404 unsigned NumSubElts = NumElts / Factor;
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1406
1407 // Firstly, the cost of load/store operation.
1408 InstructionCost Cost;
1409 if (UseMaskForCond || UseMaskForGaps)
1410 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1411 AddressSpace, CostKind);
1412 else
1413 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1414 CostKind);
1415
1416 // Legalize the vector type, and get the legalized and unlegalized type
1417 // sizes.
    MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1419 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1420 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1421
1422 // Scale the cost of the memory operation by the fraction of legalized
1423 // instructions that will actually be used. We shouldn't account for the
1424 // cost of dead instructions since they will be removed.
1425 //
1426 // E.g., An interleaved load of factor 8:
1427 // %vec = load <16 x i64>, <16 x i64>* %ptr
1428 // %v0 = shufflevector %vec, undef, <0, 8>
1429 //
1430 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1431 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1432 // type). The other loads are unused.
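    //
    // In that example only 2 of the 8 legal loads are used, so the memory
    // cost computed above is scaled by 2/8 (rounding up) below.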
1433 //
1434 // TODO: Note that legalization can turn masked loads/stores into unmasked
1435 // (legalized) loads/stores. This can be reflected in the cost.
1436 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1437 // The number of loads of a legal type it will take to represent a load
1438 // of the unlegalized vector type.
      unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1444
1445 // Determine which legal instructions will be used.
1446 BitVector UsedInsts(NumLegalInsts, false);
1447 for (unsigned Index : Indices)
1448 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1449 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1450
1451 // Scale the cost of the load by the fraction of legal instructions that
1452 // will be used.
      Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1454 }
1455
    // Then add the cost of the interleave operation.
1457 assert(Indices.size() <= Factor &&
1458 "Interleaved memory op has too many members");
1459
    const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
    const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);

    APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1464 for (unsigned Index : Indices) {
1465 assert(Index < Factor && "Invalid index for interleaved memory op");
1466 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1467 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1468 }
1469
1470 if (Opcode == Instruction::Load) {
1471 // The interleave cost is similar to extract sub vectors' elements
1472 // from the wide vector, and insert them into sub vectors.
1473 //
1474 // E.g. An interleaved load of factor 2 (with one member of index 0):
1475 // %vec = load <8 x i32>, <8 x i32>* %ptr
1476 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1477 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1478 // <8 x i32> vector and insert them into a <4 x i32> vector.
1479 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1480 SubVT, DemandedAllSubElts,
1481 /*Insert*/ true, /*Extract*/ false, CostKind);
1482 Cost += Indices.size() * InsSubCost;
1483 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1484 /*Insert*/ false,
1485 /*Extract*/ true, CostKind);
1486 } else {
1487 // The interleave cost is extract elements from sub vectors, and
1488 // insert them into the wide vector.
1489 //
1490 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1491 // (using VF=4):
1492 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1493 // %gaps.mask = <true, true, false, true, true, false,
1494 // true, true, false, true, true, false>
1495 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1496 // i32 Align, <12 x i1> %gaps.mask
1497 // The cost is estimated as extract all elements (of actual members,
1498 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1499 // i32> vector.
1500 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1501 SubVT, DemandedAllSubElts,
1502 /*Insert*/ false, /*Extract*/ true, CostKind);
1503 Cost += ExtSubCost * Indices.size();
1504 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1505 /*Insert*/ true,
1506 /*Extract*/ false, CostKind);
1507 }
1508
1509 if (!UseMaskForCond)
1510 return Cost;
1511
1512 Type *I8Type = Type::getInt8Ty(C&: VT->getContext());
1513
1514 Cost += thisT()->getReplicationShuffleCost(
1515 I8Type, Factor, NumSubElts,
1516 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1517 CostKind);
1518
1519 // The Gaps mask is invariant and created outside the loop, therefore the
1520 // cost of creating it is not accounted for here. However, if we have both
1521 // a MaskForGaps and some other mask that guards the execution of the
1522 // memory access, we need to account for the cost of And-ing the two masks
1523 // inside the loop.
1524 if (UseMaskForGaps) {
1525 auto *MaskVT = FixedVectorType::get(ElementType: I8Type, NumElts);
1526 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1527 CostKind);
1528 }
1529
1530 return Cost;
1531 }
1532
1533 /// Get intrinsic cost based on arguments.
1534 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1535 TTI::TargetCostKind CostKind) {
1536 // Check for generically free intrinsics.
1537 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1538 return 0;
1539
1540 // Assume that target intrinsics are cheap.
1541 Intrinsic::ID IID = ICA.getID();
1542 if (Function::isTargetIntrinsic(IID))
1543 return TargetTransformInfo::TCC_Basic;
1544
1545 if (ICA.isTypeBasedOnly())
1546 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1547
1548 Type *RetTy = ICA.getReturnType();
1549
1550 ElementCount RetVF =
1551 (RetTy->isVectorTy() ? cast<VectorType>(Val: RetTy)->getElementCount()
1552 : ElementCount::getFixed(MinVal: 1));
1553 const IntrinsicInst *I = ICA.getInst();
1554 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1555 FastMathFlags FMF = ICA.getFlags();
1556 switch (IID) {
1557 default:
1558 break;
1559
1560 case Intrinsic::powi:
1561 if (auto *RHSC = dyn_cast<ConstantInt>(Val: Args[1])) {
1562 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1563 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1564 ShouldOptForSize)) {
1565 // The cost is modeled on the expansion performed by ExpandPowI in
1566 // SelectionDAGBuilder.
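        // For example (illustrative): powi(x, 11) has Exponent = 0b1011, so
        // ActiveBits = 4 and PopCount = 3, i.e. (4 + 3 - 2) = 5 fmuls, which
        // matches the 3 squarings and 2 multiplies of square-and-multiply; a
        // negative exponent adds one fdiv for the final reciprocal.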
1567 APInt Exponent = RHSC->getValue().abs();
1568 unsigned ActiveBits = Exponent.getActiveBits();
1569 unsigned PopCount = Exponent.popcount();
1570 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1571 thisT()->getArithmeticInstrCost(
1572 Instruction::FMul, RetTy, CostKind);
1573 if (RHSC->isNegative())
1574 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1575 CostKind);
1576 return Cost;
1577 }
1578 }
1579 break;
1580 case Intrinsic::cttz:
1581 // FIXME: If necessary, this should go in target-specific overrides.
1582 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1583 return TargetTransformInfo::TCC_Basic;
1584 break;
1585
1586 case Intrinsic::ctlz:
1587 // FIXME: If necessary, this should go in target-specific overrides.
1588 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1589 return TargetTransformInfo::TCC_Basic;
1590 break;
1591
1592 case Intrinsic::memcpy:
1593 return thisT()->getMemcpyCost(ICA.getInst());
1594
1595 case Intrinsic::masked_scatter: {
1596 const Value *Mask = Args[3];
1597 bool VarMask = !isa<Constant>(Val: Mask);
1598 Align Alignment = cast<ConstantInt>(Val: Args[2])->getAlignValue();
1599 return thisT()->getGatherScatterOpCost(Instruction::Store,
1600 ICA.getArgTypes()[0], Args[1],
1601 VarMask, Alignment, CostKind, I);
1602 }
1603 case Intrinsic::masked_gather: {
1604 const Value *Mask = Args[2];
1605 bool VarMask = !isa<Constant>(Val: Mask);
1606 Align Alignment = cast<ConstantInt>(Val: Args[1])->getAlignValue();
1607 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1608 VarMask, Alignment, CostKind, I);
1609 }
1610 case Intrinsic::experimental_vp_strided_store: {
1611 const Value *Data = Args[0];
1612 const Value *Ptr = Args[1];
1613 const Value *Mask = Args[3];
1614 const Value *EVL = Args[4];
1615 bool VarMask = !isa<Constant>(Val: Mask) || !isa<Constant>(Val: EVL);
1616 Align Alignment = I->getParamAlign(ArgNo: 1).valueOrOne();
1617 return thisT()->getStridedMemoryOpCost(Instruction::Store,
1618 Data->getType(), Ptr, VarMask,
1619 Alignment, CostKind, I);
1620 }
1621 case Intrinsic::experimental_vp_strided_load: {
1622 const Value *Ptr = Args[0];
1623 const Value *Mask = Args[2];
1624 const Value *EVL = Args[3];
1625 bool VarMask = !isa<Constant>(Val: Mask) || !isa<Constant>(Val: EVL);
1626 Align Alignment = I->getParamAlign(ArgNo: 0).valueOrOne();
1627 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1628 VarMask, Alignment, CostKind, I);
1629 }
1630 case Intrinsic::experimental_stepvector: {
1631 if (isa<ScalableVectorType>(Val: RetTy))
1632 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1633 // The cost of materialising a constant integer vector.
1634 return TargetTransformInfo::TCC_Basic;
1635 }
1636 case Intrinsic::vector_extract: {
1637 // FIXME: Handle case where a scalable vector is extracted from a scalable
1638 // vector
1639 if (isa<ScalableVectorType>(Val: RetTy))
1640 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1641 unsigned Index = cast<ConstantInt>(Val: Args[1])->getZExtValue();
1642 return thisT()->getShuffleCost(
1643 TTI::SK_ExtractSubvector, cast<VectorType>(Val: Args[0]->getType()),
1644 std::nullopt, CostKind, Index, cast<VectorType>(Val: RetTy));
1645 }
1646 case Intrinsic::vector_insert: {
1647 // FIXME: Handle case where a scalable vector is inserted into a scalable
1648 // vector
1649 if (isa<ScalableVectorType>(Val: Args[1]->getType()))
1650 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1651 unsigned Index = cast<ConstantInt>(Val: Args[2])->getZExtValue();
1652 return thisT()->getShuffleCost(
1653 TTI::SK_InsertSubvector, cast<VectorType>(Val: Args[0]->getType()),
1654 std::nullopt, CostKind, Index, cast<VectorType>(Val: Args[1]->getType()));
1655 }
1656 case Intrinsic::experimental_vector_reverse: {
1657 return thisT()->getShuffleCost(
1658 TTI::SK_Reverse, cast<VectorType>(Val: Args[0]->getType()), std::nullopt,
1659 CostKind, 0, cast<VectorType>(Val: RetTy));
1660 }
1661 case Intrinsic::experimental_vector_splice: {
1662 unsigned Index = cast<ConstantInt>(Val: Args[2])->getZExtValue();
1663 return thisT()->getShuffleCost(
1664 TTI::SK_Splice, cast<VectorType>(Val: Args[0]->getType()), std::nullopt,
1665 CostKind, Index, cast<VectorType>(Val: RetTy));
1666 }
1667 case Intrinsic::vector_reduce_add:
1668 case Intrinsic::vector_reduce_mul:
1669 case Intrinsic::vector_reduce_and:
1670 case Intrinsic::vector_reduce_or:
1671 case Intrinsic::vector_reduce_xor:
1672 case Intrinsic::vector_reduce_smax:
1673 case Intrinsic::vector_reduce_smin:
1674 case Intrinsic::vector_reduce_fmax:
1675 case Intrinsic::vector_reduce_fmin:
1676 case Intrinsic::vector_reduce_fmaximum:
1677 case Intrinsic::vector_reduce_fminimum:
1678 case Intrinsic::vector_reduce_umax:
1679 case Intrinsic::vector_reduce_umin: {
1680 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1681 return getTypeBasedIntrinsicInstrCost(ICA: Attrs, CostKind);
1682 }
1683 case Intrinsic::vector_reduce_fadd:
1684 case Intrinsic::vector_reduce_fmul: {
1685 IntrinsicCostAttributes Attrs(
1686 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1687 return getTypeBasedIntrinsicInstrCost(ICA: Attrs, CostKind);
1688 }
1689 case Intrinsic::fshl:
1690 case Intrinsic::fshr: {
1691 const Value *X = Args[0];
1692 const Value *Y = Args[1];
1693 const Value *Z = Args[2];
1694 const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(V: X);
1695 const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(V: Y);
1696 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(V: Z);
1697 const TTI::OperandValueInfo OpInfoBW =
1698 {TTI::OK_UniformConstantValue,
1699 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1700 : TTI::OP_None};
1701
1702 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1703 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
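      // Roughly: a funnel shift with a constant shift amount is modeled as
      // 4 ops (or + sub + shl + lshr); a variable amount adds a urem, and a
      // non-rotate (X != Y) additionally pays for an icmp/select pair to
      // handle a shift amount of zero.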
1704 InstructionCost Cost = 0;
1705 Cost +=
1706 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1707 Cost +=
1708 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1709 Cost += thisT()->getArithmeticInstrCost(
1710 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1711 {OpInfoZ.Kind, TTI::OP_None});
1712 Cost += thisT()->getArithmeticInstrCost(
1713 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1714 {OpInfoZ.Kind, TTI::OP_None});
1715 // Non-constant shift amounts require a modulo.
1716 if (!OpInfoZ.isConstant())
1717 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1718 CostKind, OpInfoZ, OpInfoBW);
1719 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1720 if (X != Y) {
1721 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
1722 Cost +=
1723 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1724 CmpInst::ICMP_EQ, CostKind);
1725 Cost +=
1726 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1727 CmpInst::ICMP_EQ, CostKind);
1728 }
1729 return Cost;
1730 }
1731 case Intrinsic::get_active_lane_mask: {
1732 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1733 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1734
1735 // If we're not expanding the intrinsic then we assume this is cheap
1736 // to implement.
1737 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1738 return getTypeLegalizationCost(Ty: RetTy).first;
1739 }
1740
1741 // Create the expanded types that will be used to calculate the uadd_sat
1742 // operation.
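      // Roughly, the expansion being costed is (with %base and %n the two
      // intrinsic operands):
      //   %lanes = uadd.sat(splat(%base), stepvector)   ; in ExpRetTy
      //   %mask  = icmp ult %lanes, splat(%n)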
1743 Type *ExpRetTy = VectorType::get(
1744 ElementType: ICA.getArgTypes()[0], EC: cast<VectorType>(Val: RetTy)->getElementCount());
1745 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1746 InstructionCost Cost =
1747 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1748 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1749 CmpInst::ICMP_ULT, CostKind);
1750 return Cost;
1751 }
1752 }
1753
1754 // VP Intrinsics should have the same cost as their non-vp counterpart.
1755 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1756 // counterpart when the vector length argument is smaller than the maximum
1757 // vector length.
1758 // TODO: Support other kinds of VPIntrinsics
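    // E.g. (illustratively) llvm.vp.fadd on <4 x float> is costed as a plain
    // fadd <4 x float>, and llvm.vp.reduce.add maps to llvm.vector.reduce.add
    // with the mask and vector-length operands dropped.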
1759 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1760 std::optional<unsigned> FOp =
1761 VPIntrinsic::getFunctionalOpcodeForVP(ID: ICA.getID());
1762 if (FOp) {
1763 if (ICA.getID() == Intrinsic::vp_load) {
1764 Align Alignment;
1765 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(Val: ICA.getInst()))
1766 Alignment = VPI->getPointerAlignment().valueOrOne();
1767 unsigned AS = 0;
1768 if (ICA.getArgs().size() > 1)
1769 if (auto *PtrTy =
1770 dyn_cast<PointerType>(Val: ICA.getArgs()[0]->getType()))
1771 AS = PtrTy->getAddressSpace();
1772 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1773 AS, CostKind);
1774 }
1775 if (ICA.getID() == Intrinsic::vp_store) {
1776 Align Alignment;
1777 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(Val: ICA.getInst()))
1778 Alignment = VPI->getPointerAlignment().valueOrOne();
1779 unsigned AS = 0;
1780 if (ICA.getArgs().size() >= 2)
1781 if (auto *PtrTy =
1782 dyn_cast<PointerType>(Val: ICA.getArgs()[1]->getType()))
1783 AS = PtrTy->getAddressSpace();
1784 return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
1785 AS, CostKind);
1786 }
1787 if (VPBinOpIntrinsic::isVPBinOp(ID: ICA.getID())) {
1788 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1789 CostKind);
1790 }
1791 }
1792
1793 std::optional<Intrinsic::ID> FID =
1794 VPIntrinsic::getFunctionalIntrinsicIDForVP(ID: ICA.getID());
1795 if (FID) {
1796 // The non-vp version will have the same Args/Tys, except for the mask
1796 // and vector length.
1797 assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
1798 "Expected VPIntrinsic to have Mask and Vector Length args and "
1799 "types");
1800 ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(N: 2);
1801
1802 // VPReduction intrinsics have a start value argument that their non-vp
1803 // counterparts do not have, except for fadd and fmul, whose non-vp
1804 // counterparts also take a start value.
1805 if (VPReductionIntrinsic::isVPReduction(ID: ICA.getID()) &&
1806 *FID != Intrinsic::vector_reduce_fadd &&
1807 *FID != Intrinsic::vector_reduce_fmul)
1808 NewTys = NewTys.drop_front();
1809
1810 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
1811 ICA.getFlags());
1812 return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1813 }
1814 }
1815
1816 // Assume that we need to scalarize this intrinsic.
1817 // Compute the scalarization overhead based on Args for a vector
1818 // intrinsic.
1819 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1820 if (RetVF.isVector() && !RetVF.isScalable()) {
1821 ScalarizationCost = 0;
1822 if (!RetTy->isVoidTy())
1823 ScalarizationCost += getScalarizationOverhead(
1824 cast<VectorType>(Val: RetTy),
1825 /*Insert*/ true, /*Extract*/ false, CostKind);
1826 ScalarizationCost +=
1827 getOperandsScalarizationOverhead(Args, Tys: ICA.getArgTypes(), CostKind);
1828 }
1829
1830 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1831 ScalarizationCost);
1832 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1833 }
1834
1835 /// Get intrinsic cost based on argument types.
1836 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1837 /// cost of scalarizing the arguments and the return value will be computed
1838 /// based on types.
1839 InstructionCost
1840 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1841 TTI::TargetCostKind CostKind) {
1842 Intrinsic::ID IID = ICA.getID();
1843 Type *RetTy = ICA.getReturnType();
1844 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1845 FastMathFlags FMF = ICA.getFlags();
1846 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1847 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1848
1849 VectorType *VecOpTy = nullptr;
1850 if (!Tys.empty()) {
1851 // The vector reduction operand is operand 0 except for fadd/fmul.
1852 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1853 unsigned VecTyIndex = 0;
1854 if (IID == Intrinsic::vector_reduce_fadd ||
1855 IID == Intrinsic::vector_reduce_fmul)
1856 VecTyIndex = 1;
1857 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1858 VecOpTy = dyn_cast<VectorType>(Val: Tys[VecTyIndex]);
1859 }
1860
1861 // Library call cost - other than size, make it expensive.
1862 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1863 unsigned ISD = 0;
1864 switch (IID) {
1865 default: {
1866 // Scalable vectors cannot be scalarized, so return Invalid.
1867 if (isa<ScalableVectorType>(Val: RetTy) || any_of(Tys, [](const Type *Ty) {
1868 return isa<ScalableVectorType>(Val: Ty);
1869 }))
1870 return InstructionCost::getInvalid();
1871
1872 // Assume that we need to scalarize this intrinsic.
1873 InstructionCost ScalarizationCost =
1874 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1875 unsigned ScalarCalls = 1;
1876 Type *ScalarRetTy = RetTy;
1877 if (auto *RetVTy = dyn_cast<VectorType>(Val: RetTy)) {
1878 if (!SkipScalarizationCost)
1879 ScalarizationCost = getScalarizationOverhead(
1880 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1881 ScalarCalls = std::max(a: ScalarCalls,
1882 b: cast<FixedVectorType>(Val: RetVTy)->getNumElements());
1883 ScalarRetTy = RetTy->getScalarType();
1884 }
1885 SmallVector<Type *, 4> ScalarTys;
1886 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1887 Type *Ty = Tys[i];
1888 if (auto *VTy = dyn_cast<VectorType>(Val: Ty)) {
1889 if (!SkipScalarizationCost)
1890 ScalarizationCost += getScalarizationOverhead(
1891 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
1892 ScalarCalls = std::max(a: ScalarCalls,
1893 b: cast<FixedVectorType>(Val: VTy)->getNumElements());
1894 Ty = Ty->getScalarType();
1895 }
1896 ScalarTys.push_back(Elt: Ty);
1897 }
1898 if (ScalarCalls == 1)
1899 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1900
1901 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1902 InstructionCost ScalarCost =
1903 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1904
1905 return ScalarCalls * ScalarCost + ScalarizationCost;
1906 }
1907 // Look for intrinsics that can be lowered directly or turned into a scalar
1908 // intrinsic call.
1909 case Intrinsic::sqrt:
1910 ISD = ISD::FSQRT;
1911 break;
1912 case Intrinsic::sin:
1913 ISD = ISD::FSIN;
1914 break;
1915 case Intrinsic::cos:
1916 ISD = ISD::FCOS;
1917 break;
1918 case Intrinsic::exp:
1919 ISD = ISD::FEXP;
1920 break;
1921 case Intrinsic::exp2:
1922 ISD = ISD::FEXP2;
1923 break;
1924 case Intrinsic::exp10:
1925 ISD = ISD::FEXP10;
1926 break;
1927 case Intrinsic::log:
1928 ISD = ISD::FLOG;
1929 break;
1930 case Intrinsic::log10:
1931 ISD = ISD::FLOG10;
1932 break;
1933 case Intrinsic::log2:
1934 ISD = ISD::FLOG2;
1935 break;
1936 case Intrinsic::fabs:
1937 ISD = ISD::FABS;
1938 break;
1939 case Intrinsic::canonicalize:
1940 ISD = ISD::FCANONICALIZE;
1941 break;
1942 case Intrinsic::minnum:
1943 ISD = ISD::FMINNUM;
1944 break;
1945 case Intrinsic::maxnum:
1946 ISD = ISD::FMAXNUM;
1947 break;
1948 case Intrinsic::minimum:
1949 ISD = ISD::FMINIMUM;
1950 break;
1951 case Intrinsic::maximum:
1952 ISD = ISD::FMAXIMUM;
1953 break;
1954 case Intrinsic::copysign:
1955 ISD = ISD::FCOPYSIGN;
1956 break;
1957 case Intrinsic::floor:
1958 ISD = ISD::FFLOOR;
1959 break;
1960 case Intrinsic::ceil:
1961 ISD = ISD::FCEIL;
1962 break;
1963 case Intrinsic::trunc:
1964 ISD = ISD::FTRUNC;
1965 break;
1966 case Intrinsic::nearbyint:
1967 ISD = ISD::FNEARBYINT;
1968 break;
1969 case Intrinsic::rint:
1970 ISD = ISD::FRINT;
1971 break;
1972 case Intrinsic::lrint:
1973 ISD = ISD::LRINT;
1974 break;
1975 case Intrinsic::llrint:
1976 ISD = ISD::LLRINT;
1977 break;
1978 case Intrinsic::round:
1979 ISD = ISD::FROUND;
1980 break;
1981 case Intrinsic::roundeven:
1982 ISD = ISD::FROUNDEVEN;
1983 break;
1984 case Intrinsic::pow:
1985 ISD = ISD::FPOW;
1986 break;
1987 case Intrinsic::fma:
1988 ISD = ISD::FMA;
1989 break;
1990 case Intrinsic::fmuladd:
1991 ISD = ISD::FMA;
1992 break;
1993 case Intrinsic::experimental_constrained_fmuladd:
1994 ISD = ISD::STRICT_FMA;
1995 break;
1996 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1997 case Intrinsic::lifetime_start:
1998 case Intrinsic::lifetime_end:
1999 case Intrinsic::sideeffect:
2000 case Intrinsic::pseudoprobe:
2001 case Intrinsic::arithmetic_fence:
2002 return 0;
2003 case Intrinsic::masked_store: {
2004 Type *Ty = Tys[0];
2005 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2006 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2007 CostKind);
2008 }
2009 case Intrinsic::masked_load: {
2010 Type *Ty = RetTy;
2011 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2012 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2013 CostKind);
2014 }
2015 case Intrinsic::vector_reduce_add:
2016 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
2017 std::nullopt, CostKind);
2018 case Intrinsic::vector_reduce_mul:
2019 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
2020 std::nullopt, CostKind);
2021 case Intrinsic::vector_reduce_and:
2022 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
2023 std::nullopt, CostKind);
2024 case Intrinsic::vector_reduce_or:
2025 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
2026 std::nullopt, CostKind);
2027 case Intrinsic::vector_reduce_xor:
2028 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
2029 std::nullopt, CostKind);
2030 case Intrinsic::vector_reduce_fadd:
2031 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
2032 FMF, CostKind);
2033 case Intrinsic::vector_reduce_fmul:
2034 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
2035 FMF, CostKind);
2036 case Intrinsic::vector_reduce_smax:
2037 return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy,
2038 ICA.getFlags(), CostKind);
2039 case Intrinsic::vector_reduce_smin:
2040 return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy,
2041 ICA.getFlags(), CostKind);
2042 case Intrinsic::vector_reduce_umax:
2043 return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy,
2044 ICA.getFlags(), CostKind);
2045 case Intrinsic::vector_reduce_umin:
2046 return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy,
2047 ICA.getFlags(), CostKind);
2048 case Intrinsic::vector_reduce_fmax:
2049 return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy,
2050 ICA.getFlags(), CostKind);
2051 case Intrinsic::vector_reduce_fmin:
2052 return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy,
2053 ICA.getFlags(), CostKind);
2054 case Intrinsic::vector_reduce_fmaximum:
2055 return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy,
2056 ICA.getFlags(), CostKind);
2057 case Intrinsic::vector_reduce_fminimum:
2058 return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy,
2059 ICA.getFlags(), CostKind);
2060 case Intrinsic::abs: {
2061 // abs(X) = select(icmp(X,0),X,sub(0,X))
2062 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2063 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2064 InstructionCost Cost = 0;
2065 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2066 Pred, CostKind);
2067 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2068 Pred, CostKind);
2069 // TODO: Should we add an OperandValueProperties::OP_Zero property?
2070 Cost += thisT()->getArithmeticInstrCost(
2071 BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None});
2072 return Cost;
2073 }
2074 case Intrinsic::smax:
2075 case Intrinsic::smin:
2076 case Intrinsic::umax:
2077 case Intrinsic::umin: {
2078 // minmax(X,Y) = select(icmp(X,Y),X,Y)
2079 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2080 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2081 CmpInst::Predicate Pred =
2082 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2083 InstructionCost Cost = 0;
2084 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2085 Pred, CostKind);
2086 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2087 Pred, CostKind);
2088 return Cost;
2089 }
2090 case Intrinsic::sadd_sat:
2091 case Intrinsic::ssub_sat: {
2092 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2093
2094 Type *OpTy = StructType::create(Elements: {RetTy, CondTy});
2095 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2096 ? Intrinsic::sadd_with_overflow
2097 : Intrinsic::ssub_with_overflow;
2098 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2099
2100 // SatMax -> Overflow && SumDiff < 0
2101 // SatMin -> Overflow && SumDiff >= 0
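      // A rough sketch of the modeled expansion:
      //   {%s, %o} = s(add|sub).with.overflow(X, Y)
      //   %sat     = select(%s < 0, SatMax, SatMin)
      //   %res     = select(%o, %sat, %s)
      // which is costed below as one overflow intrinsic, one icmp and two
      // selects.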
2102 InstructionCost Cost = 0;
2103 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2104 nullptr, ScalarizationCostPassed);
2105 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2106 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2107 Pred, CostKind);
2108 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2109 CondTy, Pred, CostKind);
2110 return Cost;
2111 }
2112 case Intrinsic::uadd_sat:
2113 case Intrinsic::usub_sat: {
2114 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2115
2116 Type *OpTy = StructType::create(Elements: {RetTy, CondTy});
2117 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2118 ? Intrinsic::uadd_with_overflow
2119 : Intrinsic::usub_with_overflow;
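      // A rough sketch of the modeled expansion:
      //   uadd.sat(X, Y): {%s, %o} = uadd.with.overflow(X, Y); %res = %o ? -1 : %s
      //   usub.sat(X, Y): {%s, %o} = usub.with.overflow(X, Y); %res = %o ?  0 : %s
      // i.e. one overflow intrinsic plus one select.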
2120
2121 InstructionCost Cost = 0;
2122 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2123 nullptr, ScalarizationCostPassed);
2124 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2125 Cost +=
2126 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2127 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2128 return Cost;
2129 }
2130 case Intrinsic::smul_fix:
2131 case Intrinsic::umul_fix: {
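      // A rough sketch of the modeled expansion: both operands are extended
      // to twice the bit width and multiplied there, the result is split back
      // into a low and a high half (two truncs), and the scaled result is
      // reassembled with lshr + shl + or. The costs added below mirror that:
      // 2 exts, 1 wide mul, 2 truncs, lshr, shl and or.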
2132 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2133 Type *ExtTy = RetTy->getWithNewBitWidth(NewBitWidth: ExtSize);
2134
2135 unsigned ExtOp =
2136 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2137 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2138
2139 InstructionCost Cost = 0;
2140 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2141 Cost +=
2142 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2143 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2144 CCH, CostKind);
2145 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
2146 CostKind,
2147 {TTI::OK_AnyValue, TTI::OP_None},
2148 {TTI::OK_UniformConstantValue, TTI::OP_None});
2149 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
2150 {TTI::OK_AnyValue, TTI::OP_None},
2151 {TTI::OK_UniformConstantValue, TTI::OP_None});
2152 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2153 return Cost;
2154 }
2155 case Intrinsic::sadd_with_overflow:
2156 case Intrinsic::ssub_with_overflow: {
2157 Type *SumTy = RetTy->getContainedType(i: 0);
2158 Type *OverflowTy = RetTy->getContainedType(i: 1);
2159 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2160 ? BinaryOperator::Add
2161 : BinaryOperator::Sub;
2162
2163 // Add:
2164 // Overflow -> (Result < LHS) ^ (RHS < 0)
2165 // Sub:
2166 // Overflow -> (Result < LHS) ^ (RHS > 0)
2167 InstructionCost Cost = 0;
2168 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2169 Cost += 2 * thisT()->getCmpSelInstrCost(
2170 Instruction::ICmp, SumTy, OverflowTy,
2171 CmpInst::ICMP_SGT, CostKind);
2172 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2173 CostKind);
2174 return Cost;
2175 }
2176 case Intrinsic::uadd_with_overflow:
2177 case Intrinsic::usub_with_overflow: {
2178 Type *SumTy = RetTy->getContainedType(i: 0);
2179 Type *OverflowTy = RetTy->getContainedType(i: 1);
2180 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2181 ? BinaryOperator::Add
2182 : BinaryOperator::Sub;
2183 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2184 ? CmpInst::ICMP_ULT
2185 : CmpInst::ICMP_UGT;
2186
2187 InstructionCost Cost = 0;
2188 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2189 Cost +=
2190 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
2191 Pred, CostKind);
2192 return Cost;
2193 }
2194 case Intrinsic::smul_with_overflow:
2195 case Intrinsic::umul_with_overflow: {
2196 Type *MulTy = RetTy->getContainedType(i: 0);
2197 Type *OverflowTy = RetTy->getContainedType(i: 1);
2198 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2199 Type *ExtTy = MulTy->getWithNewBitWidth(NewBitWidth: ExtSize);
2200 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2201
2202 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2203 TTI::CastContextHint CCH = TTI::CastContextHint::None;
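      // A rough sketch of the modeled expansion: extend both operands to
      // twice the width, multiply there, and truncate out the low and high
      // halves (lshr + truncs). Overflow is detected by comparing the high
      // half against zero (unsigned) or against the low half's sign spread,
      // which costs an extra ashr, when signed.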
2204
2205 InstructionCost Cost = 0;
2206 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2207 Cost +=
2208 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2209 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2210 CCH, CostKind);
2211 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
2212 CostKind,
2213 {TTI::OK_AnyValue, TTI::OP_None},
2214 {TTI::OK_UniformConstantValue, TTI::OP_None});
2215
2216 if (IsSigned)
2217 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
2218 CostKind,
2219 {TTI::OK_AnyValue, TTI::OP_None},
2220 {TTI::OK_UniformConstantValue, TTI::OP_None});
2221
2222 Cost += thisT()->getCmpSelInstrCost(
2223 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2224 return Cost;
2225 }
2226 case Intrinsic::fptosi_sat:
2227 case Intrinsic::fptoui_sat: {
2228 if (Tys.empty())
2229 break;
2230 Type *FromTy = Tys[0];
2231 bool IsSigned = IID == Intrinsic::fptosi_sat;
2232
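      // A rough sketch of the modeled expansion: clamp the input to the
      // destination's representable range with minnum/maxnum, convert with
      // fptosi/fptoui, and for the signed case additionally select zero on a
      // NaN input (fcmp uno + select). The costs added below mirror that.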
2233 InstructionCost Cost = 0;
2234 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2235 {FromTy, FromTy});
2236 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2237 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2238 {FromTy, FromTy});
2239 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2240 Cost += thisT()->getCastInstrCost(
2241 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2242 TTI::CastContextHint::None, CostKind);
2243 if (IsSigned) {
2244 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
2245 Cost += thisT()->getCmpSelInstrCost(
2246 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2247 Cost += thisT()->getCmpSelInstrCost(
2248 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2249 }
2250 return Cost;
2251 }
2252 case Intrinsic::ctpop:
2253 ISD = ISD::CTPOP;
2254 // In case of legalization use TCC_Expensive. This is cheaper than a
2255 // library call but still not a cheap instruction.
2256 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2257 break;
2258 case Intrinsic::ctlz:
2259 ISD = ISD::CTLZ;
2260 break;
2261 case Intrinsic::cttz:
2262 ISD = ISD::CTTZ;
2263 break;
2264 case Intrinsic::bswap:
2265 ISD = ISD::BSWAP;
2266 break;
2267 case Intrinsic::bitreverse:
2268 ISD = ISD::BITREVERSE;
2269 break;
2270 }
2271
2272 const TargetLoweringBase *TLI = getTLI();
2273 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: RetTy);
2274
2275 if (TLI->isOperationLegalOrPromote(Op: ISD, VT: LT.second)) {
2276 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2277 TLI->isFAbsFree(LT.second)) {
2278 return 0;
2279 }
2280
2281 // The operation is legal. Assume it costs 1.
2282 // If the type is split to multiple registers, assume that there is some
2283 // overhead to this.
2284 // TODO: Once we have extract/insert subvector cost we need to use them.
2285 if (LT.first > 1)
2286 return (LT.first * 2);
2287 else
2288 return (LT.first * 1);
2289 } else if (!TLI->isOperationExpand(Op: ISD, VT: LT.second)) {
2290 // If the operation is custom lowered then assume
2291 // that the code is twice as expensive.
2292 return (LT.first * 2);
2293 }
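    // E.g. (illustrative): a legal fsqrt on a type that legalization splits
    // into two registers (LT.first == 2) is costed as 4, while a
    // custom-lowered operation on a single register is costed as 2.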
2294
2295 // If we can't lower fmuladd into an FMA estimate the cost as a floating
2296 // point mul followed by an add.
2297 if (IID == Intrinsic::fmuladd)
2298 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2299 CostKind) +
2300 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2301 CostKind);
2302 if (IID == Intrinsic::experimental_constrained_fmuladd) {
2303 IntrinsicCostAttributes FMulAttrs(
2304 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2305 IntrinsicCostAttributes FAddAttrs(
2306 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2307 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2308 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2309 }
2310
2311 // Else, assume that we need to scalarize this intrinsic. For math builtins
2312 // this will emit a costly libcall, adding call overhead and spills. Make it
2313 // very expensive.
2314 if (auto *RetVTy = dyn_cast<VectorType>(Val: RetTy)) {
2315 // Scalable vectors cannot be scalarized, so return Invalid.
2316 if (isa<ScalableVectorType>(Val: RetTy) || any_of(Tys, [](const Type *Ty) {
2317 return isa<ScalableVectorType>(Val: Ty);
2318 }))
2319 return InstructionCost::getInvalid();
2320
2321 InstructionCost ScalarizationCost =
2322 SkipScalarizationCost
2323 ? ScalarizationCostPassed
2324 : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2325 /*Extract*/ false, CostKind);
2326
2327 unsigned ScalarCalls = cast<FixedVectorType>(Val: RetVTy)->getNumElements();
2328 SmallVector<Type *, 4> ScalarTys;
2329 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2330 Type *Ty = Tys[i];
2331 if (Ty->isVectorTy())
2332 Ty = Ty->getScalarType();
2333 ScalarTys.push_back(Elt: Ty);
2334 }
2335 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2336 InstructionCost ScalarCost =
2337 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2338 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2339 if (auto *VTy = dyn_cast<VectorType>(Val: Tys[i])) {
2340 if (!ICA.skipScalarizationCost())
2341 ScalarizationCost += getScalarizationOverhead(
2342 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2343 ScalarCalls = std::max(a: ScalarCalls,
2344 b: cast<FixedVectorType>(Val: VTy)->getNumElements());
2345 }
2346 }
2347 return ScalarCalls * ScalarCost + ScalarizationCost;
2348 }
2349
2350 // This is going to be turned into a library call, make it expensive.
2351 return SingleCallCost;
2352 }
2353
2354 /// Compute a cost of the given call instruction.
2355 ///
2356 /// Compute the cost of calling function F with return type RetTy and
2357 /// argument types Tys. F might be nullptr, in this case the cost of an
2358 /// arbitrary call with the specified signature will be returned.
2359 /// This is used, for instance, when we estimate call of a vector
2360 /// counterpart of the given function.
2361 /// \param F Called function, might be nullptr.
2362 /// \param RetTy Return value types.
2363 /// \param Tys Argument types.
2364 /// \returns The cost of Call instruction.
2365 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2366 ArrayRef<Type *> Tys,
2367 TTI::TargetCostKind CostKind) {
2368 return 10;
2369 }
2370
2371 unsigned getNumberOfParts(Type *Tp) {
2372 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
2373 return LT.first.isValid() ? *LT.first.getValue() : 0;
2374 }
2375
2376 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2377 const SCEV *) {
2378 return 0;
2379 }
2380
2381 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2382 /// We're assuming that the reduction operation is performed in the following way:
2383 ///
2384 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2385 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n - 1, i32 undef, ..., i32 undef>
2386 /// \----------------v-------------/ \----------v------------/
2387 /// n/2 elements n/2 elements
2388 /// %red1 = op <n x t> %val, <n x t> val1
2389 /// After this operation we have a vector %red1 where only the first n/2
2390 /// elements are meaningful, the second n/2 elements are undefined and can be
2391 /// dropped. All other operations are actually working with the vector of
2392 /// length n/2, not n, though the real vector length is still n.
2393 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2394 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2 - 1, i32 undef, ..., i32 undef>
2395 /// \----------------v-------------/ \----------v------------/
2396 /// n/4 elements 3*n/4 elements
2397 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2398 /// length n/2, the resulting vector has length n/4 etc.
2399 ///
2400 /// The cost model should take into account that the actual length of the
2401 /// vector is reduced on each iteration.
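  /// For example (roughly), reducing <8 x i32> with a legal v4i32: one
  /// extract-subvector shuffle plus one v4i32 add halves the vector to
  /// <4 x i32>, the two remaining levels are each costed as a permute shuffle
  /// plus an add, and a final extractelement yields the scalar result.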
2402 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2403 TTI::TargetCostKind CostKind) {
2404 // Targets must implement a default value for the scalable case, since
2405 // we don't know how many lanes the vector has.
2406 if (isa<ScalableVectorType>(Val: Ty))
2407 return InstructionCost::getInvalid();
2408
2409 Type *ScalarTy = Ty->getElementType();
2410 unsigned NumVecElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
2411 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2412 ScalarTy == IntegerType::getInt1Ty(C&: Ty->getContext()) &&
2413 NumVecElts >= 2) {
2414 // Or reduction for i1 is represented as:
2415 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2416 // %res = cmp ne iReduxWidth %val, 0
2417 // And reduction for i1 is represented as:
2418 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2419 // %res = cmp eq iReduxWidth %val, -1 (all ones)
2420 Type *ValTy = IntegerType::get(C&: Ty->getContext(), NumBits: NumVecElts);
2421 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2422 TTI::CastContextHint::None, CostKind) +
2423 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2424 CmpInst::makeCmpResultType(opnd_type: ValTy),
2425 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2426 }
2427 unsigned NumReduxLevels = Log2_32(Value: NumVecElts);
2428 InstructionCost ArithCost = 0;
2429 InstructionCost ShuffleCost = 0;
2430 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2431 unsigned LongVectorCount = 0;
2432 unsigned MVTLen =
2433 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2434 while (NumVecElts > MVTLen) {
2435 NumVecElts /= 2;
2436 VectorType *SubTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: NumVecElts);
2437 ShuffleCost +=
2438 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2439 CostKind, NumVecElts, SubTy);
2440 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2441 Ty = SubTy;
2442 ++LongVectorCount;
2443 }
2444
2445 NumReduxLevels -= LongVectorCount;
2446
2447 // The minimal length of the vector is limited by the real length of vector
2448 // operations performed on the current platform. That's why several final
2449 // reduction operations are performed on the vectors with the same
2450 // architecture-dependent length.
2451
2452 // By default reductions need one shuffle per reduction level.
2453 ShuffleCost +=
2454 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2455 std::nullopt, CostKind, 0, Ty);
2456 ArithCost +=
2457 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2458 return ShuffleCost + ArithCost +
2459 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2460 CostKind, 0, nullptr, nullptr);
2461 }
2462
2463 /// Try to calculate the cost of performing strict (in-order) reductions,
2464 /// which involves doing a sequence of floating point additions in lane
2465 /// order, starting with an initial value. For example, consider a scalar
2466 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2467 ///
2468 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2469 ///
2470 /// %add1 = %InitVal + %v0
2471 /// %add2 = %add1 + %v1
2472 /// %add3 = %add2 + %v2
2473 /// %add4 = %add3 + %v3
2474 ///
2475 /// As a simple estimate we can say the cost of such a reduction is 4 times
2476 /// the cost of a scalar FP addition. We can only estimate the costs for
2477 /// fixed-width vectors here because for scalable vectors we do not know the
2478 /// runtime number of operations.
2479 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2480 TTI::TargetCostKind CostKind) {
2481 // Targets must implement a default value for the scalable case, since
2482 // we don't know how many lanes the vector has.
2483 if (isa<ScalableVectorType>(Val: Ty))
2484 return InstructionCost::getInvalid();
2485
2486 auto *VTy = cast<FixedVectorType>(Val: Ty);
2487 InstructionCost ExtractCost = getScalarizationOverhead(
2488 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2489 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2490 Opcode, VTy->getElementType(), CostKind);
2491 ArithCost *= VTy->getNumElements();
2492
2493 return ExtractCost + ArithCost;
2494 }
2495
2496 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2497 std::optional<FastMathFlags> FMF,
2498 TTI::TargetCostKind CostKind) {
2499 assert(Ty && "Unknown reduction vector type");
2500 if (TTI::requiresOrderedReduction(FMF))
2501 return getOrderedReductionCost(Opcode, Ty, CostKind);
2502 return getTreeReductionCost(Opcode, Ty, CostKind);
2503 }
2504
2505 /// Try to calculate op costs for min/max reduction operations.
2506 /// \param CondTy Conditional type for the Select instruction.
2507 InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2508 FastMathFlags FMF,
2509 TTI::TargetCostKind CostKind) {
2510 // Targets must implement a default value for the scalable case, since
2511 // we don't know how many lanes the vector has.
2512 if (isa<ScalableVectorType>(Val: Ty))
2513 return InstructionCost::getInvalid();
2514
2515 Type *ScalarTy = Ty->getElementType();
2516 unsigned NumVecElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
2517 unsigned NumReduxLevels = Log2_32(Value: NumVecElts);
2518 InstructionCost MinMaxCost = 0;
2519 InstructionCost ShuffleCost = 0;
2520 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2521 unsigned LongVectorCount = 0;
2522 unsigned MVTLen =
2523 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2524 while (NumVecElts > MVTLen) {
2525 NumVecElts /= 2;
2526 auto *SubTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: NumVecElts);
2527
2528 ShuffleCost +=
2529 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2530 CostKind, NumVecElts, SubTy);
2531
2532 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2533 MinMaxCost += getIntrinsicInstrCost(ICA: Attrs, CostKind);
2534 Ty = SubTy;
2535 ++LongVectorCount;
2536 }
2537
2538 NumReduxLevels -= LongVectorCount;
2539
2540 // The minimal length of the vector is limited by the real length of vector
2541 // operations performed on the current platform. That's why several final
2542 // reduction operations are performed on the vectors with the same
2543 // architecture-dependent length.
2544 ShuffleCost +=
2545 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2546 std::nullopt, CostKind, 0, Ty);
2547 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2548 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(ICA: Attrs, CostKind);
2549 // The last min/max should be in vector registers and we counted it above.
2550 // So just need a single extractelement.
2551 return ShuffleCost + MinMaxCost +
2552 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2553 CostKind, 0, nullptr, nullptr);
2554 }
2555
2556 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2557 Type *ResTy, VectorType *Ty,
2558 FastMathFlags FMF,
2559 TTI::TargetCostKind CostKind) {
2560 // Without any native support, this is equivalent to the cost of
2561 // vecreduce.opcode(ext(Ty A)).
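    // E.g. (roughly) an unsigned add reduction of <8 x i16> into i32 is
    // costed as zext <8 x i16> to <8 x i32> plus an i32 vecreduce.add.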
2562 VectorType *ExtTy = VectorType::get(ElementType: ResTy, Other: Ty);
2563 InstructionCost RedCost =
2564 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2565 InstructionCost ExtCost = thisT()->getCastInstrCost(
2566 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2567 TTI::CastContextHint::None, CostKind);
2568
2569 return RedCost + ExtCost;
2570 }
2571
2572 InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2573 VectorType *Ty,
2574 TTI::TargetCostKind CostKind) {
2575 // Without any native support, this is equivalent to the cost of
2576 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2577 // vecreduce.add(mul(A, B)).
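    // E.g. (roughly) a dot-product style reduction over <8 x i16> operands
    // accumulated into i32 is costed as two extends to <8 x i32>, one
    // <8 x i32> mul, and an i32 add reduction.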
2578 VectorType *ExtTy = VectorType::get(ElementType: ResTy, Other: Ty);
2579 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2580 Instruction::Add, ExtTy, std::nullopt, CostKind);
2581 InstructionCost ExtCost = thisT()->getCastInstrCost(
2582 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2583 TTI::CastContextHint::None, CostKind);
2584
2585 InstructionCost MulCost =
2586 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2587
2588 return RedCost + MulCost + 2 * ExtCost;
2589 }
2590
2591 InstructionCost getVectorSplitCost() { return 1; }
2592
2593 /// @}
2594};
2595
2596/// Concrete BasicTTIImpl that can be used if no further customization
2597/// is needed.
2598class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2599 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2600
2601 friend class BasicTTIImplBase<BasicTTIImpl>;
2602
2603 const TargetSubtargetInfo *ST;
2604 const TargetLoweringBase *TLI;
2605
2606 const TargetSubtargetInfo *getST() const { return ST; }
2607 const TargetLoweringBase *getTLI() const { return TLI; }
2608
2609public:
2610 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2611};
2612
2613} // end namespace llvm
2614
2615#endif // LLVM_CODEGEN_BASICTTIIMPL_H
2616
