1 | //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This file provides a helper that implements much of the TTI interface in |
11 | /// terms of the target-independent code generator and TargetLowering |
12 | /// interfaces. |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #ifndef LLVM_CODEGEN_BASICTTIIMPL_H |
17 | #define LLVM_CODEGEN_BASICTTIIMPL_H |
18 | |
19 | #include "llvm/ADT/APInt.h" |
20 | #include "llvm/ADT/ArrayRef.h" |
21 | #include "llvm/ADT/BitVector.h" |
22 | #include "llvm/ADT/SmallPtrSet.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/Analysis/LoopInfo.h" |
25 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
26 | #include "llvm/Analysis/TargetTransformInfo.h" |
27 | #include "llvm/Analysis/TargetTransformInfoImpl.h" |
28 | #include "llvm/CodeGen/ISDOpcodes.h" |
29 | #include "llvm/CodeGen/TargetLowering.h" |
30 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
31 | #include "llvm/CodeGen/ValueTypes.h" |
32 | #include "llvm/CodeGenTypes/MachineValueType.h" |
33 | #include "llvm/IR/BasicBlock.h" |
34 | #include "llvm/IR/Constant.h" |
35 | #include "llvm/IR/Constants.h" |
36 | #include "llvm/IR/DataLayout.h" |
37 | #include "llvm/IR/DerivedTypes.h" |
38 | #include "llvm/IR/InstrTypes.h" |
39 | #include "llvm/IR/Instruction.h" |
40 | #include "llvm/IR/Instructions.h" |
41 | #include "llvm/IR/Intrinsics.h" |
42 | #include "llvm/IR/Operator.h" |
43 | #include "llvm/IR/Type.h" |
44 | #include "llvm/IR/Value.h" |
45 | #include "llvm/Support/Casting.h" |
46 | #include "llvm/Support/CommandLine.h" |
47 | #include "llvm/Support/ErrorHandling.h" |
48 | #include "llvm/Support/MathExtras.h" |
49 | #include "llvm/Target/TargetMachine.h" |
50 | #include "llvm/Target/TargetOptions.h" |
51 | #include <algorithm> |
52 | #include <cassert> |
53 | #include <cstdint> |
54 | #include <limits> |
55 | #include <optional> |
56 | #include <utility> |
57 | |
58 | namespace llvm { |
59 | |
60 | class Function; |
61 | class GlobalValue; |
62 | class LLVMContext; |
63 | class ScalarEvolution; |
64 | class SCEV; |
65 | class TargetMachine; |
66 | |
67 | extern cl::opt<unsigned> PartialUnrollingThreshold; |
68 | |
69 | /// Base class which can be used to help build a TTI implementation. |
70 | /// |
71 | /// This class provides as much implementation of the TTI interface as is |
72 | /// possible using the target independent parts of the code generator. |
73 | /// |
74 | /// In order to subclass it, your class must implement a getST() method to |
75 | /// return the subtarget, and a getTLI() method to return the target lowering. |
76 | /// We need these methods implemented in the derived class so that this class |
77 | /// doesn't have to duplicate storage for them. |
78 | template <typename T> |
79 | class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { |
80 | private: |
81 | using BaseT = TargetTransformInfoImplCRTPBase<T>; |
82 | using TTI = TargetTransformInfo; |
83 | |
84 | /// Helper function to access this as a T. |
85 | T *thisT() { return static_cast<T *>(this); } |
86 | |
87 | /// Estimate a cost of Broadcast as an extract and sequence of insert |
88 | /// operations. |
89 | InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy, |
90 | TTI::TargetCostKind CostKind) { |
91 | InstructionCost Cost = 0; |
92 | // Broadcast cost is equal to the cost of extracting the zero'th element |
93 | // plus the cost of inserting it into every element of the result vector. |
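// For example (illustrative): broadcasting a <4 x i32> under this model is
// one extractelement plus four insertelements.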
94 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, |
95 | CostKind, 0, nullptr, nullptr); |
96 | |
97 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { |
98 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, |
99 | CostKind, i, nullptr, nullptr); |
100 | } |
101 | return Cost; |
102 | } |
103 | |
104 | /// Estimate a cost of shuffle as a sequence of extract and insert |
105 | /// operations. |
106 | InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy, |
107 | TTI::TargetCostKind CostKind) { |
108 | InstructionCost Cost = 0; |
// Shuffle cost is equal to the cost of extracting each element from its
// argument plus the cost of inserting it into the result vector.

// e.g. a <4 x float> shuffle with a mask of <0,5,2,7> needs to extract from
// index 0 of the first vector, index 1 of the second vector, index 2 of the
// first vector and finally index 3 of the second vector, and insert them at
// indices <0,1,2,3> of the result vector.
116 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { |
117 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, |
118 | CostKind, i, nullptr, nullptr); |
119 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, |
120 | CostKind, i, nullptr, nullptr); |
121 | } |
122 | return Cost; |
123 | } |
124 | |
125 | /// Estimate a cost of subvector extraction as a sequence of extract and |
126 | /// insert operations. |
InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
TTI::TargetCostKind CostKind,
int Index,
FixedVectorType *SubVTy) {
assert(VTy && SubVTy &&
"Can only extract subvectors from vectors");
int NumSubElts = SubVTy->getNumElements();
assert((!isa<FixedVectorType>(VTy) ||
(Index + NumSubElts) <=
(int)cast<FixedVectorType>(VTy)->getNumElements()) &&
"SK_ExtractSubvector index out of range");
138 | |
139 | InstructionCost Cost = 0; |
// Subvector extraction cost is equal to the cost of extracting each element
// from the source vector type plus the cost of inserting them into the
// result subvector type.
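// For example (illustrative): extracting a <2 x i32> subvector at Index 2
// from an <8 x i32> source costs two extracts (indices 2 and 3) plus two
// inserts (indices 0 and 1 of the result).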
143 | for (int i = 0; i != NumSubElts; ++i) { |
144 | Cost += |
145 | thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, |
146 | CostKind, i + Index, nullptr, nullptr); |
147 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, |
148 | CostKind, i, nullptr, nullptr); |
149 | } |
150 | return Cost; |
151 | } |
152 | |
153 | /// Estimate a cost of subvector insertion as a sequence of extract and |
154 | /// insert operations. |
155 | InstructionCost getInsertSubvectorOverhead(VectorType *VTy, |
156 | TTI::TargetCostKind CostKind, |
157 | int Index, |
158 | FixedVectorType *SubVTy) { |
assert(VTy && SubVTy &&
"Can only insert subvectors into vectors");
int NumSubElts = SubVTy->getNumElements();
assert((!isa<FixedVectorType>(VTy) ||
(Index + NumSubElts) <=
(int)cast<FixedVectorType>(VTy)->getNumElements()) &&
"SK_InsertSubvector index out of range");
166 | |
167 | InstructionCost Cost = 0; |
// Subvector insertion cost is equal to the cost of extracting each element
// from the subvector type plus the cost of inserting them into the result
// vector type.
171 | for (int i = 0; i != NumSubElts; ++i) { |
172 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, |
173 | CostKind, i, nullptr, nullptr); |
174 | Cost += |
175 | thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind, |
176 | i + Index, nullptr, nullptr); |
177 | } |
178 | return Cost; |
179 | } |
180 | |
181 | /// Local query method delegates up to T which *must* implement this! |
182 | const TargetSubtargetInfo *getST() const { |
183 | return static_cast<const T *>(this)->getST(); |
184 | } |
185 | |
186 | /// Local query method delegates up to T which *must* implement this! |
187 | const TargetLoweringBase *getTLI() const { |
188 | return static_cast<const T *>(this)->getTLI(); |
189 | } |
190 | |
191 | static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { |
192 | switch (M) { |
193 | case TTI::MIM_Unindexed: |
194 | return ISD::UNINDEXED; |
195 | case TTI::MIM_PreInc: |
196 | return ISD::PRE_INC; |
197 | case TTI::MIM_PreDec: |
198 | return ISD::PRE_DEC; |
199 | case TTI::MIM_PostInc: |
200 | return ISD::POST_INC; |
201 | case TTI::MIM_PostDec: |
202 | return ISD::POST_DEC; |
203 | } |
204 | llvm_unreachable("Unexpected MemIndexedMode" ); |
205 | } |
206 | |
207 | InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, |
208 | Align Alignment, |
209 | bool VariableMask, |
210 | bool IsGatherScatter, |
211 | TTI::TargetCostKind CostKind) { |
212 | // We cannot scalarize scalable vectors, so return Invalid. |
if (isa<ScalableVectorType>(DataTy))
return InstructionCost::getInvalid();

auto *VT = cast<FixedVectorType>(DataTy);
217 | // Assume the target does not have support for gather/scatter operations |
218 | // and provide a rough estimate. |
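// For example (rough illustration): a variable-mask gather of <4 x i32> is
// modeled below as 4 address extracts + 4 scalar loads, a 4-element repack
// of the result, and 4 condition extracts each with a branch and a PHI.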
219 | // |
220 | // First, compute the cost of the individual memory operations. |
InstructionCost AddrExtractCost =
IsGatherScatter
? getVectorInstrCost(Instruction::ExtractElement,
FixedVectorType::get(
PointerType::get(VT->getElementType(), 0),
VT->getNumElements()),
CostKind, -1, nullptr, nullptr)
: 0;
InstructionCost LoadCost =
VT->getNumElements() *
(AddrExtractCost +
getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
233 | |
234 | // Next, compute the cost of packing the result in a vector. |
235 | InstructionCost PackingCost = |
236 | getScalarizationOverhead(VT, Opcode != Instruction::Store, |
237 | Opcode == Instruction::Store, CostKind); |
238 | |
239 | InstructionCost ConditionalCost = 0; |
240 | if (VariableMask) { |
// Compute the cost of conditionally executing the memory operations with
// variable masks. This includes extracting the individual conditions, the
// branches, and the PHIs needed to combine the results.
// NOTE: Estimating the cost of conditionally executing the memory
// operations accurately is quite difficult and the current solution
// provides a very rough estimate only.
ConditionalCost =
VT->getNumElements() *
(getVectorInstrCost(
Instruction::ExtractElement,
FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
VT->getNumElements()),
CostKind, -1, nullptr, nullptr) +
getCFInstrCost(Instruction::Br, CostKind) +
getCFInstrCost(Instruction::PHI, CostKind));
256 | } |
257 | |
258 | return LoadCost + PackingCost + ConditionalCost; |
259 | } |
260 | |
261 | protected: |
262 | explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) |
263 | : BaseT(DL) {} |
264 | virtual ~BasicTTIImplBase() = default; |
265 | |
266 | using TargetTransformInfoImplBase::DL; |
267 | |
268 | public: |
269 | /// \name Scalar TTI Implementations |
270 | /// @{ |
271 | bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, |
272 | unsigned AddressSpace, Align Alignment, |
273 | unsigned *Fast) const { |
274 | EVT E = EVT::getIntegerVT(Context, BitWidth); |
275 | return getTLI()->allowsMisalignedMemoryAccesses( |
276 | E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); |
277 | } |
278 | |
279 | bool hasBranchDivergence(const Function *F = nullptr) { return false; } |
280 | |
281 | bool isSourceOfDivergence(const Value *V) { return false; } |
282 | |
283 | bool isAlwaysUniform(const Value *V) { return false; } |
284 | |
285 | bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { |
286 | return false; |
287 | } |
288 | |
289 | bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const { |
290 | return true; |
291 | } |
292 | |
293 | unsigned getFlatAddressSpace() { |
294 | // Return an invalid address space. |
295 | return -1; |
296 | } |
297 | |
298 | bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
299 | Intrinsic::ID IID) const { |
300 | return false; |
301 | } |
302 | |
303 | bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { |
304 | return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); |
305 | } |
306 | |
307 | unsigned getAssumedAddrSpace(const Value *V) const { |
308 | return getTLI()->getTargetMachine().getAssumedAddrSpace(V); |
309 | } |
310 | |
311 | bool isSingleThreaded() const { |
312 | return getTLI()->getTargetMachine().Options.ThreadModel == |
313 | ThreadModel::Single; |
314 | } |
315 | |
316 | std::pair<const Value *, unsigned> |
317 | getPredicatedAddrSpace(const Value *V) const { |
318 | return getTLI()->getTargetMachine().getPredicatedAddrSpace(V); |
319 | } |
320 | |
321 | Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, |
322 | Value *NewV) const { |
323 | return nullptr; |
324 | } |
325 | |
326 | bool isLegalAddImmediate(int64_t imm) { |
327 | return getTLI()->isLegalAddImmediate(imm); |
328 | } |
329 | |
330 | bool isLegalICmpImmediate(int64_t imm) { |
331 | return getTLI()->isLegalICmpImmediate(imm); |
332 | } |
333 | |
334 | bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, |
335 | bool HasBaseReg, int64_t Scale, |
336 | unsigned AddrSpace, Instruction *I = nullptr) { |
337 | TargetLoweringBase::AddrMode AM; |
338 | AM.BaseGV = BaseGV; |
339 | AM.BaseOffs = BaseOffset; |
340 | AM.HasBaseReg = HasBaseReg; |
341 | AM.Scale = Scale; |
342 | return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); |
343 | } |
344 | |
345 | int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) { |
346 | return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset); |
347 | } |
348 | |
349 | unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, |
350 | Type *ScalarValTy) const { |
351 | auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) { |
auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
EVT VT = getTLI()->getValueType(DL, SrcTy);
if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
getTLI()->isOperationCustom(ISD::STORE, VT))
return true;

EVT ValVT =
getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
EVT LegalizedVT =
getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
363 | }; |
364 | while (VF > 2 && IsSupportedByTarget(VF)) |
365 | VF /= 2; |
366 | return VF; |
367 | } |
368 | |
369 | bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, |
370 | const DataLayout &DL) const { |
371 | EVT VT = getTLI()->getValueType(DL, Ty); |
372 | return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); |
373 | } |
374 | |
375 | bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, |
376 | const DataLayout &DL) const { |
377 | EVT VT = getTLI()->getValueType(DL, Ty); |
378 | return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); |
379 | } |
380 | |
381 | bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { |
382 | return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); |
383 | } |
384 | |
385 | bool isNumRegsMajorCostOfLSR() { |
386 | return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); |
387 | } |
388 | |
389 | bool shouldFoldTerminatingConditionAfterLSR() const { |
390 | return TargetTransformInfoImplBase:: |
391 | shouldFoldTerminatingConditionAfterLSR(); |
392 | } |
393 | |
394 | bool isProfitableLSRChainElement(Instruction *I) { |
395 | return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); |
396 | } |
397 | |
398 | InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
399 | int64_t BaseOffset, bool HasBaseReg, |
400 | int64_t Scale, unsigned AddrSpace) { |
401 | TargetLoweringBase::AddrMode AM; |
402 | AM.BaseGV = BaseGV; |
403 | AM.BaseOffs = BaseOffset; |
404 | AM.HasBaseReg = HasBaseReg; |
405 | AM.Scale = Scale; |
406 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) |
407 | return 0; |
408 | return -1; |
409 | } |
410 | |
411 | bool isTruncateFree(Type *Ty1, Type *Ty2) { |
412 | return getTLI()->isTruncateFree(Ty1, Ty2); |
413 | } |
414 | |
415 | bool isProfitableToHoist(Instruction *I) { |
416 | return getTLI()->isProfitableToHoist(I); |
417 | } |
418 | |
419 | bool useAA() const { return getST()->useAA(); } |
420 | |
421 | bool isTypeLegal(Type *Ty) { |
422 | EVT VT = getTLI()->getValueType(DL, Ty); |
423 | return getTLI()->isTypeLegal(VT); |
424 | } |
425 | |
426 | unsigned getRegUsageForType(Type *Ty) { |
427 | EVT ETy = getTLI()->getValueType(DL, Ty); |
428 | return getTLI()->getNumRegisters(Ty->getContext(), ETy); |
429 | } |
430 | |
431 | InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, |
432 | ArrayRef<const Value *> Operands, Type *AccessType, |
433 | TTI::TargetCostKind CostKind) { |
434 | return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind); |
435 | } |
436 | |
437 | unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, |
438 | unsigned &JumpTableSize, |
439 | ProfileSummaryInfo *PSI, |
440 | BlockFrequencyInfo *BFI) { |
/// Try to find the estimated number of clusters. Note that the number of
/// clusters identified in this function could be different from the actual
/// numbers found in lowering. This function ignores switches that are
/// lowered with a mix of jump table / bit test / BTree. It was initially
/// intended for estimating the cost of a switch in the inline cost
/// heuristic, but it is a generic cost model that can be used in other
/// places too (e.g., in loop unrolling).
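/// For example (illustrative): a switch over cases {0, 2, 4, 6} that is
/// dense enough for a jump table would be reported as a single cluster with
/// JumpTableSize == 7 (the case range); otherwise the raw case count N is
/// returned.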
448 | unsigned N = SI.getNumCases(); |
449 | const TargetLoweringBase *TLI = getTLI(); |
450 | const DataLayout &DL = this->getDataLayout(); |
451 | |
452 | JumpTableSize = 0; |
bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

// Early exit if both a jump table and bit test are not allowed.
if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
457 | return N; |
458 | |
459 | APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); |
460 | APInt MinCaseVal = MaxCaseVal; |
461 | for (auto CI : SI.cases()) { |
462 | const APInt &CaseVal = CI.getCaseValue()->getValue(); |
if (CaseVal.sgt(MaxCaseVal))
MaxCaseVal = CaseVal;
if (CaseVal.slt(MinCaseVal))
MinCaseVal = CaseVal;
}

// Check if suitable for a bit test
if (N <= DL.getIndexSizeInBits(0u)) {
SmallPtrSet<const BasicBlock *, 4> Dests;
for (auto I : SI.cases())
Dests.insert(I.getCaseSuccessor());

if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
DL))
477 | return 1; |
478 | } |
479 | |
480 | // Check if suitable for a jump table. |
481 | if (IsJTAllowed) { |
482 | if (N < 2 || N < TLI->getMinimumJumpTableEntries()) |
483 | return N; |
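// For example (illustrative): cases {10, 12, 20} give Range == 11.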
uint64_t Range =
(MaxCaseVal - MinCaseVal)
.getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
// Check whether a range of clusters is dense enough for a jump table
if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
489 | JumpTableSize = Range; |
490 | return 1; |
491 | } |
492 | } |
493 | return N; |
494 | } |
495 | |
496 | bool shouldBuildLookupTables() { |
497 | const TargetLoweringBase *TLI = getTLI(); |
return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
500 | } |
501 | |
502 | bool shouldBuildRelLookupTables() const { |
503 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
504 | // If non-PIC mode, do not generate a relative lookup table. |
505 | if (!TM.isPositionIndependent()) |
506 | return false; |
507 | |
/// Relative lookup table entries consist of 32-bit offsets.
/// Do not generate relative lookup tables for medium and large code models
/// in 64-bit architectures where 32-bit offsets might not be enough.
511 | if (TM.getCodeModel() == CodeModel::Medium || |
512 | TM.getCodeModel() == CodeModel::Large) |
513 | return false; |
514 | |
515 | Triple TargetTriple = TM.getTargetTriple(); |
516 | if (!TargetTriple.isArch64Bit()) |
517 | return false; |
518 | |
519 | // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it |
520 | // there. |
521 | if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin()) |
522 | return false; |
523 | |
524 | return true; |
525 | } |
526 | |
527 | bool haveFastSqrt(Type *Ty) { |
528 | const TargetLoweringBase *TLI = getTLI(); |
529 | EVT VT = TLI->getValueType(DL, Ty); |
530 | return TLI->isTypeLegal(VT) && |
TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
532 | } |
533 | |
534 | bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { |
535 | return true; |
536 | } |
537 | |
538 | InstructionCost getFPOpCost(Type *Ty) { |
539 | // Check whether FADD is available, as a proxy for floating-point in |
540 | // general. |
541 | const TargetLoweringBase *TLI = getTLI(); |
542 | EVT VT = TLI->getValueType(DL, Ty); |
if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
544 | return TargetTransformInfo::TCC_Basic; |
545 | return TargetTransformInfo::TCC_Expensive; |
546 | } |
547 | |
548 | bool preferToKeepConstantsAttached(const Instruction &Inst, |
549 | const Function &Fn) const { |
550 | switch (Inst.getOpcode()) { |
551 | default: |
552 | break; |
553 | case Instruction::SDiv: |
554 | case Instruction::SRem: |
555 | case Instruction::UDiv: |
556 | case Instruction::URem: { |
if (!isa<ConstantInt>(Inst.getOperand(1)))
558 | return false; |
559 | EVT VT = getTLI()->getValueType(DL, Inst.getType()); |
560 | return !getTLI()->isIntDivCheap(VT, Fn.getAttributes()); |
561 | } |
562 | }; |
563 | |
564 | return false; |
565 | } |
566 | |
567 | unsigned getInliningThresholdMultiplier() const { return 1; } |
568 | unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } |
569 | unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const { |
570 | return 0; |
571 | } |
572 | |
573 | int getInlinerVectorBonusPercent() const { return 150; } |
574 | |
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) {
578 | // This unrolling functionality is target independent, but to provide some |
579 | // motivation for its intended use, for x86: |
580 | |
581 | // According to the Intel 64 and IA-32 Architectures Optimization Reference |
582 | // Manual, Intel Core models and later have a loop stream detector (and |
583 | // associated uop queue) that can benefit from partial unrolling. |
584 | // The relevant requirements are: |
585 | // - The loop must have no more than 4 (8 for Nehalem and later) branches |
586 | // taken, and none of them may be calls. |
587 | // - The loop can have no more than 18 (28 for Nehalem and later) uops. |
588 | |
589 | // According to the Software Optimization Guide for AMD Family 15h |
590 | // Processors, models 30h-4fh (Steamroller and later) have a loop predictor |
591 | // and loop buffer which can benefit from partial unrolling. |
592 | // The relevant requirements are: |
593 | // - The loop must have fewer than 16 branches |
594 | // - The loop must have less than 40 uops in all executed loop branches |
595 | |
596 | // The number of taken branches in a loop is hard to estimate here, and |
597 | // benchmarking has revealed that it is better not to be conservative when |
598 | // estimating the branch count. As a result, we'll ignore the branch limits |
599 | // until someone finds a case where it matters in practice. |
600 | |
601 | unsigned MaxOps; |
602 | const TargetSubtargetInfo *ST = getST(); |
603 | if (PartialUnrollingThreshold.getNumOccurrences() > 0) |
604 | MaxOps = PartialUnrollingThreshold; |
605 | else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) |
606 | MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; |
607 | else |
608 | return; |
609 | |
610 | // Scan the loop: don't unroll loops with calls. |
611 | for (BasicBlock *BB : L->blocks()) { |
612 | for (Instruction &I : *BB) { |
if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
if (!thisT()->isLoweredToCall(F))
continue;
}

if (ORE) {
ORE->emit([&]() {
return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
L->getHeader())
<< "advising against unrolling the loop because it "
"contains a "
<< ore::NV("Call", &I);
626 | }); |
627 | } |
628 | return; |
629 | } |
630 | } |
631 | } |
632 | |
633 | // Enable runtime and partial unrolling up to the specified size. |
634 | // Enable using trip count upper bound to unroll loops. |
635 | UP.Partial = UP.Runtime = UP.UpperBound = true; |
636 | UP.PartialThreshold = MaxOps; |
637 | |
638 | // Avoid unrolling when optimizing for size. |
639 | UP.OptSizeThreshold = 0; |
640 | UP.PartialOptSizeThreshold = 0; |
641 | |
642 | // Set number of instructions optimized when "back edge" |
643 | // becomes "fall through" to default value of 2. |
644 | UP.BEInsns = 2; |
645 | } |
646 | |
647 | void getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
648 | TTI::PeelingPreferences &PP) { |
649 | PP.PeelCount = 0; |
650 | PP.AllowPeeling = true; |
651 | PP.AllowLoopNestsPeeling = false; |
652 | PP.PeelProfiledIterations = true; |
653 | } |
654 | |
655 | bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, |
656 | AssumptionCache &AC, |
657 | TargetLibraryInfo *LibInfo, |
658 | HardwareLoopInfo &HWLoopInfo) { |
659 | return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); |
660 | } |
661 | |
662 | bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) { |
663 | return BaseT::preferPredicateOverEpilogue(TFI); |
664 | } |
665 | |
666 | TailFoldingStyle |
667 | getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) { |
668 | return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow); |
669 | } |
670 | |
671 | std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, |
672 | IntrinsicInst &II) { |
673 | return BaseT::instCombineIntrinsic(IC, II); |
674 | } |
675 | |
676 | std::optional<Value *> |
677 | simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, |
678 | APInt DemandedMask, KnownBits &Known, |
679 | bool &KnownBitsComputed) { |
680 | return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, |
681 | KnownBitsComputed); |
682 | } |
683 | |
684 | std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( |
685 | InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, |
686 | APInt &UndefElts2, APInt &UndefElts3, |
687 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
688 | SimplifyAndSetOp) { |
689 | return BaseT::simplifyDemandedVectorEltsIntrinsic( |
690 | IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, |
691 | SimplifyAndSetOp); |
692 | } |
693 | |
694 | virtual std::optional<unsigned> |
695 | getCacheSize(TargetTransformInfo::CacheLevel Level) const { |
696 | return std::optional<unsigned>( |
697 | getST()->getCacheSize(static_cast<unsigned>(Level))); |
698 | } |
699 | |
700 | virtual std::optional<unsigned> |
701 | getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { |
702 | std::optional<unsigned> TargetResult = |
703 | getST()->getCacheAssociativity(static_cast<unsigned>(Level)); |
704 | |
705 | if (TargetResult) |
706 | return TargetResult; |
707 | |
708 | return BaseT::getCacheAssociativity(Level); |
709 | } |
710 | |
711 | virtual unsigned getCacheLineSize() const { |
712 | return getST()->getCacheLineSize(); |
713 | } |
714 | |
715 | virtual unsigned getPrefetchDistance() const { |
716 | return getST()->getPrefetchDistance(); |
717 | } |
718 | |
719 | virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, |
720 | unsigned NumStridedMemAccesses, |
721 | unsigned NumPrefetches, |
722 | bool HasCall) const { |
723 | return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, |
724 | NumPrefetches, HasCall); |
725 | } |
726 | |
727 | virtual unsigned getMaxPrefetchIterationsAhead() const { |
728 | return getST()->getMaxPrefetchIterationsAhead(); |
729 | } |
730 | |
731 | virtual bool enableWritePrefetching() const { |
732 | return getST()->enableWritePrefetching(); |
733 | } |
734 | |
735 | virtual bool shouldPrefetchAddressSpace(unsigned AS) const { |
736 | return getST()->shouldPrefetchAddressSpace(AS); |
737 | } |
738 | |
739 | /// @} |
740 | |
741 | /// \name Vector TTI Implementations |
742 | /// @{ |
743 | |
744 | TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
return TypeSize::getFixed(32);
746 | } |
747 | |
748 | std::optional<unsigned> getMaxVScale() const { return std::nullopt; } |
749 | std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; } |
750 | bool isVScaleKnownToBeAPowerOfTwo() const { return false; } |
751 | |
752 | /// Estimate the overhead of scalarizing an instruction. Insert and Extract |
753 | /// are set if the demanded result elements need to be inserted and/or |
754 | /// extracted from vectors. |
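/// For example (illustrative): for a <4 x i32> vector with Insert == true,
/// Extract == false, and DemandedElts == 0b0101, the estimate is the cost
/// of two insertelement operations (elements 0 and 2).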
InstructionCost getScalarizationOverhead(VectorType *InTy,
const APInt &DemandedElts,
bool Insert, bool Extract,
TTI::TargetCostKind CostKind) {
/// FIXME: a bitfield is not a reasonable abstraction for talking about
/// which elements are needed from a scalable vector
if (isa<ScalableVectorType>(InTy))
return InstructionCost::getInvalid();
auto *Ty = cast<FixedVectorType>(InTy);

assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
"Vector size mismatch");
767 | |
768 | InstructionCost Cost = 0; |
769 | |
770 | for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { |
771 | if (!DemandedElts[i]) |
772 | continue; |
773 | if (Insert) |
774 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, |
775 | CostKind, i, nullptr, nullptr); |
776 | if (Extract) |
777 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, |
778 | CostKind, i, nullptr, nullptr); |
779 | } |
780 | |
781 | return Cost; |
782 | } |
783 | |
784 | /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. |
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
bool Extract,
TTI::TargetCostKind CostKind) {
if (isa<ScalableVectorType>(InTy))
return InstructionCost::getInvalid();
auto *Ty = cast<FixedVectorType>(InTy);

APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
793 | return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract, |
794 | CostKind); |
795 | } |
796 | |
/// Estimate the overhead of scalarizing an instruction's unique
/// non-constant operands. The (potentially vector) types to use for each
/// argument are passed in via Tys.
800 | InstructionCost |
801 | getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, |
802 | ArrayRef<Type *> Tys, |
803 | TTI::TargetCostKind CostKind) { |
assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
805 | |
806 | InstructionCost Cost = 0; |
807 | SmallPtrSet<const Value*, 4> UniqueOperands; |
808 | for (int I = 0, E = Args.size(); I != E; I++) { |
809 | // Disregard things like metadata arguments. |
810 | const Value *A = Args[I]; |
811 | Type *Ty = Tys[I]; |
812 | if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && |
813 | !Ty->isPtrOrPtrVectorTy()) |
814 | continue; |
815 | |
if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
if (auto *VecTy = dyn_cast<VectorType>(Ty))
818 | Cost += getScalarizationOverhead(VecTy, /*Insert*/ false, |
819 | /*Extract*/ true, CostKind); |
820 | } |
821 | } |
822 | |
823 | return Cost; |
824 | } |
825 | |
826 | /// Estimate the overhead of scalarizing the inputs and outputs of an |
827 | /// instruction, with return type RetTy and arguments Args of type Tys. If |
828 | /// Args are unknown (empty), then the cost associated with one argument is |
829 | /// added as a heuristic. |
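/// For example (illustrative): for a <4 x float> return type with no known
/// arguments, the estimate is four inserts for the result plus four
/// extracts for the single assumed vector argument.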
830 | InstructionCost getScalarizationOverhead(VectorType *RetTy, |
831 | ArrayRef<const Value *> Args, |
832 | ArrayRef<Type *> Tys, |
833 | TTI::TargetCostKind CostKind) { |
834 | InstructionCost Cost = getScalarizationOverhead( |
835 | RetTy, /*Insert*/ true, /*Extract*/ false, CostKind); |
836 | if (!Args.empty()) |
837 | Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind); |
838 | else |
839 | // When no information on arguments is provided, we add the cost |
840 | // associated with one argument as a heuristic. |
841 | Cost += getScalarizationOverhead(RetTy, /*Insert*/ false, |
842 | /*Extract*/ true, CostKind); |
843 | |
844 | return Cost; |
845 | } |
846 | |
847 | /// Estimate the cost of type-legalization and the legalized type. |
848 | std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const { |
849 | LLVMContext &C = Ty->getContext(); |
850 | EVT MTy = getTLI()->getValueType(DL, Ty); |
851 | |
852 | InstructionCost Cost = 1; |
853 | // We keep legalizing the type until we find a legal kind. We assume that |
854 | // the only operation that costs anything is the split. After splitting |
855 | // we need to handle two types. |
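// For example (illustrative): on a target whose widest legal vector is
// v4i32, legalizing v16i32 splits twice, so this returns {4, v4i32} (the
// cost doubles on each split: 1 -> 2 -> 4).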
856 | while (true) { |
857 | TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy); |
858 | |
859 | if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) { |
860 | // Ensure we return a sensible simple VT here, since many callers of |
861 | // this function require it. |
862 | MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64; |
return std::make_pair(InstructionCost::getInvalid(), VT);
}

if (LK.first == TargetLoweringBase::TypeLegal)
return std::make_pair(Cost, MTy.getSimpleVT());

if (LK.first == TargetLoweringBase::TypeSplitVector ||
LK.first == TargetLoweringBase::TypeExpandInteger)
Cost *= 2;

// Do not loop with f128 type.
if (MTy == LK.second)
return std::make_pair(Cost, MTy.getSimpleVT());
876 | |
877 | // Keep legalizing the type. |
878 | MTy = LK.second; |
879 | } |
880 | } |
881 | |
882 | unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; } |
883 | |
884 | InstructionCost getArithmeticInstrCost( |
885 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
888 | ArrayRef<const Value *> Args = ArrayRef<const Value *>(), |
889 | const Instruction *CxtI = nullptr) { |
890 | // Check if any of the operands are vector operands. |
891 | const TargetLoweringBase *TLI = getTLI(); |
892 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
893 | assert(ISD && "Invalid opcode" ); |
894 | |
895 | // TODO: Handle more cost kinds. |
896 | if (CostKind != TTI::TCK_RecipThroughput) |
897 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, |
898 | Opd1Info, Opd2Info, |
899 | Args, CxtI); |
900 | |
901 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
902 | |
903 | bool IsFloat = Ty->isFPOrFPVectorTy(); |
904 | // Assume that floating point arithmetic operations cost twice as much as |
905 | // integer operations. |
906 | InstructionCost OpCost = (IsFloat ? 2 : 1); |
907 | |
if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
909 | // The operation is legal. Assume it costs 1. |
910 | // TODO: Once we have extract/insert subvector cost we need to use them. |
911 | return LT.first * OpCost; |
912 | } |
913 | |
if (!TLI->isOperationExpand(ISD, LT.second)) {
915 | // If the operation is custom lowered, then assume that the code is twice |
916 | // as expensive. |
917 | return LT.first * 2 * OpCost; |
918 | } |
919 | |
920 | // An 'Expand' of URem and SRem is special because it may default |
921 | // to expanding the operation into a sequence of sub-operations |
922 | // i.e. X % Y -> X-(X/Y)*Y. |
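// For example (illustrative): if only udiv is legal, 'urem x, y' is costed
// as udiv + mul + sub.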
923 | if (ISD == ISD::UREM || ISD == ISD::SREM) { |
924 | bool IsSigned = ISD == ISD::SREM; |
if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
LT.second) ||
TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
LT.second)) {
929 | unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv; |
930 | InstructionCost DivCost = thisT()->getArithmeticInstrCost( |
931 | DivOpc, Ty, CostKind, Opd1Info, Opd2Info); |
932 | InstructionCost MulCost = |
933 | thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); |
934 | InstructionCost SubCost = |
935 | thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); |
936 | return DivCost + MulCost + SubCost; |
937 | } |
938 | } |
939 | |
940 | // We cannot scalarize scalable vectors, so return Invalid. |
if (isa<ScalableVectorType>(Ty))
942 | return InstructionCost::getInvalid(); |
943 | |
944 | // Else, assume that we need to scalarize this op. |
945 | // TODO: If one of the types get legalized by splitting, handle this |
946 | // similarly to what getCastInstrCost() does. |
if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
948 | InstructionCost Cost = thisT()->getArithmeticInstrCost( |
949 | Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, |
950 | Args, CxtI); |
951 | // Return the cost of multiple scalar invocation plus the cost of |
952 | // inserting and extracting the values. |
953 | SmallVector<Type *> Tys(Args.size(), Ty); |
954 | return getScalarizationOverhead(VTy, Args, Tys, CostKind) + |
955 | VTy->getNumElements() * Cost; |
956 | } |
957 | |
958 | // We don't know anything about this scalar instruction. |
959 | return OpCost; |
960 | } |
961 | |
962 | TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, |
963 | ArrayRef<int> Mask, |
964 | VectorType *Ty, int &Index, |
965 | VectorType *&SubTy) const { |
966 | if (Mask.empty()) |
967 | return Kind; |
968 | int NumSrcElts = Ty->getElementCount().getKnownMinValue(); |
969 | switch (Kind) { |
970 | case TTI::SK_PermuteSingleSrc: |
971 | if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts)) |
972 | return TTI::SK_Reverse; |
973 | if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts)) |
974 | return TTI::SK_Broadcast; |
975 | if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) && |
976 | (Index + Mask.size()) <= (size_t)NumSrcElts) { |
SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
978 | return TTI::SK_ExtractSubvector; |
979 | } |
980 | break; |
981 | case TTI::SK_PermuteTwoSrc: { |
982 | int NumSubElts; |
983 | if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( |
984 | Mask, NumSrcElts, NumSubElts, Index)) { |
985 | if (Index + NumSubElts > NumSrcElts) |
986 | return Kind; |
SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
988 | return TTI::SK_InsertSubvector; |
989 | } |
990 | if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts)) |
991 | return TTI::SK_Select; |
992 | if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts)) |
993 | return TTI::SK_Transpose; |
994 | if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index)) |
995 | return TTI::SK_Splice; |
996 | break; |
997 | } |
998 | case TTI::SK_Select: |
999 | case TTI::SK_Reverse: |
1000 | case TTI::SK_Broadcast: |
1001 | case TTI::SK_Transpose: |
1002 | case TTI::SK_InsertSubvector: |
1003 | case TTI::SK_ExtractSubvector: |
1004 | case TTI::SK_Splice: |
1005 | break; |
1006 | } |
1007 | return Kind; |
1008 | } |
1009 | |
1010 | InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, |
1011 | ArrayRef<int> Mask, |
1012 | TTI::TargetCostKind CostKind, int Index, |
1013 | VectorType *SubTp, |
1014 | ArrayRef<const Value *> Args = std::nullopt) { |
switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
case TTI::SK_Broadcast:
if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
return getBroadcastShuffleOverhead(FVT, CostKind);
return InstructionCost::getInvalid();
case TTI::SK_Select:
case TTI::SK_Splice:
case TTI::SK_Reverse:
case TTI::SK_Transpose:
case TTI::SK_PermuteSingleSrc:
case TTI::SK_PermuteTwoSrc:
if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
return getPermuteShuffleOverhead(FVT, CostKind);
return InstructionCost::getInvalid();
case TTI::SK_ExtractSubvector:
return getExtractSubvectorOverhead(Tp, CostKind, Index,
cast<FixedVectorType>(SubTp));
case TTI::SK_InsertSubvector:
return getInsertSubvectorOverhead(Tp, CostKind, Index,
cast<FixedVectorType>(SubTp));
}
llvm_unreachable("Unknown TTI::ShuffleKind");
1037 | } |
1038 | |
1039 | InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, |
1040 | TTI::CastContextHint CCH, |
1041 | TTI::TargetCostKind CostKind, |
1042 | const Instruction *I = nullptr) { |
1043 | if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) |
1044 | return 0; |
1045 | |
1046 | const TargetLoweringBase *TLI = getTLI(); |
1047 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1048 | assert(ISD && "Invalid opcode" ); |
1049 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src); |
1050 | std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Ty: Dst); |
1051 | |
1052 | TypeSize SrcSize = SrcLT.second.getSizeInBits(); |
1053 | TypeSize DstSize = DstLT.second.getSizeInBits(); |
1054 | bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); |
1055 | bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); |
1056 | |
1057 | switch (Opcode) { |
1058 | default: |
1059 | break; |
1060 | case Instruction::Trunc: |
1061 | // Check for NOOP conversions. |
if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1063 | return 0; |
1064 | [[fallthrough]]; |
1065 | case Instruction::BitCast: |
1066 | // Bitcast between types that are legalized to the same type are free and |
1067 | // assume int to/from ptr of the same size is also free. |
1068 | if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && |
1069 | SrcSize == DstSize) |
1070 | return 0; |
1071 | break; |
1072 | case Instruction::FPExt: |
1073 | if (I && getTLI()->isExtFree(I)) |
1074 | return 0; |
1075 | break; |
1076 | case Instruction::ZExt: |
if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1078 | return 0; |
1079 | [[fallthrough]]; |
1080 | case Instruction::SExt: |
1081 | if (I && getTLI()->isExtFree(I)) |
1082 | return 0; |
1083 | |
1084 | // If this is a zext/sext of a load, return 0 if the corresponding |
1085 | // extending load exists on target and the result type is legal. |
1086 | if (CCH == TTI::CastContextHint::Normal) { |
EVT ExtVT = EVT::getEVT(Dst);
EVT LoadVT = EVT::getEVT(Src);
unsigned LType =
((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
if (DstLT.first == SrcLT.first &&
TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1093 | return 0; |
1094 | } |
1095 | break; |
1096 | case Instruction::AddrSpaceCast: |
if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
Dst->getPointerAddressSpace()))
1099 | return 0; |
1100 | break; |
1101 | } |
1102 | |
auto *SrcVTy = dyn_cast<VectorType>(Src);
auto *DstVTy = dyn_cast<VectorType>(Dst);

// If the cast is marked as legal (or promote) then assume low cost.
if (SrcLT.first == DstLT.first &&
TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1109 | return SrcLT.first; |
1110 | |
1111 | // Handle scalar conversions. |
1112 | if (!SrcVTy && !DstVTy) { |
1113 | // Just check the op cost. If the operation is legal then assume it costs |
1114 | // 1. |
if (!TLI->isOperationExpand(ISD, DstLT.second))
return 1;

// Assume that illegal scalar instructions are expensive.
return 4;
1120 | } |
1121 | |
1122 | // Check vector-to-vector casts. |
1123 | if (DstVTy && SrcVTy) { |
1124 | // If the cast is between same-sized registers, then the check is simple. |
1125 | if (SrcLT.first == DstLT.first && SrcSize == DstSize) { |
1126 | |
1127 | // Assume that Zext is done using AND. |
1128 | if (Opcode == Instruction::ZExt) |
1129 | return SrcLT.first; |
1130 | |
1131 | // Assume that sext is done using SHL and SRA. |
1132 | if (Opcode == Instruction::SExt) |
1133 | return SrcLT.first * 2; |
1134 | |
// Just check the op cost. If the operation is legal then assume it costs 1
// and multiply by the type-legalization overhead.
if (!TLI->isOperationExpand(ISD, DstLT.second))
return SrcLT.first * 1;
1140 | } |
1141 | |
1142 | // If we are legalizing by splitting, query the concrete TTI for the cost |
1143 | // of casting the original vector twice. We also need to factor in the |
1144 | // cost of the split itself. Count that as 1, to be consistent with |
1145 | // getTypeLegalizationCost(). |
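// For example (illustrative): an fpext from <8 x float> to <8 x double>
// where both types are split in half is costed as two <4 x float> ->
// <4 x double> casts, with no extra split cost.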
bool SplitSrc =
TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
TargetLowering::TypeSplitVector;
bool SplitDst =
TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
TargetLowering::TypeSplitVector;
if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
DstVTy->getElementCount().isVector()) {
Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1156 | T *TTI = static_cast<T *>(this); |
1157 | // If both types need to be split then the split is free. |
1158 | InstructionCost SplitCost = |
1159 | (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; |
1160 | return SplitCost + |
1161 | (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, |
1162 | CostKind, I)); |
1163 | } |
1164 | |
1165 | // Scalarization cost is Invalid, can't assume any num elements. |
if (isa<ScalableVectorType>(DstVTy))
1167 | return InstructionCost::getInvalid(); |
1168 | |
1169 | // In other cases where the source or destination are illegal, assume |
1170 | // the operation will get scalarized. |
unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1172 | InstructionCost Cost = thisT()->getCastInstrCost( |
1173 | Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); |
1174 | |
1175 | // Return the cost of multiple scalar invocation plus the cost of |
1176 | // inserting and extracting the values. |
1177 | return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true, |
1178 | CostKind) + |
1179 | Num * Cost; |
1180 | } |
1181 | |
// We already handled vector-to-vector and scalar-to-scalar conversions.
// This is where we handle bitcast between vectors and scalars. We need to
// assume that the conversion is scalarized in one way or another.
1186 | if (Opcode == Instruction::BitCast) { |
1187 | // Illegal bitcasts are done by storing and loading from a stack slot. |
1188 | return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false, |
1189 | /*Extract*/ true, CostKind) |
1190 | : 0) + |
1191 | (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true, |
1192 | /*Extract*/ false, CostKind) |
1193 | : 0); |
1194 | } |
1195 | |
1196 | llvm_unreachable("Unhandled cast" ); |
1197 | } |
1198 | |
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
VectorType *VecTy, unsigned Index) {
1201 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
1202 | return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, |
1203 | CostKind, Index, nullptr, nullptr) + |
1204 | thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), |
1205 | TTI::CastContextHint::None, CostKind); |
1206 | } |
1207 | |
1208 | InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, |
1209 | const Instruction *I = nullptr) { |
1210 | return BaseT::getCFInstrCost(Opcode, CostKind, I); |
1211 | } |
1212 | |
1213 | InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, |
1214 | CmpInst::Predicate VecPred, |
1215 | TTI::TargetCostKind CostKind, |
1216 | const Instruction *I = nullptr) { |
1217 | const TargetLoweringBase *TLI = getTLI(); |
1218 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1219 | assert(ISD && "Invalid opcode" ); |
1220 | |
1221 | // TODO: Handle other cost kinds. |
1222 | if (CostKind != TTI::TCK_RecipThroughput) |
1223 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1224 | I); |
1225 | |
1226 | // Selects on vectors are actually vector selects. |
1227 | if (ISD == ISD::SELECT) { |
1228 | assert(CondTy && "CondTy must exist" ); |
1229 | if (CondTy->isVectorTy()) |
1230 | ISD = ISD::VSELECT; |
1231 | } |
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
!TLI->isOperationExpand(ISD, LT.second)) {
1236 | // The operation is legal. Assume it costs 1. Multiply |
1237 | // by the type-legalization overhead. |
1238 | return LT.first * 1; |
1239 | } |
1240 | |
1241 | // Otherwise, assume that the cast is scalarized. |
1242 | // TODO: If one of the types get legalized by splitting, handle this |
1243 | // similarly to what getCastInstrCost() does. |
if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
if (isa<ScalableVectorType>(ValTy))
return InstructionCost::getInvalid();

unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1249 | if (CondTy) |
1250 | CondTy = CondTy->getScalarType(); |
1251 | InstructionCost Cost = thisT()->getCmpSelInstrCost( |
1252 | Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); |
1253 | |
1254 | // Return the cost of multiple scalar invocation plus the cost of |
1255 | // inserting and extracting the values. |
1256 | return getScalarizationOverhead(ValVTy, /*Insert*/ true, |
1257 | /*Extract*/ false, CostKind) + |
1258 | Num * Cost; |
1259 | } |
1260 | |
1261 | // Unknown scalar opcode. |
1262 | return 1; |
1263 | } |
1264 | |
1265 | InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, |
1266 | TTI::TargetCostKind CostKind, |
1267 | unsigned Index, Value *Op0, Value *Op1) { |
return getRegUsageForType(Val->getScalarType());
1269 | } |
1270 | |
1271 | InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, |
1272 | TTI::TargetCostKind CostKind, |
1273 | unsigned Index) { |
1274 | Value *Op0 = nullptr; |
1275 | Value *Op1 = nullptr; |
if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
Op0 = IE->getOperand(0);
Op1 = IE->getOperand(1);
1279 | } |
1280 | return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0, |
1281 | Op1); |
1282 | } |
1283 | |
1284 | InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, |
1285 | int VF, |
1286 | const APInt &DemandedDstElts, |
1287 | TTI::TargetCostKind CostKind) { |
assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
"Unexpected size of DemandedDstElts.");
1290 | |
1291 | InstructionCost Cost; |
1292 | |
auto *SrcVT = FixedVectorType::get(EltTy, VF);
auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1295 | |
1296 | // The Mask shuffling cost is extract all the elements of the Mask |
1297 | // and insert each of them Factor times into the wide vector: |
1298 | // |
1299 | // E.g. an interleaved group with factor 3: |
1300 | // %mask = icmp ult <8 x i32> %vec1, %vec2 |
1301 | // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, |
1302 | // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> |
1303 | // The cost is estimated as extract all mask elements from the <8xi1> mask |
1304 | // vector and insert them factor times into the <24xi1> shuffled mask |
1305 | // vector. |
APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1307 | Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts, |
1308 | /*Insert*/ false, |
1309 | /*Extract*/ true, CostKind); |
1310 | Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts, |
1311 | /*Insert*/ true, |
1312 | /*Extract*/ false, CostKind); |
1313 | |
1314 | return Cost; |
1315 | } |
1316 | |
1317 | InstructionCost |
1318 | getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, |
1319 | unsigned AddressSpace, TTI::TargetCostKind CostKind, |
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
const Instruction *I = nullptr) {
assert(!Src->isVoidTy() && "Invalid type");
// Assume types, such as structs, are expensive.
if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
return 4;
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1327 | |
1328 | // Assuming that all loads of legal types cost 1. |
1329 | InstructionCost Cost = LT.first; |
1330 | if (CostKind != TTI::TCK_RecipThroughput) |
1331 | return Cost; |
1332 | |
1333 | const DataLayout &DL = this->getDataLayout(); |
1334 | if (Src->isVectorTy() && |
1335 | // In practice it's not currently possible to have a change in lane |
1336 | // length for extending loads or truncating stores so both types should |
1337 | // have the same scalable property. |
TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
LT.second.getSizeInBits())) {
1340 | // This is a vector load that legalizes to a larger type than the vector |
1341 | // itself. Unless the corresponding extending load or truncating store is |
1342 | // legal, then this will scalarize. |
1343 | TargetLowering::LegalizeAction LA = TargetLowering::Expand; |
1344 | EVT MemVT = getTLI()->getValueType(DL, Src); |
1345 | if (Opcode == Instruction::Store) |
1346 | LA = getTLI()->getTruncStoreAction(LT.second, MemVT); |
1347 | else |
1348 | LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); |
1349 | |
1350 | if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { |
1351 | // This is a vector load/store for some illegal type that is scalarized. |
1352 | // We must account for the cost of building or decomposing the vector. |
Cost += getScalarizationOverhead(
cast<VectorType>(Src), Opcode != Instruction::Store,
Opcode == Instruction::Store, CostKind);
1356 | } |
1357 | } |
1358 | |
1359 | return Cost; |
1360 | } |
1361 | |
1362 | InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, |
1363 | Align Alignment, unsigned AddressSpace, |
1364 | TTI::TargetCostKind CostKind) { |
return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment,
/*VariableMask=*/true,
/*IsGatherScatter=*/false, CostKind);
1367 | } |
1368 | |
1369 | InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, |
1370 | const Value *Ptr, bool VariableMask, |
1371 | Align Alignment, |
1372 | TTI::TargetCostKind CostKind, |
1373 | const Instruction *I = nullptr) { |
return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
/*IsGatherScatter=*/true, CostKind);
1376 | } |
1377 | |
1378 | InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, |
1379 | const Value *Ptr, bool VariableMask, |
1380 | Align Alignment, |
1381 | TTI::TargetCostKind CostKind, |
1382 | const Instruction *I) { |
1383 | // For a target without strided memory operations (or for an illegal |
1384 | // operation type on one which does), assume we lower to a gather/scatter |
1385 | // operation. (Which may in turn be scalarized.) |
1386 | return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
1387 | Alignment, CostKind, I); |
1388 | } |
1389 | |
1390 | InstructionCost getInterleavedMemoryOpCost( |
1391 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
1392 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
1393 | bool UseMaskForCond = false, bool UseMaskForGaps = false) { |
1394 | |
1395 | // We cannot scalarize scalable vectors, so return Invalid. |
    if (isa<ScalableVectorType>(VecTy))
1397 | return InstructionCost::getInvalid(); |
1398 | |
    auto *VT = cast<FixedVectorType>(VecTy);
1400 | |
1401 | unsigned NumElts = VT->getNumElements(); |
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1403 | |
1404 | unsigned NumSubElts = NumElts / Factor; |
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1406 | |
1407 | // Firstly, the cost of load/store operation. |
1408 | InstructionCost Cost; |
1409 | if (UseMaskForCond || UseMaskForGaps) |
1410 | Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, |
1411 | AddressSpace, CostKind); |
1412 | else |
1413 | Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, |
1414 | CostKind); |
1415 | |
1416 | // Legalize the vector type, and get the legalized and unlegalized type |
1417 | // sizes. |
    MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1419 | unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); |
1420 | unsigned VecTyLTSize = VecTyLT.getStoreSize(); |
1421 | |
1422 | // Scale the cost of the memory operation by the fraction of legalized |
1423 | // instructions that will actually be used. We shouldn't account for the |
1424 | // cost of dead instructions since they will be removed. |
1425 | // |
1426 | // E.g., An interleaved load of factor 8: |
1427 | // %vec = load <16 x i64>, <16 x i64>* %ptr |
1428 | // %v0 = shufflevector %vec, undef, <0, 8> |
1429 | // |
1430 | // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be |
1431 | // used (those corresponding to elements [0:1] and [8:9] of the unlegalized |
1432 | // type). The other loads are unused. |
1433 | // |
1434 | // TODO: Note that legalization can turn masked loads/stores into unmasked |
1435 | // (legalized) loads/stores. This can be reflected in the cost. |
1436 | if (Cost.isValid() && VecTySize > VecTyLTSize) { |
1437 | // The number of loads of a legal type it will take to represent a load |
1438 | // of the unlegalized vector type. |
      unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1440 | |
1441 | // The number of elements of the unlegalized type that correspond to a |
1442 | // single legal instruction. |
      unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1444 | |
1445 | // Determine which legal instructions will be used. |
1446 | BitVector UsedInsts(NumLegalInsts, false); |
1447 | for (unsigned Index : Indices) |
1448 | for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) |
1449 | UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); |
1450 | |
1451 | // Scale the cost of the load by the fraction of legal instructions that |
1452 | // will be used. |
      Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1454 | } |
1455 | |
    // Next, add the cost of the interleave operation itself.
    assert(Indices.size() <= Factor &&
           "Interleaved memory op has too many members");
1459 | |
    const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
    const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1462 | |
    APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1464 | for (unsigned Index : Indices) { |
      assert(Index < Factor && "Invalid index for interleaved memory op");
1466 | for (unsigned Elm = 0; Elm < NumSubElts; Elm++) |
1467 | DemandedLoadStoreElts.setBit(Index + Elm * Factor); |
1468 | } |
1469 | |
1470 | if (Opcode == Instruction::Load) { |
1471 | // The interleave cost is similar to extract sub vectors' elements |
1472 | // from the wide vector, and insert them into sub vectors. |
1473 | // |
1474 | // E.g. An interleaved load of factor 2 (with one member of index 0): |
1475 | // %vec = load <8 x i32>, <8 x i32>* %ptr |
1476 | // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 |
1477 | // The cost is estimated as extract elements at 0, 2, 4, 6 from the |
1478 | // <8 x i32> vector and insert them into a <4 x i32> vector. |
1479 | InstructionCost InsSubCost = thisT()->getScalarizationOverhead( |
1480 | SubVT, DemandedAllSubElts, |
1481 | /*Insert*/ true, /*Extract*/ false, CostKind); |
1482 | Cost += Indices.size() * InsSubCost; |
1483 | Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, |
1484 | /*Insert*/ false, |
1485 | /*Extract*/ true, CostKind); |
1486 | } else { |
1487 | // The interleave cost is extract elements from sub vectors, and |
1488 | // insert them into the wide vector. |
1489 | // |
1490 | // E.g. An interleaved store of factor 3 with 2 members at indices 0,1: |
1491 | // (using VF=4): |
1492 | // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef> |
1493 | // %gaps.mask = <true, true, false, true, true, false, |
1494 | // true, true, false, true, true, false> |
1495 | // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr, |
1496 | // i32 Align, <12 x i1> %gaps.mask |
1497 | // The cost is estimated as extract all elements (of actual members, |
1498 | // excluding gaps) from both <4 x i32> vectors and insert into the <12 x |
1499 | // i32> vector. |
1500 | InstructionCost ExtSubCost = thisT()->getScalarizationOverhead( |
1501 | SubVT, DemandedAllSubElts, |
1502 | /*Insert*/ false, /*Extract*/ true, CostKind); |
1503 | Cost += ExtSubCost * Indices.size(); |
1504 | Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, |
1505 | /*Insert*/ true, |
1506 | /*Extract*/ false, CostKind); |
1507 | } |
1508 | |
1509 | if (!UseMaskForCond) |
1510 | return Cost; |
1511 | |
    Type *I8Type = Type::getInt8Ty(VT->getContext());
1513 | |
1514 | Cost += thisT()->getReplicationShuffleCost( |
1515 | I8Type, Factor, NumSubElts, |
1516 | UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts, |
1517 | CostKind); |
1518 | |
1519 | // The Gaps mask is invariant and created outside the loop, therefore the |
1520 | // cost of creating it is not accounted for here. However if we have both |
1521 | // a MaskForGaps and some other mask that guards the execution of the |
1522 | // memory access, we need to account for the cost of And-ing the two masks |
1523 | // inside the loop. |
1524 | if (UseMaskForGaps) { |
      auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1526 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, |
1527 | CostKind); |
1528 | } |
1529 | |
1530 | return Cost; |
1531 | } |
1532 | |
1533 | /// Get intrinsic cost based on arguments. |
1534 | InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
1535 | TTI::TargetCostKind CostKind) { |
1536 | // Check for generically free intrinsics. |
1537 | if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) |
1538 | return 0; |
1539 | |
1540 | // Assume that target intrinsics are cheap. |
1541 | Intrinsic::ID IID = ICA.getID(); |
1542 | if (Function::isTargetIntrinsic(IID)) |
1543 | return TargetTransformInfo::TCC_Basic; |
1544 | |
1545 | if (ICA.isTypeBasedOnly()) |
1546 | return getTypeBasedIntrinsicInstrCost(ICA, CostKind); |
1547 | |
1548 | Type *RetTy = ICA.getReturnType(); |
1549 | |
1550 | ElementCount RetVF = |
        (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
                             : ElementCount::getFixed(1));
1553 | const IntrinsicInst *I = ICA.getInst(); |
1554 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
1555 | FastMathFlags FMF = ICA.getFlags(); |
1556 | switch (IID) { |
1557 | default: |
1558 | break; |
1559 | |
1560 | case Intrinsic::powi: |
      if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1562 | bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize(); |
1563 | if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(), |
1564 | ShouldOptForSize)) { |
1565 | // The cost is modeled on the expansion performed by ExpandPowI in |
1566 | // SelectionDAGBuilder. |
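          // E.g. for powi(X, 11): 11 = 0b1011, so ActiveBits = 4 and
          // PopCount = 3, giving (4 + 3 - 2) = 5 fmuls, which matches a
          // square-and-multiply sequence of 3 squarings plus 2 multiplies
          // (an illustrative count, not the exact emitted sequence).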
1567 | APInt Exponent = RHSC->getValue().abs(); |
1568 | unsigned ActiveBits = Exponent.getActiveBits(); |
1569 | unsigned PopCount = Exponent.popcount(); |
1570 | InstructionCost Cost = (ActiveBits + PopCount - 2) * |
1571 | thisT()->getArithmeticInstrCost( |
1572 | Instruction::FMul, RetTy, CostKind); |
1573 | if (RHSC->isNegative()) |
1574 | Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy, |
1575 | CostKind); |
1576 | return Cost; |
1577 | } |
1578 | } |
1579 | break; |
1580 | case Intrinsic::cttz: |
1581 | // FIXME: If necessary, this should go in target-specific overrides. |
1582 | if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy)) |
1583 | return TargetTransformInfo::TCC_Basic; |
1584 | break; |
1585 | |
1586 | case Intrinsic::ctlz: |
1587 | // FIXME: If necessary, this should go in target-specific overrides. |
1588 | if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy)) |
1589 | return TargetTransformInfo::TCC_Basic; |
1590 | break; |
1591 | |
1592 | case Intrinsic::memcpy: |
1593 | return thisT()->getMemcpyCost(ICA.getInst()); |
1594 | |
1595 | case Intrinsic::masked_scatter: { |
1596 | const Value *Mask = Args[3]; |
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1599 | return thisT()->getGatherScatterOpCost(Instruction::Store, |
1600 | ICA.getArgTypes()[0], Args[1], |
1601 | VarMask, Alignment, CostKind, I); |
1602 | } |
1603 | case Intrinsic::masked_gather: { |
1604 | const Value *Mask = Args[2]; |
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1607 | return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], |
1608 | VarMask, Alignment, CostKind, I); |
1609 | } |
1610 | case Intrinsic::experimental_vp_strided_store: { |
1611 | const Value *Data = Args[0]; |
1612 | const Value *Ptr = Args[1]; |
1613 | const Value *Mask = Args[3]; |
1614 | const Value *EVL = Args[4]; |
      bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
      Align Alignment = I->getParamAlign(1).valueOrOne();
1617 | return thisT()->getStridedMemoryOpCost(Instruction::Store, |
1618 | Data->getType(), Ptr, VarMask, |
1619 | Alignment, CostKind, I); |
1620 | } |
1621 | case Intrinsic::experimental_vp_strided_load: { |
1622 | const Value *Ptr = Args[0]; |
1623 | const Value *Mask = Args[2]; |
1624 | const Value *EVL = Args[3]; |
      bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
      Align Alignment = I->getParamAlign(0).valueOrOne();
1627 | return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr, |
1628 | VarMask, Alignment, CostKind, I); |
1629 | } |
1630 | case Intrinsic::experimental_stepvector: { |
      if (isa<ScalableVectorType>(RetTy))
1632 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1633 | // The cost of materialising a constant integer vector. |
1634 | return TargetTransformInfo::TCC_Basic; |
1635 | } |
1636 | case Intrinsic::vector_extract: { |
1637 | // FIXME: Handle case where a scalable vector is extracted from a scalable |
1638 | // vector |
      if (isa<ScalableVectorType>(RetTy))
1640 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
      unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
          std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1645 | } |
1646 | case Intrinsic::vector_insert: { |
1647 | // FIXME: Handle case where a scalable vector is inserted into a scalable |
1648 | // vector |
      if (isa<ScalableVectorType>(Args[1]->getType()))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
          std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1655 | } |
1656 | case Intrinsic::experimental_vector_reverse: { |
      return thisT()->getShuffleCost(
          TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
          CostKind, 0, cast<VectorType>(RetTy));
1660 | } |
1661 | case Intrinsic::experimental_vector_splice: { |
      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
          CostKind, Index, cast<VectorType>(RetTy));
1666 | } |
1667 | case Intrinsic::vector_reduce_add: |
1668 | case Intrinsic::vector_reduce_mul: |
1669 | case Intrinsic::vector_reduce_and: |
1670 | case Intrinsic::vector_reduce_or: |
1671 | case Intrinsic::vector_reduce_xor: |
1672 | case Intrinsic::vector_reduce_smax: |
1673 | case Intrinsic::vector_reduce_smin: |
1674 | case Intrinsic::vector_reduce_fmax: |
1675 | case Intrinsic::vector_reduce_fmin: |
1676 | case Intrinsic::vector_reduce_fmaximum: |
1677 | case Intrinsic::vector_reduce_fminimum: |
1678 | case Intrinsic::vector_reduce_umax: |
1679 | case Intrinsic::vector_reduce_umin: { |
1680 | IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); |
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1682 | } |
1683 | case Intrinsic::vector_reduce_fadd: |
1684 | case Intrinsic::vector_reduce_fmul: { |
1685 | IntrinsicCostAttributes Attrs( |
1686 | IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); |
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1688 | } |
1689 | case Intrinsic::fshl: |
1690 | case Intrinsic::fshr: { |
1691 | const Value *X = Args[0]; |
1692 | const Value *Y = Args[1]; |
1693 | const Value *Z = Args[2]; |
      const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
      const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
      const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
      const TTI::OperandValueInfo OpInfoBW =
          {TTI::OK_UniformConstantValue,
           isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
                                                       : TTI::OP_None};
1701 | |
1702 | // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) |
1703 | // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) |
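      // E.g. for i8 and a constant shift: fshl(X, Y, 3) == (X << 3) | (Y >> 5)
      // (an illustrative instance of the expansion above).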
1704 | InstructionCost Cost = 0; |
1705 | Cost += |
1706 | thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); |
1707 | Cost += |
1708 | thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); |
1709 | Cost += thisT()->getArithmeticInstrCost( |
1710 | BinaryOperator::Shl, RetTy, CostKind, OpInfoX, |
1711 | {OpInfoZ.Kind, TTI::OP_None}); |
1712 | Cost += thisT()->getArithmeticInstrCost( |
1713 | BinaryOperator::LShr, RetTy, CostKind, OpInfoY, |
1714 | {OpInfoZ.Kind, TTI::OP_None}); |
      // Non-constant shift amounts require a modulo.
1716 | if (!OpInfoZ.isConstant()) |
1717 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, |
1718 | CostKind, OpInfoZ, OpInfoBW); |
1719 | // For non-rotates (X != Y) we must add shift-by-zero handling costs. |
1720 | if (X != Y) { |
        Type *CondTy = RetTy->getWithNewBitWidth(1);
1722 | Cost += |
1723 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
1724 | CmpInst::ICMP_EQ, CostKind); |
1725 | Cost += |
1726 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
1727 | CmpInst::ICMP_EQ, CostKind); |
1728 | } |
1729 | return Cost; |
1730 | } |
1731 | case Intrinsic::get_active_lane_mask: { |
1732 | EVT ResVT = getTLI()->getValueType(DL, RetTy, true); |
1733 | EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); |
1734 | |
1735 | // If we're not expanding the intrinsic then we assume this is cheap |
1736 | // to implement. |
1737 | if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) { |
        return getTypeLegalizationCost(RetTy).first;
1739 | } |
1740 | |
1741 | // Create the expanded types that will be used to calculate the uadd_sat |
1742 | // operation. |
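      // The expansion being costed is roughly (illustrative):
      //   %idx  = uadd_sat(splat(%base), stepvector)
      //   %mask = icmp ult %idx, splat(%tripcount)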
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1745 | IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF); |
1746 | InstructionCost Cost = |
1747 | thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); |
1748 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, |
1749 | CmpInst::ICMP_ULT, CostKind); |
1750 | return Cost; |
1751 | } |
1752 | } |
1753 | |
1754 | // VP Intrinsics should have the same cost as their non-vp counterpart. |
1755 | // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp |
1756 | // counterpart when the vector length argument is smaller than the maximum |
1757 | // vector length. |
1758 | // TODO: Support other kinds of VPIntrinsics |
1759 | if (VPIntrinsic::isVPIntrinsic(ICA.getID())) { |
1760 | std::optional<unsigned> FOp = |
          VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
1762 | if (FOp) { |
1763 | if (ICA.getID() == Intrinsic::vp_load) { |
1764 | Align Alignment; |
          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1766 | Alignment = VPI->getPointerAlignment().valueOrOne(); |
1767 | unsigned AS = 0; |
1768 | if (ICA.getArgs().size() > 1) |
            if (auto *PtrTy =
                    dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
1771 | AS = PtrTy->getAddressSpace(); |
1772 | return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment, |
1773 | AS, CostKind); |
1774 | } |
1775 | if (ICA.getID() == Intrinsic::vp_store) { |
1776 | Align Alignment; |
          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1778 | Alignment = VPI->getPointerAlignment().valueOrOne(); |
1779 | unsigned AS = 0; |
1780 | if (ICA.getArgs().size() >= 2) |
            if (auto *PtrTy =
                    dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
1783 | AS = PtrTy->getAddressSpace(); |
1784 | return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment, |
1785 | AS, CostKind); |
1786 | } |
        if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
1788 | return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(), |
1789 | CostKind); |
1790 | } |
1791 | } |
1792 | |
1793 | std::optional<Intrinsic::ID> FID = |
          VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
1795 | if (FID) { |
1796 | // Non-vp version will have same Args/Tys except mask and vector length. |
1797 | assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 && |
1798 | "Expected VPIntrinsic to have Mask and Vector Length args and " |
1799 | "types" ); |
1800 | ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(N: 2); |
1801 | |
      // VPReduction intrinsics have a start value argument that their non-vp
      // counterparts do not have, except for fadd and fmul, whose non-vp
      // counterparts do take a start value.
      if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
          *FID != Intrinsic::vector_reduce_fadd &&
          *FID != Intrinsic::vector_reduce_fmul)
        NewTys = NewTys.drop_front();
1809 | |
1810 | IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys, |
1811 | ICA.getFlags()); |
1812 | return thisT()->getIntrinsicInstrCost(NewICA, CostKind); |
1813 | } |
1814 | } |
1815 | |
    // Assume that we need to scalarize this intrinsic.
1817 | // Compute the scalarization overhead based on Args for a vector |
1818 | // intrinsic. |
1819 | InstructionCost ScalarizationCost = InstructionCost::getInvalid(); |
1820 | if (RetVF.isVector() && !RetVF.isScalable()) { |
1821 | ScalarizationCost = 0; |
1822 | if (!RetTy->isVoidTy()) |
1823 | ScalarizationCost += getScalarizationOverhead( |
            cast<VectorType>(RetTy),
1825 | /*Insert*/ true, /*Extract*/ false, CostKind); |
1826 | ScalarizationCost += |
          getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1828 | } |
1829 | |
1830 | IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I, |
1831 | ScalarizationCost); |
1832 | return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); |
1833 | } |
1834 | |
1835 | /// Get intrinsic cost based on argument types. |
1836 | /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the |
1837 | /// cost of scalarizing the arguments and the return value will be computed |
1838 | /// based on types. |
1839 | InstructionCost |
1840 | getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
1841 | TTI::TargetCostKind CostKind) { |
1842 | Intrinsic::ID IID = ICA.getID(); |
1843 | Type *RetTy = ICA.getReturnType(); |
1844 | const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes(); |
1845 | FastMathFlags FMF = ICA.getFlags(); |
1846 | InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost(); |
1847 | bool SkipScalarizationCost = ICA.skipScalarizationCost(); |
1848 | |
1849 | VectorType *VecOpTy = nullptr; |
1850 | if (!Tys.empty()) { |
1851 | // The vector reduction operand is operand 0 except for fadd/fmul. |
1852 | // Their operand 0 is a scalar start value, so the vector op is operand 1. |
1853 | unsigned VecTyIndex = 0; |
1854 | if (IID == Intrinsic::vector_reduce_fadd || |
1855 | IID == Intrinsic::vector_reduce_fmul) |
1856 | VecTyIndex = 1; |
      assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
      VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1859 | } |
1860 | |
1861 | // Library call cost - other than size, make it expensive. |
1862 | unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; |
1863 | unsigned ISD = 0; |
1864 | switch (IID) { |
1865 | default: { |
1866 | // Scalable vectors cannot be scalarized, so return Invalid. |
      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
            return isa<ScalableVectorType>(Ty);
1869 | })) |
1870 | return InstructionCost::getInvalid(); |
1871 | |
1872 | // Assume that we need to scalarize this intrinsic. |
1873 | InstructionCost ScalarizationCost = |
1874 | SkipScalarizationCost ? ScalarizationCostPassed : 0; |
1875 | unsigned ScalarCalls = 1; |
1876 | Type *ScalarRetTy = RetTy; |
      if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1878 | if (!SkipScalarizationCost) |
1879 | ScalarizationCost = getScalarizationOverhead( |
1880 | RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); |
        ScalarCalls = std::max(ScalarCalls,
                               cast<FixedVectorType>(RetVTy)->getNumElements());
1883 | ScalarRetTy = RetTy->getScalarType(); |
1884 | } |
1885 | SmallVector<Type *, 4> ScalarTys; |
1886 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
1887 | Type *Ty = Tys[i]; |
        if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1889 | if (!SkipScalarizationCost) |
1890 | ScalarizationCost += getScalarizationOverhead( |
1891 | VTy, /*Insert*/ false, /*Extract*/ true, CostKind); |
          ScalarCalls = std::max(ScalarCalls,
                                 cast<FixedVectorType>(VTy)->getNumElements());
1894 | Ty = Ty->getScalarType(); |
1895 | } |
        ScalarTys.push_back(Ty);
1897 | } |
1898 | if (ScalarCalls == 1) |
1899 | return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. |
1900 | |
1901 | IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); |
1902 | InstructionCost ScalarCost = |
1903 | thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); |
1904 | |
1905 | return ScalarCalls * ScalarCost + ScalarizationCost; |
1906 | } |
1907 | // Look for intrinsics that can be lowered directly or turned into a scalar |
1908 | // intrinsic call. |
1909 | case Intrinsic::sqrt: |
1910 | ISD = ISD::FSQRT; |
1911 | break; |
1912 | case Intrinsic::sin: |
1913 | ISD = ISD::FSIN; |
1914 | break; |
1915 | case Intrinsic::cos: |
1916 | ISD = ISD::FCOS; |
1917 | break; |
1918 | case Intrinsic::exp: |
1919 | ISD = ISD::FEXP; |
1920 | break; |
1921 | case Intrinsic::exp2: |
1922 | ISD = ISD::FEXP2; |
1923 | break; |
1924 | case Intrinsic::exp10: |
1925 | ISD = ISD::FEXP10; |
1926 | break; |
1927 | case Intrinsic::log: |
1928 | ISD = ISD::FLOG; |
1929 | break; |
1930 | case Intrinsic::log10: |
1931 | ISD = ISD::FLOG10; |
1932 | break; |
1933 | case Intrinsic::log2: |
1934 | ISD = ISD::FLOG2; |
1935 | break; |
1936 | case Intrinsic::fabs: |
1937 | ISD = ISD::FABS; |
1938 | break; |
1939 | case Intrinsic::canonicalize: |
1940 | ISD = ISD::FCANONICALIZE; |
1941 | break; |
1942 | case Intrinsic::minnum: |
1943 | ISD = ISD::FMINNUM; |
1944 | break; |
1945 | case Intrinsic::maxnum: |
1946 | ISD = ISD::FMAXNUM; |
1947 | break; |
1948 | case Intrinsic::minimum: |
1949 | ISD = ISD::FMINIMUM; |
1950 | break; |
1951 | case Intrinsic::maximum: |
1952 | ISD = ISD::FMAXIMUM; |
1953 | break; |
1954 | case Intrinsic::copysign: |
1955 | ISD = ISD::FCOPYSIGN; |
1956 | break; |
1957 | case Intrinsic::floor: |
1958 | ISD = ISD::FFLOOR; |
1959 | break; |
1960 | case Intrinsic::ceil: |
1961 | ISD = ISD::FCEIL; |
1962 | break; |
1963 | case Intrinsic::trunc: |
1964 | ISD = ISD::FTRUNC; |
1965 | break; |
1966 | case Intrinsic::nearbyint: |
1967 | ISD = ISD::FNEARBYINT; |
1968 | break; |
1969 | case Intrinsic::rint: |
1970 | ISD = ISD::FRINT; |
1971 | break; |
1972 | case Intrinsic::lrint: |
1973 | ISD = ISD::LRINT; |
1974 | break; |
1975 | case Intrinsic::llrint: |
1976 | ISD = ISD::LLRINT; |
1977 | break; |
1978 | case Intrinsic::round: |
1979 | ISD = ISD::FROUND; |
1980 | break; |
1981 | case Intrinsic::roundeven: |
1982 | ISD = ISD::FROUNDEVEN; |
1983 | break; |
1984 | case Intrinsic::pow: |
1985 | ISD = ISD::FPOW; |
1986 | break; |
1987 | case Intrinsic::fma: |
1988 | ISD = ISD::FMA; |
1989 | break; |
1990 | case Intrinsic::fmuladd: |
1991 | ISD = ISD::FMA; |
1992 | break; |
1993 | case Intrinsic::experimental_constrained_fmuladd: |
1994 | ISD = ISD::STRICT_FMA; |
1995 | break; |
1996 | // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. |
1997 | case Intrinsic::lifetime_start: |
1998 | case Intrinsic::lifetime_end: |
1999 | case Intrinsic::sideeffect: |
2000 | case Intrinsic::pseudoprobe: |
2001 | case Intrinsic::arithmetic_fence: |
2002 | return 0; |
2003 | case Intrinsic::masked_store: { |
2004 | Type *Ty = Tys[0]; |
2005 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); |
2006 | return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, |
2007 | CostKind); |
2008 | } |
2009 | case Intrinsic::masked_load: { |
2010 | Type *Ty = RetTy; |
2011 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); |
2012 | return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, |
2013 | CostKind); |
2014 | } |
2015 | case Intrinsic::vector_reduce_add: |
2016 | return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, |
2017 | std::nullopt, CostKind); |
2018 | case Intrinsic::vector_reduce_mul: |
2019 | return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, |
2020 | std::nullopt, CostKind); |
2021 | case Intrinsic::vector_reduce_and: |
2022 | return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, |
2023 | std::nullopt, CostKind); |
2024 | case Intrinsic::vector_reduce_or: |
2025 | return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, |
2026 | std::nullopt, CostKind); |
2027 | case Intrinsic::vector_reduce_xor: |
2028 | return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, |
2029 | std::nullopt, CostKind); |
2030 | case Intrinsic::vector_reduce_fadd: |
2031 | return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, |
2032 | FMF, CostKind); |
2033 | case Intrinsic::vector_reduce_fmul: |
2034 | return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, |
2035 | FMF, CostKind); |
2036 | case Intrinsic::vector_reduce_smax: |
2037 | return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy, |
2038 | ICA.getFlags(), CostKind); |
2039 | case Intrinsic::vector_reduce_smin: |
2040 | return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy, |
2041 | ICA.getFlags(), CostKind); |
2042 | case Intrinsic::vector_reduce_umax: |
2043 | return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy, |
2044 | ICA.getFlags(), CostKind); |
2045 | case Intrinsic::vector_reduce_umin: |
2046 | return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy, |
2047 | ICA.getFlags(), CostKind); |
2048 | case Intrinsic::vector_reduce_fmax: |
2049 | return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy, |
2050 | ICA.getFlags(), CostKind); |
2051 | case Intrinsic::vector_reduce_fmin: |
2052 | return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy, |
2053 | ICA.getFlags(), CostKind); |
2054 | case Intrinsic::vector_reduce_fmaximum: |
2055 | return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy, |
2056 | ICA.getFlags(), CostKind); |
2057 | case Intrinsic::vector_reduce_fminimum: |
2058 | return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy, |
2059 | ICA.getFlags(), CostKind); |
2060 | case Intrinsic::abs: { |
2061 | // abs(X) = select(icmp(X,0),X,sub(0,X)) |
      Type *CondTy = RetTy->getWithNewBitWidth(1);
2063 | CmpInst::Predicate Pred = CmpInst::ICMP_SGT; |
2064 | InstructionCost Cost = 0; |
2065 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
2066 | Pred, CostKind); |
2067 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
2068 | Pred, CostKind); |
2069 | // TODO: Should we add an OperandValueProperties::OP_Zero property? |
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::Sub, RetTy, CostKind,
          {TTI::OK_UniformConstantValue, TTI::OP_None});
2072 | return Cost; |
2073 | } |
2074 | case Intrinsic::smax: |
2075 | case Intrinsic::smin: |
2076 | case Intrinsic::umax: |
2077 | case Intrinsic::umin: { |
2078 | // minmax(X,Y) = select(icmp(X,Y),X,Y) |
      Type *CondTy = RetTy->getWithNewBitWidth(1);
2080 | bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; |
2081 | CmpInst::Predicate Pred = |
2082 | IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; |
2083 | InstructionCost Cost = 0; |
2084 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
2085 | Pred, CostKind); |
2086 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
2087 | Pred, CostKind); |
2088 | return Cost; |
2089 | } |
2090 | case Intrinsic::sadd_sat: |
2091 | case Intrinsic::ssub_sat: { |
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
2095 | Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat |
2096 | ? Intrinsic::sadd_with_overflow |
2097 | : Intrinsic::ssub_with_overflow; |
2098 | CmpInst::Predicate Pred = CmpInst::ICMP_SGT; |
2099 | |
2100 | // SatMax -> Overflow && SumDiff < 0 |
2101 | // SatMin -> Overflow && SumDiff >= 0 |
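      // E.g. for i8: sadd_sat(100, 100) wraps to a negative sum, so it
      // clamps to SatMax = 127 (an illustrative instance of the rule above).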
2102 | InstructionCost Cost = 0; |
2103 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, |
2104 | nullptr, ScalarizationCostPassed); |
2105 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
2106 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
2107 | Pred, CostKind); |
2108 | Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, |
2109 | CondTy, Pred, CostKind); |
2110 | return Cost; |
2111 | } |
2112 | case Intrinsic::uadd_sat: |
2113 | case Intrinsic::usub_sat: { |
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
2117 | Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat |
2118 | ? Intrinsic::uadd_with_overflow |
2119 | : Intrinsic::usub_with_overflow; |
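      // uadd_sat selects UINT_MAX on overflow and usub_sat selects 0, so the
      // expansion being costed is the overflow intrinsic plus one select
      // (a sketch of the generic lowering).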
2120 | |
2121 | InstructionCost Cost = 0; |
2122 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, |
2123 | nullptr, ScalarizationCostPassed); |
2124 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
2125 | Cost += |
2126 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
2127 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
2128 | return Cost; |
2129 | } |
2130 | case Intrinsic::smul_fix: |
2131 | case Intrinsic::umul_fix: { |
2132 | unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; |
      Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2134 | |
2135 | unsigned ExtOp = |
2136 | IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; |
2137 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
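      // A sketch of the expansion being costed (the generic fixed-point
      // lowering): extend both operands to 2x width, multiply, then
      // recombine the scaled halves with lshr/shl/or and truncate back.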
2138 | |
2139 | InstructionCost Cost = 0; |
2140 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); |
2141 | Cost += |
2142 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); |
2143 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, |
2144 | CCH, CostKind); |
      Cost += thisT()->getArithmeticInstrCost(
          Instruction::LShr, RetTy, CostKind,
          {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_UniformConstantValue, TTI::OP_None});
      Cost += thisT()->getArithmeticInstrCost(
          Instruction::Shl, RetTy, CostKind,
          {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_UniformConstantValue, TTI::OP_None});
2152 | Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); |
2153 | return Cost; |
2154 | } |
2155 | case Intrinsic::sadd_with_overflow: |
2156 | case Intrinsic::ssub_with_overflow: { |
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
2159 | unsigned Opcode = IID == Intrinsic::sadd_with_overflow |
2160 | ? BinaryOperator::Add |
2161 | : BinaryOperator::Sub; |
2162 | |
2163 | // Add: |
2164 | // Overflow -> (Result < LHS) ^ (RHS < 0) |
2165 | // Sub: |
2166 | // Overflow -> (Result < LHS) ^ (RHS > 0) |
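      // E.g. for i8: 100 + 100 wraps to -56, so (Result < LHS) is true and
      // (RHS < 0) is false, giving Overflow = true (illustrative).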
2167 | InstructionCost Cost = 0; |
2168 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); |
2169 | Cost += 2 * thisT()->getCmpSelInstrCost( |
2170 | Instruction::ICmp, SumTy, OverflowTy, |
2171 | CmpInst::ICMP_SGT, CostKind); |
2172 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, |
2173 | CostKind); |
2174 | return Cost; |
2175 | } |
2176 | case Intrinsic::uadd_with_overflow: |
2177 | case Intrinsic::usub_with_overflow: { |
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
2180 | unsigned Opcode = IID == Intrinsic::uadd_with_overflow |
2181 | ? BinaryOperator::Add |
2182 | : BinaryOperator::Sub; |
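      // Unsigned add overflows iff the wrapped result is ULT the LHS;
      // unsigned sub overflows iff the result is UGT the LHS, so a single
      // compare suffices.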
2183 | CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow |
2184 | ? CmpInst::ICMP_ULT |
2185 | : CmpInst::ICMP_UGT; |
2186 | |
2187 | InstructionCost Cost = 0; |
2188 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); |
2189 | Cost += |
2190 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, |
2191 | Pred, CostKind); |
2192 | return Cost; |
2193 | } |
2194 | case Intrinsic::smul_with_overflow: |
2195 | case Intrinsic::umul_with_overflow: { |
      Type *MulTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
      Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2200 | bool IsSigned = IID == Intrinsic::smul_with_overflow; |
2201 | |
2202 | unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; |
2203 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
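      // A sketch of the expansion being costed: extend both operands to 2x
      // width, multiply, truncate back, and compare the high half against
      // the (sign-)extension of the low half to detect overflow.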
2204 | |
2205 | InstructionCost Cost = 0; |
2206 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); |
2207 | Cost += |
2208 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); |
2209 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, |
2210 | CCH, CostKind); |
      Cost += thisT()->getArithmeticInstrCost(
          Instruction::LShr, ExtTy, CostKind,
          {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_UniformConstantValue, TTI::OP_None});
2215 | |
2216 | if (IsSigned) |
        Cost += thisT()->getArithmeticInstrCost(
            Instruction::AShr, MulTy, CostKind,
            {TTI::OK_AnyValue, TTI::OP_None},
            {TTI::OK_UniformConstantValue, TTI::OP_None});
2221 | |
2222 | Cost += thisT()->getCmpSelInstrCost( |
2223 | BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); |
2224 | return Cost; |
2225 | } |
2226 | case Intrinsic::fptosi_sat: |
2227 | case Intrinsic::fptoui_sat: { |
2228 | if (Tys.empty()) |
2229 | break; |
2230 | Type *FromTy = Tys[0]; |
2231 | bool IsSigned = IID == Intrinsic::fptosi_sat; |
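      // A sketch of the expansion being costed: clamp the input to the
      // representable range with minnum/maxnum, convert with fptosi/fptoui,
      // and (for the signed case) select 0 when the input is NaN.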
2232 | |
2233 | InstructionCost Cost = 0; |
2234 | IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, |
2235 | {FromTy, FromTy}); |
2236 | Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); |
2237 | IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, |
2238 | {FromTy, FromTy}); |
2239 | Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); |
2240 | Cost += thisT()->getCastInstrCost( |
2241 | IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, |
2242 | TTI::CastContextHint::None, CostKind); |
2243 | if (IsSigned) { |
        Type *CondTy = RetTy->getWithNewBitWidth(1);
2245 | Cost += thisT()->getCmpSelInstrCost( |
2246 | BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); |
2247 | Cost += thisT()->getCmpSelInstrCost( |
2248 | BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); |
2249 | } |
2250 | return Cost; |
2251 | } |
2252 | case Intrinsic::ctpop: |
2253 | ISD = ISD::CTPOP; |
2254 | // In case of legalization use TCC_Expensive. This is cheaper than a |
2255 | // library call but still not a cheap instruction. |
2256 | SingleCallCost = TargetTransformInfo::TCC_Expensive; |
2257 | break; |
2258 | case Intrinsic::ctlz: |
2259 | ISD = ISD::CTLZ; |
2260 | break; |
2261 | case Intrinsic::cttz: |
2262 | ISD = ISD::CTTZ; |
2263 | break; |
2264 | case Intrinsic::bswap: |
2265 | ISD = ISD::BSWAP; |
2266 | break; |
2267 | case Intrinsic::bitreverse: |
2268 | ISD = ISD::BITREVERSE; |
2269 | break; |
2270 | } |
2271 | |
2272 | const TargetLoweringBase *TLI = getTLI(); |
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2274 | |
    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2276 | if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && |
2277 | TLI->isFAbsFree(LT.second)) { |
2278 | return 0; |
2279 | } |
2280 | |
2281 | // The operation is legal. Assume it costs 1. |
2282 | // If the type is split to multiple registers, assume that there is some |
2283 | // overhead to this. |
2284 | // TODO: Once we have extract/insert subvector cost we need to use them. |
2285 | if (LT.first > 1) |
2286 | return (LT.first * 2); |
2287 | else |
2288 | return (LT.first * 1); |
    } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2290 | // If the operation is custom lowered then assume |
2291 | // that the code is twice as expensive. |
2292 | return (LT.first * 2); |
2293 | } |
2294 | |
2295 | // If we can't lower fmuladd into an FMA estimate the cost as a floating |
2296 | // point mul followed by an add. |
2297 | if (IID == Intrinsic::fmuladd) |
2298 | return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, |
2299 | CostKind) + |
2300 | thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, |
2301 | CostKind); |
2302 | if (IID == Intrinsic::experimental_constrained_fmuladd) { |
2303 | IntrinsicCostAttributes FMulAttrs( |
2304 | Intrinsic::experimental_constrained_fmul, RetTy, Tys); |
2305 | IntrinsicCostAttributes FAddAttrs( |
2306 | Intrinsic::experimental_constrained_fadd, RetTy, Tys); |
2307 | return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + |
2308 | thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); |
2309 | } |
2310 | |
2311 | // Else, assume that we need to scalarize this intrinsic. For math builtins |
2312 | // this will emit a costly libcall, adding call overhead and spills. Make it |
2313 | // very expensive. |
    if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2315 | // Scalable vectors cannot be scalarized, so return Invalid. |
      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
            return isa<ScalableVectorType>(Ty);
2318 | })) |
2319 | return InstructionCost::getInvalid(); |
2320 | |
2321 | InstructionCost ScalarizationCost = |
2322 | SkipScalarizationCost |
2323 | ? ScalarizationCostPassed |
2324 | : getScalarizationOverhead(RetVTy, /*Insert*/ true, |
2325 | /*Extract*/ false, CostKind); |
2326 | |
      unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2328 | SmallVector<Type *, 4> ScalarTys; |
2329 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
2330 | Type *Ty = Tys[i]; |
2331 | if (Ty->isVectorTy()) |
2332 | Ty = Ty->getScalarType(); |
        ScalarTys.push_back(Ty);
2334 | } |
2335 | IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); |
2336 | InstructionCost ScalarCost = |
2337 | thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
2338 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
        if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
2340 | if (!ICA.skipScalarizationCost()) |
2341 | ScalarizationCost += getScalarizationOverhead( |
2342 | VTy, /*Insert*/ false, /*Extract*/ true, CostKind); |
          ScalarCalls = std::max(ScalarCalls,
                                 cast<FixedVectorType>(VTy)->getNumElements());
2345 | } |
2346 | } |
2347 | return ScalarCalls * ScalarCost + ScalarizationCost; |
2348 | } |
2349 | |
2350 | // This is going to be turned into a library call, make it expensive. |
2351 | return SingleCallCost; |
2352 | } |
2353 | |
  /// Compute the cost of the given call instruction.
2355 | /// |
2356 | /// Compute the cost of calling function F with return type RetTy and |
2357 | /// argument types Tys. F might be nullptr, in this case the cost of an |
2358 | /// arbitrary call with the specified signature will be returned. |
2359 | /// This is used, for instance, when we estimate call of a vector |
2360 | /// counterpart of the given function. |
2361 | /// \param F Called function, might be nullptr. |
2362 | /// \param RetTy Return value types. |
2363 | /// \param Tys Argument types. |
2364 | /// \returns The cost of Call instruction. |
2365 | InstructionCost getCallInstrCost(Function *F, Type *RetTy, |
2366 | ArrayRef<Type *> Tys, |
2367 | TTI::TargetCostKind CostKind) { |
2368 | return 10; |
2369 | } |
2370 | |
2371 | unsigned getNumberOfParts(Type *Tp) { |
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2373 | return LT.first.isValid() ? *LT.first.getValue() : 0; |
2374 | } |
2375 | |
2376 | InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, |
2377 | const SCEV *) { |
2378 | return 0; |
2379 | } |
2380 | |
2381 | /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics. |
  /// We're assuming that reduction operations are performed the following way:
2383 | /// |
2384 | /// %val1 = shufflevector<n x t> %val, <n x t> %undef, |
2385 | /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef> |
2386 | /// \----------------v-------------/ \----------v------------/ |
2387 | /// n/2 elements n/2 elements |
2388 | /// %red1 = op <n x t> %val, <n x t> val1 |
2389 | /// After this operation we have a vector %red1 where only the first n/2 |
2390 | /// elements are meaningful, the second n/2 elements are undefined and can be |
2391 | /// dropped. All other operations are actually working with the vector of |
2392 | /// length n/2, not n, though the real vector length is still n. |
2393 | /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, |
2394 | /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef> |
2395 | /// \----------------v-------------/ \----------v------------/ |
2396 | /// n/4 elements 3*n/4 elements |
2397 | /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of |
2398 | /// length n/2, the resulting vector has length n/4 etc. |
2399 | /// |
2400 | /// The cost model should take into account that the actual length of the |
2401 | /// vector is reduced on each iteration. |
2402 | InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, |
2403 | TTI::TargetCostKind CostKind) { |
2404 | // Targets must implement a default value for the scalable case, since |
2405 | // we don't know how many lanes the vector has. |
    if (isa<ScalableVectorType>(Ty))
2407 | return InstructionCost::getInvalid(); |
2408 | |
2409 | Type *ScalarTy = Ty->getElementType(); |
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2411 | if ((Opcode == Instruction::Or || Opcode == Instruction::And) && |
        ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2413 | NumVecElts >= 2) { |
2414 | // Or reduction for i1 is represented as: |
2415 | // %val = bitcast <ReduxWidth x i1> to iReduxWidth |
2416 | // %res = cmp ne iReduxWidth %val, 0 |
2417 | // And reduction for i1 is represented as: |
2418 | // %val = bitcast <ReduxWidth x i1> to iReduxWidth |
2419 | // %res = cmp eq iReduxWidth %val, 11111 |
      Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2421 | return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, |
2422 | TTI::CastContextHint::None, CostKind) + |
2423 | thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, |
                                         CmpInst::makeCmpResultType(ValTy),
2425 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
2426 | } |
    unsigned NumReduxLevels = Log2_32(NumVecElts);
2428 | InstructionCost ArithCost = 0; |
2429 | InstructionCost ShuffleCost = 0; |
2430 | std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); |
2431 | unsigned LongVectorCount = 0; |
2432 | unsigned MVTLen = |
2433 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; |
2434 | while (NumVecElts > MVTLen) { |
2435 | NumVecElts /= 2; |
      VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2437 | ShuffleCost += |
2438 | thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, |
2439 | CostKind, NumVecElts, SubTy); |
2440 | ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); |
2441 | Ty = SubTy; |
2442 | ++LongVectorCount; |
2443 | } |
2444 | |
2445 | NumReduxLevels -= LongVectorCount; |
2446 | |
2447 | // The minimal length of the vector is limited by the real length of vector |
2448 | // operations performed on the current platform. That's why several final |
2449 | // reduction operations are performed on the vectors with the same |
2450 | // architecture-dependent length. |
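    // E.g. (illustrative) reducing <16 x float> when v4f32 is the widest
    // legal type: two halving steps (16 -> 8 -> 4) are costed in the loop
    // above, and the remaining log2(4) = 2 levels are costed below as
    // single-source permutes plus arithmetic on the legal type.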
2451 | |
2452 | // By default reductions need one shuffle per reduction level. |
2453 | ShuffleCost += |
2454 | NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, |
2455 | std::nullopt, CostKind, 0, Ty); |
2456 | ArithCost += |
2457 | NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); |
2458 | return ShuffleCost + ArithCost + |
2459 | thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, |
2460 | CostKind, 0, nullptr, nullptr); |
2461 | } |
2462 | |
2463 | /// Try to calculate the cost of performing strict (in-order) reductions, |
2464 | /// which involves doing a sequence of floating point additions in lane |
2465 | /// order, starting with an initial value. For example, consider a scalar |
2466 | /// initial value 'InitVal' of type float and a vector of type <4 x float>: |
2467 | /// |
2468 | /// Vector = <float %v0, float %v1, float %v2, float %v3> |
2469 | /// |
2470 | /// %add1 = %InitVal + %v0 |
2471 | /// %add2 = %add1 + %v1 |
2472 | /// %add3 = %add2 + %v2 |
2473 | /// %add4 = %add3 + %v3 |
2474 | /// |
2475 | /// As a simple estimate we can say the cost of such a reduction is 4 times |
2476 | /// the cost of a scalar FP addition. We can only estimate the costs for |
2477 | /// fixed-width vectors here because for scalable vectors we do not know the |
2478 | /// runtime number of operations. |
2479 | InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, |
2480 | TTI::TargetCostKind CostKind) { |
2481 | // Targets must implement a default value for the scalable case, since |
2482 | // we don't know how many lanes the vector has. |
    if (isa<ScalableVectorType>(Ty))
2484 | return InstructionCost::getInvalid(); |
2485 | |
    auto *VTy = cast<FixedVectorType>(Ty);
    InstructionCost ExtractCost = getScalarizationOverhead(
        VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2489 | InstructionCost ArithCost = thisT()->getArithmeticInstrCost( |
2490 | Opcode, VTy->getElementType(), CostKind); |
2491 | ArithCost *= VTy->getNumElements(); |
2492 | |
2493 | return ExtractCost + ArithCost; |
2494 | } |
2495 | |
2496 | InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, |
2497 | std::optional<FastMathFlags> FMF, |
2498 | TTI::TargetCostKind CostKind) { |
    assert(Ty && "Unknown reduction vector type");
2500 | if (TTI::requiresOrderedReduction(FMF)) |
2501 | return getOrderedReductionCost(Opcode, Ty, CostKind); |
2502 | return getTreeReductionCost(Opcode, Ty, CostKind); |
2503 | } |
2504 | |
2505 | /// Try to calculate op costs for min/max reduction operations. |
2506 | /// \param CondTy Conditional type for the Select instruction. |
2507 | InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
2508 | FastMathFlags FMF, |
2509 | TTI::TargetCostKind CostKind) { |
2510 | // Targets must implement a default value for the scalable case, since |
2511 | // we don't know how many lanes the vector has. |
    if (isa<ScalableVectorType>(Ty))
2513 | return InstructionCost::getInvalid(); |
2514 | |
2515 | Type *ScalarTy = Ty->getElementType(); |
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
2518 | InstructionCost MinMaxCost = 0; |
2519 | InstructionCost ShuffleCost = 0; |
2520 | std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); |
2521 | unsigned LongVectorCount = 0; |
2522 | unsigned MVTLen = |
2523 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; |
2524 | while (NumVecElts > MVTLen) { |
2525 | NumVecElts /= 2; |
      auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2527 | |
2528 | ShuffleCost += |
2529 | thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, |
2530 | CostKind, NumVecElts, SubTy); |
2531 | |
2532 | IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF); |
      MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2534 | Ty = SubTy; |
2535 | ++LongVectorCount; |
2536 | } |
2537 | |
2538 | NumReduxLevels -= LongVectorCount; |
2539 | |
2540 | // The minimal length of the vector is limited by the real length of vector |
2541 | // operations performed on the current platform. That's why several final |
    // reduction operations are performed on the vectors with the same
2543 | // architecture-dependent length. |
2544 | ShuffleCost += |
2545 | NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, |
2546 | std::nullopt, CostKind, 0, Ty); |
2547 | IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF); |
    MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2549 | // The last min/max should be in vector registers and we counted it above. |
2550 | // So just need a single extractelement. |
2551 | return ShuffleCost + MinMaxCost + |
2552 | thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, |
2553 | CostKind, 0, nullptr, nullptr); |
2554 | } |
2555 | |
2556 | InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, |
2557 | Type *ResTy, VectorType *Ty, |
2558 | FastMathFlags FMF, |
2559 | TTI::TargetCostKind CostKind) { |
2560 | // Without any native support, this is equivalent to the cost of |
2561 | // vecreduce.opcode(ext(Ty A)). |
    VectorType *ExtTy = VectorType::get(ResTy, Ty);
2563 | InstructionCost RedCost = |
2564 | thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind); |
2565 | InstructionCost ExtCost = thisT()->getCastInstrCost( |
2566 | IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, |
2567 | TTI::CastContextHint::None, CostKind); |
2568 | |
2569 | return RedCost + ExtCost; |
2570 | } |
2571 | |
2572 | InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, |
2573 | VectorType *Ty, |
2574 | TTI::TargetCostKind CostKind) { |
2575 | // Without any native support, this is equivalent to the cost of |
2576 | // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or |
2577 | // vecreduce.add(mul(A, B)). |
    VectorType *ExtTy = VectorType::get(ResTy, Ty);
2579 | InstructionCost RedCost = thisT()->getArithmeticReductionCost( |
2580 | Instruction::Add, ExtTy, std::nullopt, CostKind); |
2581 | InstructionCost ExtCost = thisT()->getCastInstrCost( |
2582 | IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, |
2583 | TTI::CastContextHint::None, CostKind); |
2584 | |
2585 | InstructionCost MulCost = |
2586 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); |
2587 | |
2588 | return RedCost + MulCost + 2 * ExtCost; |
2589 | } |
2590 | |
2591 | InstructionCost getVectorSplitCost() { return 1; } |
2592 | |
2593 | /// @} |
2594 | }; |
2595 | |
2596 | /// Concrete BasicTTIImpl that can be used if no further customization |
2597 | /// is needed. |
2598 | class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { |
2599 | using BaseT = BasicTTIImplBase<BasicTTIImpl>; |
2600 | |
2601 | friend class BasicTTIImplBase<BasicTTIImpl>; |
2602 | |
2603 | const TargetSubtargetInfo *ST; |
2604 | const TargetLoweringBase *TLI; |
2605 | |
2606 | const TargetSubtargetInfo *getST() const { return ST; } |
2607 | const TargetLoweringBase *getTLI() const { return TLI; } |
2608 | |
2609 | public: |
2610 | explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); |
2611 | }; |
2612 | |
2613 | } // end namespace llvm |
2614 | |
2615 | #endif // LLVM_CODEGEN_BASICTTIIMPL_H |
2616 | |