1 | //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This file provides a helper that implements much of the TTI interface in |
11 | /// terms of the target-independent code generator and TargetLowering |
12 | /// interfaces. |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #ifndef LLVM_CODEGEN_BASICTTIIMPL_H |
17 | #define LLVM_CODEGEN_BASICTTIIMPL_H |
18 | |
19 | #include "llvm/ADT/APInt.h" |
20 | #include "llvm/ADT/ArrayRef.h" |
21 | #include "llvm/ADT/BitVector.h" |
22 | #include "llvm/ADT/SmallPtrSet.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/Analysis/LoopInfo.h" |
25 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
26 | #include "llvm/Analysis/TargetTransformInfo.h" |
27 | #include "llvm/Analysis/TargetTransformInfoImpl.h" |
28 | #include "llvm/CodeGen/ISDOpcodes.h" |
29 | #include "llvm/CodeGen/TargetLowering.h" |
30 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
31 | #include "llvm/CodeGen/ValueTypes.h" |
32 | #include "llvm/CodeGenTypes/MachineValueType.h" |
33 | #include "llvm/IR/BasicBlock.h" |
34 | #include "llvm/IR/Constant.h" |
35 | #include "llvm/IR/Constants.h" |
36 | #include "llvm/IR/DataLayout.h" |
37 | #include "llvm/IR/DerivedTypes.h" |
38 | #include "llvm/IR/InstrTypes.h" |
39 | #include "llvm/IR/Instruction.h" |
40 | #include "llvm/IR/Instructions.h" |
41 | #include "llvm/IR/Intrinsics.h" |
42 | #include "llvm/IR/Operator.h" |
43 | #include "llvm/IR/Type.h" |
44 | #include "llvm/IR/Value.h" |
45 | #include "llvm/Support/Casting.h" |
46 | #include "llvm/Support/CommandLine.h" |
47 | #include "llvm/Support/ErrorHandling.h" |
48 | #include "llvm/Support/MathExtras.h" |
49 | #include "llvm/Target/TargetMachine.h" |
50 | #include "llvm/Target/TargetOptions.h" |
51 | #include <algorithm> |
52 | #include <cassert> |
53 | #include <cstdint> |
54 | #include <limits> |
55 | #include <optional> |
56 | #include <utility> |
57 | |
58 | namespace llvm { |
59 | |
60 | class Function; |
61 | class GlobalValue; |
62 | class LLVMContext; |
63 | class ScalarEvolution; |
64 | class SCEV; |
65 | class TargetMachine; |
66 | |
67 | extern cl::opt<unsigned> PartialUnrollingThreshold; |
68 | |
69 | /// Base class which can be used to help build a TTI implementation. |
70 | /// |
71 | /// This class provides as much implementation of the TTI interface as is |
72 | /// possible using the target independent parts of the code generator. |
73 | /// |
74 | /// In order to subclass it, your class must implement a getST() method to |
75 | /// return the subtarget, and a getTLI() method to return the target lowering. |
76 | /// We need these methods implemented in the derived class so that this class |
77 | /// doesn't have to duplicate storage for them. |
78 | template <typename T> |
79 | class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { |
80 | private: |
81 | using BaseT = TargetTransformInfoImplCRTPBase<T>; |
82 | using TTI = TargetTransformInfo; |
83 | |
84 | /// Helper function to access this as a T. |
85 | T *thisT() { return static_cast<T *>(this); } |
86 | |
87 | /// Estimate a cost of Broadcast as an extract and sequence of insert |
88 | /// operations. |
89 | InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy, |
90 | TTI::TargetCostKind CostKind) { |
91 | InstructionCost Cost = 0; |
92 | // Broadcast cost is equal to the cost of extracting the zero'th element |
93 | // plus the cost of inserting it into every element of the result vector. |
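// For example (illustrative): broadcasting a <4 x i32> under this model is
// one extractelement plus four insertelements.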
94 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, |
95 | CostKind, 0, nullptr, nullptr); |
96 | |
97 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { |
98 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, |
99 | CostKind, i, nullptr, nullptr); |
100 | } |
101 | return Cost; |
102 | } |
103 | |
104 | /// Estimate a cost of shuffle as a sequence of extract and insert |
105 | /// operations. |
106 | InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy, |
107 | TTI::TargetCostKind CostKind) { |
108 | InstructionCost Cost = 0; |
// Shuffle cost is equal to the cost of extracting each element from its
// argument plus the cost of inserting it into the result vector.

// e.g. a <4 x float> shuffle with a mask of <0,5,2,7> needs to extract from
// index 0 of the first vector, index 1 of the second vector, index 2 of the
// first vector and finally index 3 of the second vector, and insert them at
// indices <0,1,2,3> of the result vector.
116 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { |
117 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, |
118 | CostKind, i, nullptr, nullptr); |
119 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, |
120 | CostKind, i, nullptr, nullptr); |
121 | } |
122 | return Cost; |
123 | } |
124 | |
125 | /// Estimate a cost of subvector extraction as a sequence of extract and |
126 | /// insert operations. |
InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
TTI::TargetCostKind CostKind,
int Index,
FixedVectorType *SubVTy) {
assert(VTy && SubVTy &&
"Can only extract subvectors from vectors");
int NumSubElts = SubVTy->getNumElements();
assert((!isa<FixedVectorType>(VTy) ||
(Index + NumSubElts) <=
(int)cast<FixedVectorType>(VTy)->getNumElements()) &&
"SK_ExtractSubvector index out of range");
138 | |
139 | InstructionCost Cost = 0; |
// Subvector extraction cost is equal to the cost of extracting each element
// from the source vector type plus the cost of inserting them into the
// result subvector type.
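// For example (illustrative): extracting a <2 x i32> subvector at Index 2
// from an <8 x i32> source costs two extracts (indices 2 and 3) plus two
// inserts (indices 0 and 1 of the result).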
143 | for (int i = 0; i != NumSubElts; ++i) { |
144 | Cost += |
145 | thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, |
146 | CostKind, i + Index, nullptr, nullptr); |
147 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, |
148 | CostKind, i, nullptr, nullptr); |
149 | } |
150 | return Cost; |
151 | } |
152 | |
153 | /// Estimate a cost of subvector insertion as a sequence of extract and |
154 | /// insert operations. |
155 | InstructionCost getInsertSubvectorOverhead(VectorType *VTy, |
156 | TTI::TargetCostKind CostKind, |
157 | int Index, |
158 | FixedVectorType *SubVTy) { |
assert(VTy && SubVTy &&
"Can only insert subvectors into vectors");
int NumSubElts = SubVTy->getNumElements();
assert((!isa<FixedVectorType>(VTy) ||
(Index + NumSubElts) <=
(int)cast<FixedVectorType>(VTy)->getNumElements()) &&
"SK_InsertSubvector index out of range");
166 | |
167 | InstructionCost Cost = 0; |
// Subvector insertion cost is equal to the cost of extracting each element
// from the subvector type plus the cost of inserting them into the result
// vector type.
171 | for (int i = 0; i != NumSubElts; ++i) { |
172 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, |
173 | CostKind, i, nullptr, nullptr); |
174 | Cost += |
175 | thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind, |
176 | i + Index, nullptr, nullptr); |
177 | } |
178 | return Cost; |
179 | } |
180 | |
181 | /// Local query method delegates up to T which *must* implement this! |
182 | const TargetSubtargetInfo *getST() const { |
183 | return static_cast<const T *>(this)->getST(); |
184 | } |
185 | |
186 | /// Local query method delegates up to T which *must* implement this! |
187 | const TargetLoweringBase *getTLI() const { |
188 | return static_cast<const T *>(this)->getTLI(); |
189 | } |
190 | |
191 | static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { |
192 | switch (M) { |
193 | case TTI::MIM_Unindexed: |
194 | return ISD::UNINDEXED; |
195 | case TTI::MIM_PreInc: |
196 | return ISD::PRE_INC; |
197 | case TTI::MIM_PreDec: |
198 | return ISD::PRE_DEC; |
199 | case TTI::MIM_PostInc: |
200 | return ISD::POST_INC; |
201 | case TTI::MIM_PostDec: |
202 | return ISD::POST_DEC; |
203 | } |
204 | llvm_unreachable("Unexpected MemIndexedMode" ); |
205 | } |
206 | |
207 | InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, |
208 | Align Alignment, |
209 | bool VariableMask, |
210 | bool IsGatherScatter, |
211 | TTI::TargetCostKind CostKind) { |
212 | // We cannot scalarize scalable vectors, so return Invalid. |
if (isa<ScalableVectorType>(DataTy))
return InstructionCost::getInvalid();

auto *VT = cast<FixedVectorType>(DataTy);
217 | // Assume the target does not have support for gather/scatter operations |
218 | // and provide a rough estimate. |
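// For example (rough illustration): a variable-mask gather of <4 x i32> is
// modeled below as 4 address extracts + 4 scalar loads, a 4-element repack
// of the result, and 4 condition extracts each with a branch and a PHI.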
219 | // |
220 | // First, compute the cost of the individual memory operations. |
InstructionCost AddrExtractCost =
IsGatherScatter
? getVectorInstrCost(Instruction::ExtractElement,
FixedVectorType::get(
PointerType::get(VT->getElementType(), 0),
VT->getNumElements()),
CostKind, -1, nullptr, nullptr)
: 0;
InstructionCost LoadCost =
VT->getNumElements() *
(AddrExtractCost +
getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
233 | |
234 | // Next, compute the cost of packing the result in a vector. |
235 | InstructionCost PackingCost = |
236 | getScalarizationOverhead(VT, Opcode != Instruction::Store, |
237 | Opcode == Instruction::Store, CostKind); |
238 | |
239 | InstructionCost ConditionalCost = 0; |
240 | if (VariableMask) { |
// Compute the cost of conditionally executing the memory operations with
// variable masks. This includes extracting the individual conditions, the
// branches, and the PHIs needed to combine the results.
// NOTE: Estimating the cost of conditionally executing the memory
// operations accurately is quite difficult and the current solution
// provides a very rough estimate only.
ConditionalCost =
VT->getNumElements() *
(getVectorInstrCost(
Instruction::ExtractElement,
FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
VT->getNumElements()),
CostKind, -1, nullptr, nullptr) +
getCFInstrCost(Instruction::Br, CostKind) +
getCFInstrCost(Instruction::PHI, CostKind));
256 | } |
257 | |
258 | return LoadCost + PackingCost + ConditionalCost; |
259 | } |
260 | |
261 | protected: |
262 | explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) |
263 | : BaseT(DL) {} |
264 | virtual ~BasicTTIImplBase() = default; |
265 | |
266 | using TargetTransformInfoImplBase::DL; |
267 | |
268 | public: |
269 | /// \name Scalar TTI Implementations |
270 | /// @{ |
271 | bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, |
272 | unsigned AddressSpace, Align Alignment, |
273 | unsigned *Fast) const { |
274 | EVT E = EVT::getIntegerVT(Context, BitWidth); |
275 | return getTLI()->allowsMisalignedMemoryAccesses( |
276 | E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); |
277 | } |
278 | |
279 | bool hasBranchDivergence(const Function *F = nullptr) { return false; } |
280 | |
281 | bool isSourceOfDivergence(const Value *V) { return false; } |
282 | |
283 | bool isAlwaysUniform(const Value *V) { return false; } |
284 | |
285 | bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { |
286 | return false; |
287 | } |
288 | |
289 | bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const { |
290 | return true; |
291 | } |
292 | |
293 | unsigned getFlatAddressSpace() { |
294 | // Return an invalid address space. |
295 | return -1; |
296 | } |
297 | |
298 | bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
299 | Intrinsic::ID IID) const { |
300 | return false; |
301 | } |
302 | |
303 | bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { |
304 | return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); |
305 | } |
306 | |
307 | unsigned getAssumedAddrSpace(const Value *V) const { |
308 | return getTLI()->getTargetMachine().getAssumedAddrSpace(V); |
309 | } |
310 | |
311 | bool isSingleThreaded() const { |
312 | return getTLI()->getTargetMachine().Options.ThreadModel == |
313 | ThreadModel::Single; |
314 | } |
315 | |
316 | std::pair<const Value *, unsigned> |
317 | getPredicatedAddrSpace(const Value *V) const { |
318 | return getTLI()->getTargetMachine().getPredicatedAddrSpace(V); |
319 | } |
320 | |
321 | Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, |
322 | Value *NewV) const { |
323 | return nullptr; |
324 | } |
325 | |
326 | bool isLegalAddImmediate(int64_t imm) { |
327 | return getTLI()->isLegalAddImmediate(imm); |
328 | } |
329 | |
330 | bool isLegalICmpImmediate(int64_t imm) { |
331 | return getTLI()->isLegalICmpImmediate(imm); |
332 | } |
333 | |
334 | bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, |
335 | bool HasBaseReg, int64_t Scale, |
336 | unsigned AddrSpace, Instruction *I = nullptr) { |
337 | TargetLoweringBase::AddrMode AM; |
338 | AM.BaseGV = BaseGV; |
339 | AM.BaseOffs = BaseOffset; |
340 | AM.HasBaseReg = HasBaseReg; |
341 | AM.Scale = Scale; |
342 | return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); |
343 | } |
344 | |
345 | int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) { |
346 | return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset); |
347 | } |
348 | |
349 | unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, |
350 | Type *ScalarValTy) const { |
351 | auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) { |
auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
EVT VT = getTLI()->getValueType(DL, SrcTy);
if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
getTLI()->isOperationCustom(ISD::STORE, VT))
return true;

EVT ValVT =
getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
EVT LegalizedVT =
getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
363 | }; |
364 | while (VF > 2 && IsSupportedByTarget(VF)) |
365 | VF /= 2; |
366 | return VF; |
367 | } |
368 | |
369 | bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, |
370 | const DataLayout &DL) const { |
371 | EVT VT = getTLI()->getValueType(DL, Ty); |
372 | return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); |
373 | } |
374 | |
375 | bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, |
376 | const DataLayout &DL) const { |
377 | EVT VT = getTLI()->getValueType(DL, Ty); |
378 | return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); |
379 | } |
380 | |
381 | bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { |
382 | return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); |
383 | } |
384 | |
385 | bool isNumRegsMajorCostOfLSR() { |
386 | return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); |
387 | } |
388 | |
389 | bool shouldFoldTerminatingConditionAfterLSR() const { |
390 | return TargetTransformInfoImplBase:: |
391 | shouldFoldTerminatingConditionAfterLSR(); |
392 | } |
393 | |
394 | bool isProfitableLSRChainElement(Instruction *I) { |
395 | return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); |
396 | } |
397 | |
398 | InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
399 | int64_t BaseOffset, bool HasBaseReg, |
400 | int64_t Scale, unsigned AddrSpace) { |
401 | TargetLoweringBase::AddrMode AM; |
402 | AM.BaseGV = BaseGV; |
403 | AM.BaseOffs = BaseOffset; |
404 | AM.HasBaseReg = HasBaseReg; |
405 | AM.Scale = Scale; |
406 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) |
407 | return 0; |
408 | return -1; |
409 | } |
410 | |
411 | bool isTruncateFree(Type *Ty1, Type *Ty2) { |
412 | return getTLI()->isTruncateFree(Ty1, Ty2); |
413 | } |
414 | |
415 | bool isProfitableToHoist(Instruction *I) { |
416 | return getTLI()->isProfitableToHoist(I); |
417 | } |
418 | |
419 | bool useAA() const { return getST()->useAA(); } |
420 | |
421 | bool isTypeLegal(Type *Ty) { |
422 | EVT VT = getTLI()->getValueType(DL, Ty); |
423 | return getTLI()->isTypeLegal(VT); |
424 | } |
425 | |
426 | unsigned getRegUsageForType(Type *Ty) { |
427 | EVT ETy = getTLI()->getValueType(DL, Ty); |
428 | return getTLI()->getNumRegisters(Ty->getContext(), ETy); |
429 | } |
430 | |
431 | InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, |
432 | ArrayRef<const Value *> Operands, Type *AccessType, |
433 | TTI::TargetCostKind CostKind) { |
434 | return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind); |
435 | } |
436 | |
437 | unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, |
438 | unsigned &JumpTableSize, |
439 | ProfileSummaryInfo *PSI, |
440 | BlockFrequencyInfo *BFI) { |
/// Try to find the estimated number of clusters. Note that the number of
/// clusters identified in this function could be different from the actual
/// numbers found in lowering. This function ignores switches that are
/// lowered with a mix of jump table / bit test / BTree. It was initially
/// intended for estimating the cost of a switch in the inline cost
/// heuristic, but it is a generic cost model that can be used in other
/// places too (e.g., in loop unrolling).
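/// For example (illustrative): a switch over cases {0, 2, 4, 6} that is
/// dense enough for a jump table would be reported as a single cluster with
/// JumpTableSize == 7 (the case range); otherwise the raw case count N is
/// returned.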
448 | unsigned N = SI.getNumCases(); |
449 | const TargetLoweringBase *TLI = getTLI(); |
450 | const DataLayout &DL = this->getDataLayout(); |
451 | |
452 | JumpTableSize = 0; |
bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

// Early exit if both a jump table and bit test are not allowed.
if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
457 | return N; |
458 | |
459 | APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); |
460 | APInt MinCaseVal = MaxCaseVal; |
461 | for (auto CI : SI.cases()) { |
462 | const APInt &CaseVal = CI.getCaseValue()->getValue(); |
if (CaseVal.sgt(MaxCaseVal))
MaxCaseVal = CaseVal;
if (CaseVal.slt(MinCaseVal))
MinCaseVal = CaseVal;
}

// Check if suitable for a bit test
if (N <= DL.getIndexSizeInBits(0u)) {
SmallPtrSet<const BasicBlock *, 4> Dests;
for (auto I : SI.cases())
Dests.insert(I.getCaseSuccessor());

if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
DL))
477 | return 1; |
478 | } |
479 | |
480 | // Check if suitable for a jump table. |
481 | if (IsJTAllowed) { |
482 | if (N < 2 || N < TLI->getMinimumJumpTableEntries()) |
483 | return N; |
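// For example (illustrative): cases {10, 12, 20} give Range == 11.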
uint64_t Range =
(MaxCaseVal - MinCaseVal)
.getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
// Check whether a range of clusters is dense enough for a jump table
if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
489 | JumpTableSize = Range; |
490 | return 1; |
491 | } |
492 | } |
493 | return N; |
494 | } |
495 | |
496 | bool shouldBuildLookupTables() { |
497 | const TargetLoweringBase *TLI = getTLI(); |
return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
500 | } |
501 | |
502 | bool shouldBuildRelLookupTables() const { |
503 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
504 | // If non-PIC mode, do not generate a relative lookup table. |
505 | if (!TM.isPositionIndependent()) |
506 | return false; |
507 | |
/// Relative lookup table entries consist of 32-bit offsets.
/// Do not generate relative lookup tables for medium and large code models
/// in 64-bit architectures where 32-bit offsets might not be enough.
511 | if (TM.getCodeModel() == CodeModel::Medium || |
512 | TM.getCodeModel() == CodeModel::Large) |
513 | return false; |
514 | |
515 | Triple TargetTriple = TM.getTargetTriple(); |
516 | if (!TargetTriple.isArch64Bit()) |
517 | return false; |
518 | |
519 | // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it |
520 | // there. |
521 | if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin()) |
522 | return false; |
523 | |
524 | return true; |
525 | } |
526 | |
527 | bool haveFastSqrt(Type *Ty) { |
528 | const TargetLoweringBase *TLI = getTLI(); |
529 | EVT VT = TLI->getValueType(DL, Ty); |
530 | return TLI->isTypeLegal(VT) && |
TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
532 | } |
533 | |
534 | bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { |
535 | return true; |
536 | } |
537 | |
538 | InstructionCost getFPOpCost(Type *Ty) { |
539 | // Check whether FADD is available, as a proxy for floating-point in |
540 | // general. |
541 | const TargetLoweringBase *TLI = getTLI(); |
542 | EVT VT = TLI->getValueType(DL, Ty); |
if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
544 | return TargetTransformInfo::TCC_Basic; |
545 | return TargetTransformInfo::TCC_Expensive; |
546 | } |
547 | |
548 | bool preferToKeepConstantsAttached(const Instruction &Inst, |
549 | const Function &Fn) const { |
550 | switch (Inst.getOpcode()) { |
551 | default: |
552 | break; |
553 | case Instruction::SDiv: |
554 | case Instruction::SRem: |
555 | case Instruction::UDiv: |
556 | case Instruction::URem: { |
if (!isa<ConstantInt>(Inst.getOperand(1)))
558 | return false; |
559 | EVT VT = getTLI()->getValueType(DL, Inst.getType()); |
560 | return !getTLI()->isIntDivCheap(VT, Fn.getAttributes()); |
561 | } |
562 | }; |
563 | |
564 | return false; |
565 | } |
566 | |
567 | unsigned getInliningThresholdMultiplier() const { return 1; } |
568 | unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } |
569 | unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const { |
570 | return 0; |
571 | } |
572 | |
573 | int getInlinerVectorBonusPercent() const { return 150; } |
574 | |
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) {
578 | // This unrolling functionality is target independent, but to provide some |
579 | // motivation for its intended use, for x86: |
580 | |
581 | // According to the Intel 64 and IA-32 Architectures Optimization Reference |
582 | // Manual, Intel Core models and later have a loop stream detector (and |
583 | // associated uop queue) that can benefit from partial unrolling. |
584 | // The relevant requirements are: |
585 | // - The loop must have no more than 4 (8 for Nehalem and later) branches |
586 | // taken, and none of them may be calls. |
587 | // - The loop can have no more than 18 (28 for Nehalem and later) uops. |
588 | |
589 | // According to the Software Optimization Guide for AMD Family 15h |
590 | // Processors, models 30h-4fh (Steamroller and later) have a loop predictor |
591 | // and loop buffer which can benefit from partial unrolling. |
592 | // The relevant requirements are: |
593 | // - The loop must have fewer than 16 branches |
594 | // - The loop must have less than 40 uops in all executed loop branches |
595 | |
596 | // The number of taken branches in a loop is hard to estimate here, and |
597 | // benchmarking has revealed that it is better not to be conservative when |
598 | // estimating the branch count. As a result, we'll ignore the branch limits |
599 | // until someone finds a case where it matters in practice. |
600 | |
601 | unsigned MaxOps; |
602 | const TargetSubtargetInfo *ST = getST(); |
603 | if (PartialUnrollingThreshold.getNumOccurrences() > 0) |
604 | MaxOps = PartialUnrollingThreshold; |
605 | else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) |
606 | MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; |
607 | else |
608 | return; |
609 | |
610 | // Scan the loop: don't unroll loops with calls. |
611 | for (BasicBlock *BB : L->blocks()) { |
612 | for (Instruction &I : *BB) { |
if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
if (!thisT()->isLoweredToCall(F))
continue;
}

if (ORE) {
ORE->emit([&]() {
return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
L->getHeader())
<< "advising against unrolling the loop because it "
"contains a "
<< ore::NV("Call", &I);
626 | }); |
627 | } |
628 | return; |
629 | } |
630 | } |
631 | } |
632 | |
633 | // Enable runtime and partial unrolling up to the specified size. |
634 | // Enable using trip count upper bound to unroll loops. |
635 | UP.Partial = UP.Runtime = UP.UpperBound = true; |
636 | UP.PartialThreshold = MaxOps; |
637 | |
638 | // Avoid unrolling when optimizing for size. |
639 | UP.OptSizeThreshold = 0; |
640 | UP.PartialOptSizeThreshold = 0; |
641 | |
642 | // Set number of instructions optimized when "back edge" |
643 | // becomes "fall through" to default value of 2. |
644 | UP.BEInsns = 2; |
645 | } |
646 | |
647 | void getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
648 | TTI::PeelingPreferences &PP) { |
649 | PP.PeelCount = 0; |
650 | PP.AllowPeeling = true; |
651 | PP.AllowLoopNestsPeeling = false; |
652 | PP.PeelProfiledIterations = true; |
653 | } |
654 | |
655 | bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, |
656 | AssumptionCache &AC, |
657 | TargetLibraryInfo *LibInfo, |
658 | HardwareLoopInfo &HWLoopInfo) { |
659 | return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); |
660 | } |
661 | |
662 | bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) { |
663 | return BaseT::preferPredicateOverEpilogue(TFI); |
664 | } |
665 | |
666 | TailFoldingStyle |
667 | getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) { |
668 | return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow); |
669 | } |
670 | |
671 | std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, |
672 | IntrinsicInst &II) { |
673 | return BaseT::instCombineIntrinsic(IC, II); |
674 | } |
675 | |
676 | std::optional<Value *> |
677 | simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, |
678 | APInt DemandedMask, KnownBits &Known, |
679 | bool &KnownBitsComputed) { |
680 | return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, |
681 | KnownBitsComputed); |
682 | } |
683 | |
684 | std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( |
685 | InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, |
686 | APInt &UndefElts2, APInt &UndefElts3, |
687 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
688 | SimplifyAndSetOp) { |
689 | return BaseT::simplifyDemandedVectorEltsIntrinsic( |
690 | IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, |
691 | SimplifyAndSetOp); |
692 | } |
693 | |
694 | virtual std::optional<unsigned> |
695 | getCacheSize(TargetTransformInfo::CacheLevel Level) const { |
696 | return std::optional<unsigned>( |
697 | getST()->getCacheSize(static_cast<unsigned>(Level))); |
698 | } |
699 | |
700 | virtual std::optional<unsigned> |
701 | getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { |
702 | std::optional<unsigned> TargetResult = |
703 | getST()->getCacheAssociativity(static_cast<unsigned>(Level)); |
704 | |
705 | if (TargetResult) |
706 | return TargetResult; |
707 | |
708 | return BaseT::getCacheAssociativity(Level); |
709 | } |
710 | |
711 | virtual unsigned getCacheLineSize() const { |
712 | return getST()->getCacheLineSize(); |
713 | } |
714 | |
715 | virtual unsigned getPrefetchDistance() const { |
716 | return getST()->getPrefetchDistance(); |
717 | } |
718 | |
719 | virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, |
720 | unsigned NumStridedMemAccesses, |
721 | unsigned NumPrefetches, |
722 | bool HasCall) const { |
723 | return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, |
724 | NumPrefetches, HasCall); |
725 | } |
726 | |
727 | virtual unsigned getMaxPrefetchIterationsAhead() const { |
728 | return getST()->getMaxPrefetchIterationsAhead(); |
729 | } |
730 | |
731 | virtual bool enableWritePrefetching() const { |
732 | return getST()->enableWritePrefetching(); |
733 | } |
734 | |
735 | virtual bool shouldPrefetchAddressSpace(unsigned AS) const { |
736 | return getST()->shouldPrefetchAddressSpace(AS); |
737 | } |
738 | |
739 | /// @} |
740 | |
741 | /// \name Vector TTI Implementations |
742 | /// @{ |
743 | |
744 | TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
return TypeSize::getFixed(32);
746 | } |
747 | |
748 | std::optional<unsigned> getMaxVScale() const { return std::nullopt; } |
749 | std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; } |
750 | bool isVScaleKnownToBeAPowerOfTwo() const { return false; } |
751 | |
752 | /// Estimate the overhead of scalarizing an instruction. Insert and Extract |
753 | /// are set if the demanded result elements need to be inserted and/or |
754 | /// extracted from vectors. |
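/// For example (illustrative): for a <4 x i32> vector with Insert == true,
/// Extract == false, and DemandedElts == 0b0101, the estimate is the cost
/// of two insertelement operations (elements 0 and 2).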
InstructionCost getScalarizationOverhead(VectorType *InTy,
const APInt &DemandedElts,
bool Insert, bool Extract,
TTI::TargetCostKind CostKind) {
/// FIXME: a bitfield is not a reasonable abstraction for talking about
/// which elements are needed from a scalable vector
if (isa<ScalableVectorType>(InTy))
return InstructionCost::getInvalid();
auto *Ty = cast<FixedVectorType>(InTy);

assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
"Vector size mismatch");
767 | |
768 | InstructionCost Cost = 0; |
769 | |
770 | for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { |
771 | if (!DemandedElts[i]) |
772 | continue; |
773 | if (Insert) |
774 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, |
775 | CostKind, i, nullptr, nullptr); |
776 | if (Extract) |
777 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, |
778 | CostKind, i, nullptr, nullptr); |
779 | } |
780 | |
781 | return Cost; |
782 | } |
783 | |
784 | /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. |
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
bool Extract,
TTI::TargetCostKind CostKind) {
if (isa<ScalableVectorType>(InTy))
return InstructionCost::getInvalid();
auto *Ty = cast<FixedVectorType>(InTy);

APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
793 | return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract, |
794 | CostKind); |
795 | } |
796 | |
/// Estimate the overhead of scalarizing an instruction's unique
/// non-constant operands. The (potentially vector) types to use for each
/// argument are passed in via Tys.
800 | InstructionCost |
801 | getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, |
802 | ArrayRef<Type *> Tys, |
803 | TTI::TargetCostKind CostKind) { |
assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
805 | |
806 | InstructionCost Cost = 0; |
807 | SmallPtrSet<const Value*, 4> UniqueOperands; |
808 | for (int I = 0, E = Args.size(); I != E; I++) { |
809 | // Disregard things like metadata arguments. |
810 | const Value *A = Args[I]; |
811 | Type *Ty = Tys[I]; |
812 | if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && |
813 | !Ty->isPtrOrPtrVectorTy()) |
814 | continue; |
815 | |
if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
if (auto *VecTy = dyn_cast<VectorType>(Ty))
818 | Cost += getScalarizationOverhead(VecTy, /*Insert*/ false, |
819 | /*Extract*/ true, CostKind); |
820 | } |
821 | } |
822 | |
823 | return Cost; |
824 | } |
825 | |
826 | /// Estimate the overhead of scalarizing the inputs and outputs of an |
827 | /// instruction, with return type RetTy and arguments Args of type Tys. If |
828 | /// Args are unknown (empty), then the cost associated with one argument is |
829 | /// added as a heuristic. |
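/// For example (illustrative): for a <4 x float> return type with no known
/// arguments, the estimate is four inserts for the result plus four
/// extracts for the single assumed vector argument.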
830 | InstructionCost getScalarizationOverhead(VectorType *RetTy, |
831 | ArrayRef<const Value *> Args, |
832 | ArrayRef<Type *> Tys, |
833 | TTI::TargetCostKind CostKind) { |
834 | InstructionCost Cost = getScalarizationOverhead( |
835 | RetTy, /*Insert*/ true, /*Extract*/ false, CostKind); |
836 | if (!Args.empty()) |
837 | Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind); |
838 | else |
839 | // When no information on arguments is provided, we add the cost |
840 | // associated with one argument as a heuristic. |
841 | Cost += getScalarizationOverhead(RetTy, /*Insert*/ false, |
842 | /*Extract*/ true, CostKind); |
843 | |
844 | return Cost; |
845 | } |
846 | |
847 | /// Estimate the cost of type-legalization and the legalized type. |
848 | std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const { |
849 | LLVMContext &C = Ty->getContext(); |
850 | EVT MTy = getTLI()->getValueType(DL, Ty); |
851 | |
852 | InstructionCost Cost = 1; |
853 | // We keep legalizing the type until we find a legal kind. We assume that |
854 | // the only operation that costs anything is the split. After splitting |
855 | // we need to handle two types. |
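// For example (illustrative): on a target whose widest legal vector is
// v4i32, legalizing v16i32 splits twice, so this returns {4, v4i32} (the
// cost doubles on each split: 1 -> 2 -> 4).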
856 | while (true) { |
857 | TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy); |
858 | |
859 | if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) { |
860 | // Ensure we return a sensible simple VT here, since many callers of |
861 | // this function require it. |
862 | MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64; |
return std::make_pair(InstructionCost::getInvalid(), VT);
}

if (LK.first == TargetLoweringBase::TypeLegal)
return std::make_pair(Cost, MTy.getSimpleVT());

if (LK.first == TargetLoweringBase::TypeSplitVector ||
LK.first == TargetLoweringBase::TypeExpandInteger)
Cost *= 2;

// Do not loop with f128 type.
if (MTy == LK.second)
return std::make_pair(Cost, MTy.getSimpleVT());
876 | |
877 | // Keep legalizing the type. |
878 | MTy = LK.second; |
879 | } |
880 | } |
881 | |
882 | unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; } |
883 | |
884 | InstructionCost getArithmeticInstrCost( |
885 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
888 | ArrayRef<const Value *> Args = ArrayRef<const Value *>(), |
889 | const Instruction *CxtI = nullptr) { |
890 | // Check if any of the operands are vector operands. |
891 | const TargetLoweringBase *TLI = getTLI(); |
892 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
893 | assert(ISD && "Invalid opcode" ); |
894 | |
895 | // TODO: Handle more cost kinds. |
896 | if (CostKind != TTI::TCK_RecipThroughput) |
897 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, |
898 | Opd1Info, Opd2Info, |
899 | Args, CxtI); |
900 | |
901 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
902 | |
903 | bool IsFloat = Ty->isFPOrFPVectorTy(); |
904 | // Assume that floating point arithmetic operations cost twice as much as |
905 | // integer operations. |
906 | InstructionCost OpCost = (IsFloat ? 2 : 1); |
907 | |
if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
909 | // The operation is legal. Assume it costs 1. |
910 | // TODO: Once we have extract/insert subvector cost we need to use them. |
911 | return LT.first * OpCost; |
912 | } |
913 | |
if (!TLI->isOperationExpand(ISD, LT.second)) {
915 | // If the operation is custom lowered, then assume that the code is twice |
916 | // as expensive. |
917 | return LT.first * 2 * OpCost; |
918 | } |
919 | |
920 | // An 'Expand' of URem and SRem is special because it may default |
921 | // to expanding the operation into a sequence of sub-operations |
922 | // i.e. X % Y -> X-(X/Y)*Y. |
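// For example (illustrative): if only udiv is legal, 'urem x, y' is costed
// as udiv + mul + sub.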
923 | if (ISD == ISD::UREM || ISD == ISD::SREM) { |
924 | bool IsSigned = ISD == ISD::SREM; |
if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
LT.second) ||
TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
LT.second)) {
929 | unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv; |
930 | InstructionCost DivCost = thisT()->getArithmeticInstrCost( |
931 | DivOpc, Ty, CostKind, Opd1Info, Opd2Info); |
932 | InstructionCost MulCost = |
933 | thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); |
934 | InstructionCost SubCost = |
935 | thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); |
936 | return DivCost + MulCost + SubCost; |
937 | } |
938 | } |
939 | |
940 | // We cannot scalarize scalable vectors, so return Invalid. |
if (isa<ScalableVectorType>(Ty))
942 | return InstructionCost::getInvalid(); |
943 | |
944 | // Else, assume that we need to scalarize this op. |
945 | // TODO: If one of the types get legalized by splitting, handle this |
946 | // similarly to what getCastInstrCost() does. |
if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
948 | InstructionCost Cost = thisT()->getArithmeticInstrCost( |
949 | Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, |
950 | Args, CxtI); |
951 | // Return the cost of multiple scalar invocation plus the cost of |
952 | // inserting and extracting the values. |
953 | SmallVector<Type *> Tys(Args.size(), Ty); |
954 | return getScalarizationOverhead(VTy, Args, Tys, CostKind) + |
955 | VTy->getNumElements() * Cost; |
956 | } |
957 | |
958 | // We don't know anything about this scalar instruction. |
959 | return OpCost; |
960 | } |
961 | |
962 | TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, |
963 | ArrayRef<int> Mask, |
964 | VectorType *Ty, int &Index, |
965 | VectorType *&SubTy) const { |
966 | if (Mask.empty()) |
967 | return Kind; |
968 | int NumSrcElts = Ty->getElementCount().getKnownMinValue(); |
969 | switch (Kind) { |
970 | case TTI::SK_PermuteSingleSrc: |
971 | if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts)) |
972 | return TTI::SK_Reverse; |
973 | if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts)) |
974 | return TTI::SK_Broadcast; |
975 | if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) && |
976 | (Index + Mask.size()) <= (size_t)NumSrcElts) { |
SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
978 | return TTI::SK_ExtractSubvector; |
979 | } |
980 | break; |
981 | case TTI::SK_PermuteTwoSrc: { |
982 | int NumSubElts; |
983 | if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( |
984 | Mask, NumSrcElts, NumSubElts, Index)) { |
985 | if (Index + NumSubElts > NumSrcElts) |
986 | return Kind; |
SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
988 | return TTI::SK_InsertSubvector; |
989 | } |
990 | if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts)) |
991 | return TTI::SK_Select; |
992 | if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts)) |
993 | return TTI::SK_Transpose; |
994 | if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index)) |
995 | return TTI::SK_Splice; |
996 | break; |
997 | } |
998 | case TTI::SK_Select: |
999 | case TTI::SK_Reverse: |
1000 | case TTI::SK_Broadcast: |
1001 | case TTI::SK_Transpose: |
1002 | case TTI::SK_InsertSubvector: |
1003 | case TTI::SK_ExtractSubvector: |
1004 | case TTI::SK_Splice: |
1005 | break; |
1006 | } |
1007 | return Kind; |
1008 | } |
1009 | |
1010 | InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, |
1011 | ArrayRef<int> Mask, |
1012 | TTI::TargetCostKind CostKind, int Index, |
1013 | VectorType *SubTp, |
1014 | ArrayRef<const Value *> Args = std::nullopt) { |
switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
case TTI::SK_Broadcast:
if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
return getBroadcastShuffleOverhead(FVT, CostKind);
return InstructionCost::getInvalid();
case TTI::SK_Select:
case TTI::SK_Splice:
case TTI::SK_Reverse:
case TTI::SK_Transpose:
case TTI::SK_PermuteSingleSrc:
case TTI::SK_PermuteTwoSrc:
if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
return getPermuteShuffleOverhead(FVT, CostKind);
return InstructionCost::getInvalid();
case TTI::SK_ExtractSubvector:
return getExtractSubvectorOverhead(Tp, CostKind, Index,
cast<FixedVectorType>(SubTp));
case TTI::SK_InsertSubvector:
return getInsertSubvectorOverhead(Tp, CostKind, Index,
cast<FixedVectorType>(SubTp));
}
llvm_unreachable("Unknown TTI::ShuffleKind");
1037 | } |
1038 | |
1039 | InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, |
1040 | TTI::CastContextHint CCH, |
1041 | TTI::TargetCostKind CostKind, |
1042 | const Instruction *I = nullptr) { |
1043 | if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) |
1044 | return 0; |
1045 | |
1046 | const TargetLoweringBase *TLI = getTLI(); |
1047 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1048 | assert(ISD && "Invalid opcode" ); |
1049 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src); |
1050 | std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Ty: Dst); |
1051 | |
1052 | TypeSize SrcSize = SrcLT.second.getSizeInBits(); |
1053 | TypeSize DstSize = DstLT.second.getSizeInBits(); |
1054 | bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); |
1055 | bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); |
1056 | |
1057 | switch (Opcode) { |
1058 | default: |
1059 | break; |
1060 | case Instruction::Trunc: |
1061 | // Check for NOOP conversions. |
if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1063 | return 0; |
1064 | [[fallthrough]]; |
1065 | case Instruction::BitCast: |
1066 | // Bitcast between types that are legalized to the same type are free and |
1067 | // assume int to/from ptr of the same size is also free. |
1068 | if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && |
1069 | SrcSize == DstSize) |
1070 | return 0; |
1071 | break; |
1072 | case Instruction::FPExt: |
1073 | if (I && getTLI()->isExtFree(I)) |
1074 | return 0; |
1075 | break; |
1076 | case Instruction::ZExt: |
if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1078 | return 0; |
1079 | [[fallthrough]]; |
1080 | case Instruction::SExt: |
1081 | if (I && getTLI()->isExtFree(I)) |
1082 | return 0; |
1083 | |
1084 | // If this is a zext/sext of a load, return 0 if the corresponding |
1085 | // extending load exists on target and the result type is legal. |
1086 | if (CCH == TTI::CastContextHint::Normal) { |
EVT ExtVT = EVT::getEVT(Dst);
EVT LoadVT = EVT::getEVT(Src);
unsigned LType =
((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
if (DstLT.first == SrcLT.first &&
TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1093 | return 0; |
1094 | } |
1095 | break; |
1096 | case Instruction::AddrSpaceCast: |
if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
Dst->getPointerAddressSpace()))
1099 | return 0; |
1100 | break; |
1101 | } |
1102 | |
auto *SrcVTy = dyn_cast<VectorType>(Src);
auto *DstVTy = dyn_cast<VectorType>(Dst);

// If the cast is marked as legal (or promote) then assume low cost.
if (SrcLT.first == DstLT.first &&
TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1109 | return SrcLT.first; |
1110 | |
1111 | // Handle scalar conversions. |
1112 | if (!SrcVTy && !DstVTy) { |
1113 | // Just check the op cost. If the operation is legal then assume it costs |
1114 | // 1. |
if (!TLI->isOperationExpand(ISD, DstLT.second))
return 1;

// Assume that illegal scalar instructions are expensive.
return 4;
1120 | } |
1121 | |
1122 | // Check vector-to-vector casts. |
1123 | if (DstVTy && SrcVTy) { |
1124 | // If the cast is between same-sized registers, then the check is simple. |
1125 | if (SrcLT.first == DstLT.first && SrcSize == DstSize) { |
1126 | |
1127 | // Assume that Zext is done using AND. |
1128 | if (Opcode == Instruction::ZExt) |
1129 | return SrcLT.first; |
1130 | |
1131 | // Assume that sext is done using SHL and SRA. |
1132 | if (Opcode == Instruction::SExt) |
1133 | return SrcLT.first * 2; |
1134 | |
// Just check the op cost. If the operation is legal then assume it costs 1
// and multiply by the type-legalization overhead.
if (!TLI->isOperationExpand(ISD, DstLT.second))
return SrcLT.first * 1;
1140 | } |
1141 | |
1142 | // If we are legalizing by splitting, query the concrete TTI for the cost |
1143 | // of casting the original vector twice. We also need to factor in the |
1144 | // cost of the split itself. Count that as 1, to be consistent with |
1145 | // getTypeLegalizationCost(). |
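// For example (illustrative): an fpext from <8 x float> to <8 x double>
// where both types are split in half is costed as two <4 x float> ->
// <4 x double> casts, with no extra split cost.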
bool SplitSrc =
TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
TargetLowering::TypeSplitVector;
bool SplitDst =
TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
TargetLowering::TypeSplitVector;
if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
DstVTy->getElementCount().isVector()) {
Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1156 | T *TTI = static_cast<T *>(this); |
1157 | // If both types need to be split then the split is free. |
1158 | InstructionCost SplitCost = |
1159 | (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; |
1160 | return SplitCost + |
1161 | (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, |
1162 | CostKind, I)); |
1163 | } |
1164 | |
1165 | // Scalarization cost is Invalid, can't assume any num elements. |
if (isa<ScalableVectorType>(DstVTy))
1167 | return InstructionCost::getInvalid(); |
1168 | |
1169 | // In other cases where the source or destination are illegal, assume |
1170 | // the operation will get scalarized. |
unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1172 | InstructionCost Cost = thisT()->getCastInstrCost( |
1173 | Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); |
1174 | |
1175 | // Return the cost of multiple scalar invocation plus the cost of |
1176 | // inserting and extracting the values. |
1177 | return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true, |
1178 | CostKind) + |
1179 | Num * Cost; |
1180 | } |
1181 | |
// We already handled vector-to-vector and scalar-to-scalar conversions.
// This is where we handle bitcast between vectors and scalars. We need to
// assume that the conversion is scalarized in one way or another.
1186 | if (Opcode == Instruction::BitCast) { |
1187 | // Illegal bitcasts are done by storing and loading from a stack slot. |
1188 | return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false, |
1189 | /*Extract*/ true, CostKind) |
1190 | : 0) + |
1191 | (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true, |
1192 | /*Extract*/ false, CostKind) |
1193 | : 0); |
1194 | } |
1195 | |
1196 | llvm_unreachable("Unhandled cast" ); |
1197 | } |
1198 | |
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
VectorType *VecTy, unsigned Index) {
1201 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
1202 | return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, |
1203 | CostKind, Index, nullptr, nullptr) + |
1204 | thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), |
1205 | TTI::CastContextHint::None, CostKind); |
1206 | } |
1207 | |
1208 | InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, |
1209 | const Instruction *I = nullptr) { |
1210 | return BaseT::getCFInstrCost(Opcode, CostKind, I); |
1211 | } |
1212 | |
1213 | InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, |
1214 | CmpInst::Predicate VecPred, |
1215 | TTI::TargetCostKind CostKind, |
1216 | const Instruction *I = nullptr) { |
1217 | const TargetLoweringBase *TLI = getTLI(); |
1218 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1219 | assert(ISD && "Invalid opcode" ); |
1220 | |
1221 | // TODO: Handle other cost kinds. |
1222 | if (CostKind != TTI::TCK_RecipThroughput) |
1223 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1224 | I); |
1225 | |
1226 | // Selects on vectors are actually vector selects. |
1227 | if (ISD == ISD::SELECT) { |
1228 | assert(CondTy && "CondTy must exist" ); |
1229 | if (CondTy->isVectorTy()) |
1230 | ISD = ISD::VSELECT; |
1231 | } |
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
!TLI->isOperationExpand(ISD, LT.second)) {
1236 | // The operation is legal. Assume it costs 1. Multiply |
1237 | // by the type-legalization overhead. |
1238 | return LT.first * 1; |
1239 | } |
1240 | |
1241 | // Otherwise, assume that the cast is scalarized. |
1242 | // TODO: If one of the types get legalized by splitting, handle this |
1243 | // similarly to what getCastInstrCost() does. |
if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
if (isa<ScalableVectorType>(ValTy))
return InstructionCost::getInvalid();

unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1249 | if (CondTy) |
1250 | CondTy = CondTy->getScalarType(); |
1251 | InstructionCost Cost = thisT()->getCmpSelInstrCost( |
1252 | Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); |
1253 | |
1254 | // Return the cost of multiple scalar invocation plus the cost of |
1255 | // inserting and extracting the values. |
1256 | return getScalarizationOverhead(ValVTy, /*Insert*/ true, |
1257 | /*Extract*/ false, CostKind) + |
1258 | Num * Cost; |
1259 | } |
1260 | |
1261 | // Unknown scalar opcode. |
1262 | return 1; |
1263 | } |
1264 | |
1265 | InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, |
1266 | TTI::TargetCostKind CostKind, |
1267 | unsigned Index, Value *Op0, Value *Op1) { |
return getRegUsageForType(Val->getScalarType());
1269 | } |
1270 | |
1271 | InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, |
1272 | TTI::TargetCostKind CostKind, |
1273 | unsigned Index) { |
1274 | Value *Op0 = nullptr; |
1275 | Value *Op1 = nullptr; |
if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
Op0 = IE->getOperand(0);
Op1 = IE->getOperand(1);
1279 | } |
1280 | return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0, |
1281 | Op1); |
1282 | } |
1283 | |
1284 | InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, |
1285 | int VF, |
1286 | const APInt &DemandedDstElts, |
1287 | TTI::TargetCostKind CostKind) { |
assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
"Unexpected size of DemandedDstElts.");
1290 | |
1291 | InstructionCost Cost; |
1292 | |
auto *SrcVT = FixedVectorType::get(EltTy, VF);
auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1295 | |
1296 | // The Mask shuffling cost is extract all the elements of the Mask |
1297 | // and insert each of them Factor times into the wide vector: |
1298 | // |
1299 | // E.g. an interleaved group with factor 3: |
1300 | // %mask = icmp ult <8 x i32> %vec1, %vec2 |
1301 | // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, |
1302 | // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> |
1303 | // The cost is estimated as extract all mask elements from the <8xi1> mask |
1304 | // vector and insert them factor times into the <24xi1> shuffled mask |
1305 | // vector. |
APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1307 | Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts, |
1308 | /*Insert*/ false, |
1309 | /*Extract*/ true, CostKind); |
1310 | Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts, |
1311 | /*Insert*/ true, |
1312 | /*Extract*/ false, CostKind); |
1313 | |
1314 | return Cost; |
1315 | } |
1316 | |
1317 | InstructionCost |
1318 | getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, |
1319 | unsigned AddressSpace, TTI::TargetCostKind CostKind, |
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
const Instruction *I = nullptr) {
assert(!Src->isVoidTy() && "Invalid type");
// Assume types, such as structs, are expensive.
if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
return 4;
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1327 | |
1328 | // Assuming that all loads of legal types cost 1. |
1329 | InstructionCost Cost = LT.first; |
1330 | if (CostKind != TTI::TCK_RecipThroughput) |
1331 | return Cost; |
1332 | |
1333 | const DataLayout &DL = this->getDataLayout(); |
1334 | if (Src->isVectorTy() && |
1335 | // In practice it's not currently possible to have a change in lane |
1336 | // length for extending loads or truncating stores so both types should |
1337 | // have the same scalable property. |
TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
LT.second.getSizeInBits())) {
1340 | // This is a vector load that legalizes to a larger type than the vector |
1341 | // itself. Unless the corresponding extending load or truncating store is |
1342 | // legal, then this will scalarize. |
1343 | TargetLowering::LegalizeAction LA = TargetLowering::Expand; |
1344 | EVT MemVT = getTLI()->getValueType(DL, Src); |
1345 | if (Opcode == Instruction::Store) |
1346 | LA = getTLI()->getTruncStoreAction(LT.second, MemVT); |
1347 | else |
1348 | LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); |
1349 | |
1350 | if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { |
1351 | // This is a vector load/store for some illegal type that is scalarized. |
1352 | // We must account for the cost of building or decomposing the vector. |
Cost += getScalarizationOverhead(
cast<VectorType>(Src), Opcode != Instruction::Store,
Opcode == Instruction::Store, CostKind);
1356 | } |
1357 | } |
1358 | |
1359 | return Cost; |
1360 | } |
1361 | |
1362 | InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, |
1363 | Align Alignment, unsigned AddressSpace, |
1364 | TTI::TargetCostKind CostKind) { |
return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment,
/*VariableMask=*/true,
/*IsGatherScatter=*/false, CostKind);
1367 | } |
1368 | |
1369 | InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, |
1370 | const Value *Ptr, bool VariableMask, |
1371 | Align Alignment, |
1372 | TTI::TargetCostKind CostKind, |
1373 | const Instruction *I = nullptr) { |
return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
/*IsGatherScatter=*/true, CostKind);
1376 | } |
1377 | |
1378 | InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, |
1379 | const Value *Ptr, bool VariableMask, |
1380 | Align Alignment, |
1381 | TTI::TargetCostKind CostKind, |
1382 | const Instruction *I) { |
1383 | // For a target without strided memory operations (or for an illegal |
1384 | // operation type on one which does), assume we lower to a gather/scatter |
1385 | // operation. (Which may in turn be scalarized.) |
1386 | return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
1387 | Alignment, CostKind, I); |
1388 | } |
1389 | |
1390 | InstructionCost getInterleavedMemoryOpCost( |
1391 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
1392 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
1393 | bool UseMaskForCond = false, bool UseMaskForGaps = false) { |
1394 | |
1395 | // We cannot scalarize scalable vectors, so return Invalid. |
    if (isa<ScalableVectorType>(VecTy))
1397 | return InstructionCost::getInvalid(); |
1398 | |
    auto *VT = cast<FixedVectorType>(VecTy);
1400 | |
1401 | unsigned NumElts = VT->getNumElements(); |
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1403 | |
1404 | unsigned NumSubElts = NumElts / Factor; |
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1406 | |
1407 | // Firstly, the cost of load/store operation. |
1408 | InstructionCost Cost; |
1409 | if (UseMaskForCond || UseMaskForGaps) |
1410 | Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, |
1411 | AddressSpace, CostKind); |
1412 | else |
1413 | Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, |
1414 | CostKind); |
1415 | |
1416 | // Legalize the vector type, and get the legalized and unlegalized type |
1417 | // sizes. |
    MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1419 | unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); |
1420 | unsigned VecTyLTSize = VecTyLT.getStoreSize(); |
1421 | |
1422 | // Scale the cost of the memory operation by the fraction of legalized |
1423 | // instructions that will actually be used. We shouldn't account for the |
1424 | // cost of dead instructions since they will be removed. |
1425 | // |
1426 | // E.g., An interleaved load of factor 8: |
1427 | // %vec = load <16 x i64>, <16 x i64>* %ptr |
1428 | // %v0 = shufflevector %vec, undef, <0, 8> |
1429 | // |
1430 | // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be |
1431 | // used (those corresponding to elements [0:1] and [8:9] of the unlegalized |
1432 | // type). The other loads are unused. |
1433 | // |
1434 | // TODO: Note that legalization can turn masked loads/stores into unmasked |
1435 | // (legalized) loads/stores. This can be reflected in the cost. |
1436 | if (Cost.isValid() && VecTySize > VecTyLTSize) { |
1437 | // The number of loads of a legal type it will take to represent a load |
1438 | // of the unlegalized vector type. |
      unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1440 | |
1441 | // The number of elements of the unlegalized type that correspond to a |
1442 | // single legal instruction. |
      unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1444 | |
1445 | // Determine which legal instructions will be used. |
1446 | BitVector UsedInsts(NumLegalInsts, false); |
1447 | for (unsigned Index : Indices) |
1448 | for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) |
1449 | UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); |
1450 | |
1451 | // Scale the cost of the load by the fraction of legal instructions that |
1452 | // will be used. |
      Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1454 | } |
1455 | |
    // Next, add the cost of the interleave operation itself.
    assert(Indices.size() <= Factor &&
           "Interleaved memory op has too many members");
1459 | |
    const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
    const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1462 | |
    APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1464 | for (unsigned Index : Indices) { |
      assert(Index < Factor && "Invalid index for interleaved memory op");
1466 | for (unsigned Elm = 0; Elm < NumSubElts; Elm++) |
1467 | DemandedLoadStoreElts.setBit(Index + Elm * Factor); |
1468 | } |
1469 | |
1470 | if (Opcode == Instruction::Load) { |
1471 | // The interleave cost is similar to extract sub vectors' elements |
1472 | // from the wide vector, and insert them into sub vectors. |
1473 | // |
1474 | // E.g. An interleaved load of factor 2 (with one member of index 0): |
1475 | // %vec = load <8 x i32>, <8 x i32>* %ptr |
1476 | // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 |
1477 | // The cost is estimated as extract elements at 0, 2, 4, 6 from the |
1478 | // <8 x i32> vector and insert them into a <4 x i32> vector. |
1479 | InstructionCost InsSubCost = thisT()->getScalarizationOverhead( |
1480 | SubVT, DemandedAllSubElts, |
1481 | /*Insert*/ true, /*Extract*/ false, CostKind); |
1482 | Cost += Indices.size() * InsSubCost; |
1483 | Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, |
1484 | /*Insert*/ false, |
1485 | /*Extract*/ true, CostKind); |
1486 | } else { |
1487 | // The interleave cost is extract elements from sub vectors, and |
1488 | // insert them into the wide vector. |
1489 | // |
1490 | // E.g. An interleaved store of factor 3 with 2 members at indices 0,1: |
1491 | // (using VF=4): |
1492 | // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef> |
1493 | // %gaps.mask = <true, true, false, true, true, false, |
1494 | // true, true, false, true, true, false> |
1495 | // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr, |
1496 | // i32 Align, <12 x i1> %gaps.mask |
1497 | // The cost is estimated as extract all elements (of actual members, |
1498 | // excluding gaps) from both <4 x i32> vectors and insert into the <12 x |
1499 | // i32> vector. |
1500 | InstructionCost ExtSubCost = thisT()->getScalarizationOverhead( |
1501 | SubVT, DemandedAllSubElts, |
1502 | /*Insert*/ false, /*Extract*/ true, CostKind); |
1503 | Cost += ExtSubCost * Indices.size(); |
1504 | Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, |
1505 | /*Insert*/ true, |
1506 | /*Extract*/ false, CostKind); |
1507 | } |
1508 | |
1509 | if (!UseMaskForCond) |
1510 | return Cost; |
1511 | |
    Type *I8Type = Type::getInt8Ty(VT->getContext());
1513 | |
1514 | Cost += thisT()->getReplicationShuffleCost( |
1515 | I8Type, Factor, NumSubElts, |
1516 | UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts, |
1517 | CostKind); |
1518 | |
1519 | // The Gaps mask is invariant and created outside the loop, therefore the |
1520 | // cost of creating it is not accounted for here. However if we have both |
1521 | // a MaskForGaps and some other mask that guards the execution of the |
1522 | // memory access, we need to account for the cost of And-ing the two masks |
1523 | // inside the loop. |
1524 | if (UseMaskForGaps) { |
      auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1526 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, |
1527 | CostKind); |
1528 | } |
1529 | |
1530 | return Cost; |
1531 | } |
1532 | |
1533 | /// Get intrinsic cost based on arguments. |
1534 | InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
1535 | TTI::TargetCostKind CostKind) { |
1536 | // Check for generically free intrinsics. |
1537 | if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) |
1538 | return 0; |
1539 | |
1540 | // Assume that target intrinsics are cheap. |
1541 | Intrinsic::ID IID = ICA.getID(); |
1542 | if (Function::isTargetIntrinsic(IID)) |
1543 | return TargetTransformInfo::TCC_Basic; |
1544 | |
1545 | if (ICA.isTypeBasedOnly()) |
1546 | return getTypeBasedIntrinsicInstrCost(ICA, CostKind); |
1547 | |
1548 | Type *RetTy = ICA.getReturnType(); |
1549 | |
1550 | ElementCount RetVF = |
        (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
                             : ElementCount::getFixed(1));
1553 | const IntrinsicInst *I = ICA.getInst(); |
1554 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
1555 | FastMathFlags FMF = ICA.getFlags(); |
1556 | switch (IID) { |
1557 | default: |
1558 | break; |
1559 | |
1560 | case Intrinsic::powi: |
      if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1562 | bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize(); |
1563 | if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(), |
1564 | ShouldOptForSize)) { |
1565 | // The cost is modeled on the expansion performed by ExpandPowI in |
1566 | // SelectionDAGBuilder. |
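          // E.g. for powi(X, 11): 11 = 0b1011, so ActiveBits = 4 and
          // PopCount = 3, giving (4 + 3 - 2) = 5 fmuls, which matches a
          // square-and-multiply sequence of 3 squarings plus 2 multiplies
          // (an illustrative count, not the exact emitted sequence).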
1567 | APInt Exponent = RHSC->getValue().abs(); |
1568 | unsigned ActiveBits = Exponent.getActiveBits(); |
1569 | unsigned PopCount = Exponent.popcount(); |
1570 | InstructionCost Cost = (ActiveBits + PopCount - 2) * |
1571 | thisT()->getArithmeticInstrCost( |
1572 | Instruction::FMul, RetTy, CostKind); |
1573 | if (RHSC->isNegative()) |
1574 | Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy, |
1575 | CostKind); |
1576 | return Cost; |
1577 | } |
1578 | } |
1579 | break; |
1580 | case Intrinsic::cttz: |
1581 | // FIXME: If necessary, this should go in target-specific overrides. |
1582 | if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy)) |
1583 | return TargetTransformInfo::TCC_Basic; |
1584 | break; |
1585 | |
1586 | case Intrinsic::ctlz: |
1587 | // FIXME: If necessary, this should go in target-specific overrides. |
1588 | if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy)) |
1589 | return TargetTransformInfo::TCC_Basic; |
1590 | break; |
1591 | |
1592 | case Intrinsic::memcpy: |
1593 | return thisT()->getMemcpyCost(ICA.getInst()); |
1594 | |
1595 | case Intrinsic::masked_scatter: { |
1596 | const Value *Mask = Args[3]; |
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1599 | return thisT()->getGatherScatterOpCost(Instruction::Store, |
1600 | ICA.getArgTypes()[0], Args[1], |
1601 | VarMask, Alignment, CostKind, I); |
1602 | } |
1603 | case Intrinsic::masked_gather: { |
1604 | const Value *Mask = Args[2]; |
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1607 | return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], |
1608 | VarMask, Alignment, CostKind, I); |
1609 | } |
1610 | case Intrinsic::experimental_vp_strided_store: { |
1611 | const Value *Data = Args[0]; |
1612 | const Value *Ptr = Args[1]; |
1613 | const Value *Mask = Args[3]; |
1614 | const Value *EVL = Args[4]; |
      bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
      Align Alignment = I->getParamAlign(1).valueOrOne();
1617 | return thisT()->getStridedMemoryOpCost(Instruction::Store, |
1618 | Data->getType(), Ptr, VarMask, |
1619 | Alignment, CostKind, I); |
1620 | } |
1621 | case Intrinsic::experimental_vp_strided_load: { |
1622 | const Value *Ptr = Args[0]; |
1623 | const Value *Mask = Args[2]; |
1624 | const Value *EVL = Args[3]; |
      bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
      Align Alignment = I->getParamAlign(0).valueOrOne();
1627 | return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr, |
1628 | VarMask, Alignment, CostKind, I); |
1629 | } |
1630 | case Intrinsic::experimental_stepvector: { |
      if (isa<ScalableVectorType>(RetTy))
1632 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1633 | // The cost of materialising a constant integer vector. |
1634 | return TargetTransformInfo::TCC_Basic; |
1635 | } |
1636 | case Intrinsic::vector_extract: { |
1637 | // FIXME: Handle case where a scalable vector is extracted from a scalable |
1638 | // vector |
      if (isa<ScalableVectorType>(RetTy))
1640 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
      unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
          std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1645 | } |
1646 | case Intrinsic::vector_insert: { |
1647 | // FIXME: Handle case where a scalable vector is inserted into a scalable |
1648 | // vector |
      if (isa<ScalableVectorType>(Args[1]->getType()))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
          std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1655 | } |
1656 | case Intrinsic::experimental_vector_reverse: { |
      return thisT()->getShuffleCost(
          TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
          CostKind, 0, cast<VectorType>(RetTy));
1660 | } |
1661 | case Intrinsic::experimental_vector_splice: { |
      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
          CostKind, Index, cast<VectorType>(RetTy));
1666 | } |
1667 | case Intrinsic::vector_reduce_add: |
1668 | case Intrinsic::vector_reduce_mul: |
1669 | case Intrinsic::vector_reduce_and: |
1670 | case Intrinsic::vector_reduce_or: |
1671 | case Intrinsic::vector_reduce_xor: |
1672 | case Intrinsic::vector_reduce_smax: |
1673 | case Intrinsic::vector_reduce_smin: |
1674 | case Intrinsic::vector_reduce_fmax: |
1675 | case Intrinsic::vector_reduce_fmin: |
1676 | case Intrinsic::vector_reduce_fmaximum: |
1677 | case Intrinsic::vector_reduce_fminimum: |
1678 | case Intrinsic::vector_reduce_umax: |
1679 | case Intrinsic::vector_reduce_umin: { |
1680 | IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); |
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1682 | } |
1683 | case Intrinsic::vector_reduce_fadd: |
1684 | case Intrinsic::vector_reduce_fmul: { |
1685 | IntrinsicCostAttributes Attrs( |
1686 | IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); |
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1688 | } |
1689 | case Intrinsic::fshl: |
1690 | case Intrinsic::fshr: { |
1691 | const Value *X = Args[0]; |
1692 | const Value *Y = Args[1]; |
1693 | const Value *Z = Args[2]; |
      const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
      const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
      const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
      const TTI::OperandValueInfo OpInfoBW =
          {TTI::OK_UniformConstantValue,
           isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
                                                       : TTI::OP_None};
1701 | |
1702 | // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) |
1703 | // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) |
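      // E.g. for i8 and a constant shift: fshl(X, Y, 3) == (X << 3) | (Y >> 5)
      // (an illustrative instance of the expansion above).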
1704 | InstructionCost Cost = 0; |
1705 | Cost += |
1706 | thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); |
1707 | Cost += |
1708 | thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); |
1709 | Cost += thisT()->getArithmeticInstrCost( |
1710 | BinaryOperator::Shl, RetTy, CostKind, OpInfoX, |
1711 | {OpInfoZ.Kind, TTI::OP_None}); |
1712 | Cost += thisT()->getArithmeticInstrCost( |
1713 | BinaryOperator::LShr, RetTy, CostKind, OpInfoY, |
1714 | {OpInfoZ.Kind, TTI::OP_None}); |
      // Non-constant shift amounts require a modulo.
1716 | if (!OpInfoZ.isConstant()) |
1717 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, |
1718 | CostKind, OpInfoZ, OpInfoBW); |
1719 | // For non-rotates (X != Y) we must add shift-by-zero handling costs. |
1720 | if (X != Y) { |
        Type *CondTy = RetTy->getWithNewBitWidth(1);
1722 | Cost += |
1723 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
1724 | CmpInst::ICMP_EQ, CostKind); |
1725 | Cost += |
1726 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
1727 | CmpInst::ICMP_EQ, CostKind); |
1728 | } |
1729 | return Cost; |
1730 | } |
1731 | case Intrinsic::get_active_lane_mask: { |
1732 | EVT ResVT = getTLI()->getValueType(DL, RetTy, true); |
1733 | EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); |
1734 | |
1735 | // If we're not expanding the intrinsic then we assume this is cheap |
1736 | // to implement. |
1737 | if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) { |
        return getTypeLegalizationCost(RetTy).first;
1739 | } |
1740 | |
1741 | // Create the expanded types that will be used to calculate the uadd_sat |
1742 | // operation. |
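      // The expansion being costed is roughly (illustrative):
      //   %idx  = uadd_sat(splat(%base), stepvector)
      //   %mask = icmp ult %idx, splat(%tripcount)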
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1745 | IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF); |
1746 | InstructionCost Cost = |
1747 | thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); |
1748 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, |
1749 | CmpInst::ICMP_ULT, CostKind); |
1750 | return Cost; |
1751 | } |
1752 | } |
1753 | |
1754 | // VP Intrinsics should have the same cost as their non-vp counterpart. |
1755 | // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp |
1756 | // counterpart when the vector length argument is smaller than the maximum |
1757 | // vector length. |
1758 | // TODO: Support other kinds of VPIntrinsics |
1759 | if (VPIntrinsic::isVPIntrinsic(ICA.getID())) { |
1760 | std::optional<unsigned> FOp = |
          VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
1762 | if (FOp) { |
1763 | if (ICA.getID() == Intrinsic::vp_load) { |
1764 | Align Alignment; |
          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1766 | Alignment = VPI->getPointerAlignment().valueOrOne(); |
1767 | unsigned AS = 0; |
1768 | if (ICA.getArgs().size() > 1) |
            if (auto *PtrTy =
                    dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
1771 | AS = PtrTy->getAddressSpace(); |
1772 | return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment, |
1773 | AS, CostKind); |
1774 | } |
1775 | if (ICA.getID() == Intrinsic::vp_store) { |
1776 | Align Alignment; |
          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1778 | Alignment = VPI->getPointerAlignment().valueOrOne(); |
1779 | unsigned AS = 0; |
1780 | if (ICA.getArgs().size() >= 2) |
            if (auto *PtrTy =
                    dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
1783 | AS = PtrTy->getAddressSpace(); |
1784 | return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment, |
1785 | AS, CostKind); |
1786 | } |
        if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
1788 | return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(), |
1789 | CostKind); |
1790 | } |
1791 | } |
1792 | |
1793 | std::optional<Intrinsic::ID> FID = |
          VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
1795 | if (FID) { |
1796 | // Non-vp version will have same Args/Tys except mask and vector length. |
1797 | assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 && |
1798 | "Expected VPIntrinsic to have Mask and Vector Length args and " |
1799 | "types" ); |
1800 | ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(N: 2); |
1801 | |
      // VPReduction intrinsics have a start value argument that their non-vp
      // counterparts do not have, except for fadd and fmul, whose non-vp
      // counterparts do take a start value.
      if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
          *FID != Intrinsic::vector_reduce_fadd &&
          *FID != Intrinsic::vector_reduce_fmul)
        NewTys = NewTys.drop_front();
1809 | |
1810 | IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys, |
1811 | ICA.getFlags()); |
1812 | return thisT()->getIntrinsicInstrCost(NewICA, CostKind); |
1813 | } |
1814 | } |
1815 | |
    // Assume that we need to scalarize this intrinsic.
1817 | // Compute the scalarization overhead based on Args for a vector |
1818 | // intrinsic. |
1819 | InstructionCost ScalarizationCost = InstructionCost::getInvalid(); |
1820 | if (RetVF.isVector() && !RetVF.isScalable()) { |
1821 | ScalarizationCost = 0; |
1822 | if (!RetTy->isVoidTy()) |
1823 | ScalarizationCost += getScalarizationOverhead( |
            cast<VectorType>(RetTy),
1825 | /*Insert*/ true, /*Extract*/ false, CostKind); |
1826 | ScalarizationCost += |
          getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1828 | } |
1829 | |
1830 | IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I, |
1831 | ScalarizationCost); |
1832 | return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); |
1833 | } |
1834 | |
1835 | /// Get intrinsic cost based on argument types. |
1836 | /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the |
1837 | /// cost of scalarizing the arguments and the return value will be computed |
1838 | /// based on types. |
1839 | InstructionCost |
1840 | getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
1841 | TTI::TargetCostKind CostKind) { |
1842 | Intrinsic::ID IID = ICA.getID(); |
1843 | Type *RetTy = ICA.getReturnType(); |
1844 | const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes(); |
1845 | FastMathFlags FMF = ICA.getFlags(); |
1846 | InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost(); |
1847 | bool SkipScalarizationCost = ICA.skipScalarizationCost(); |
1848 | |
1849 | VectorType *VecOpTy = nullptr; |
1850 | if (!Tys.empty()) { |
1851 | // The vector reduction operand is operand 0 except for fadd/fmul. |
1852 | // Their operand 0 is a scalar start value, so the vector op is operand 1. |
1853 | unsigned VecTyIndex = 0; |
1854 | if (IID == Intrinsic::vector_reduce_fadd || |
1855 | IID == Intrinsic::vector_reduce_fmul) |
1856 | VecTyIndex = 1; |
      assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
      VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1859 | } |
1860 | |
1861 | // Library call cost - other than size, make it expensive. |
1862 | unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; |
1863 | unsigned ISD = 0; |
1864 | switch (IID) { |
1865 | default: { |
1866 | // Scalable vectors cannot be scalarized, so return Invalid. |
      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
            return isa<ScalableVectorType>(Ty);
1869 | })) |
1870 | return InstructionCost::getInvalid(); |
1871 | |
1872 | // Assume that we need to scalarize this intrinsic. |
1873 | InstructionCost ScalarizationCost = |
1874 | SkipScalarizationCost ? ScalarizationCostPassed : 0; |
1875 | unsigned ScalarCalls = 1; |
1876 | Type *ScalarRetTy = RetTy; |
      if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1878 | if (!SkipScalarizationCost) |
1879 | ScalarizationCost = getScalarizationOverhead( |
1880 | RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); |
        ScalarCalls = std::max(ScalarCalls,
                               cast<FixedVectorType>(RetVTy)->getNumElements());
1883 | ScalarRetTy = RetTy->getScalarType(); |
1884 | } |
1885 | SmallVector<Type *, 4> ScalarTys; |
1886 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
1887 | Type *Ty = Tys[i]; |
        if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1889 | if (!SkipScalarizationCost) |
1890 | ScalarizationCost += getScalarizationOverhead( |
1891 | VTy, /*Insert*/ false, /*Extract*/ true, CostKind); |
          ScalarCalls = std::max(ScalarCalls,
                                 cast<FixedVectorType>(VTy)->getNumElements());
1894 | Ty = Ty->getScalarType(); |
1895 | } |
        ScalarTys.push_back(Ty);
1897 | } |
1898 | if (ScalarCalls == 1) |
1899 | return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. |
1900 | |
1901 | IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); |
1902 | InstructionCost ScalarCost = |
1903 | thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); |
1904 | |
1905 | return ScalarCalls * ScalarCost + ScalarizationCost; |
1906 | } |
1907 | // Look for intrinsics that can be lowered directly or turned into a scalar |
1908 | // intrinsic call. |
1909 | case Intrinsic::sqrt: |
1910 | ISD = ISD::FSQRT; |
1911 | break; |
1912 | case Intrinsic::sin: |
1913 | ISD = ISD::FSIN; |
1914 | break; |
1915 | case Intrinsic::cos: |
1916 | ISD = ISD::FCOS; |
1917 | break; |
1918 | case Intrinsic::exp: |
1919 | ISD = ISD::FEXP; |
1920 | break; |
1921 | case Intrinsic::exp2: |
1922 | ISD = ISD::FEXP2; |
1923 | break; |
1924 | case Intrinsic::exp10: |
1925 | ISD = ISD::FEXP10; |
1926 | break; |
1927 | case Intrinsic::log: |
1928 | ISD = ISD::FLOG; |
1929 | break; |
1930 | case Intrinsic::log10: |
1931 | ISD = ISD::FLOG10; |
1932 | break; |
1933 | case Intrinsic::log2: |
1934 | ISD = ISD::FLOG2; |
1935 | break; |
1936 | case Intrinsic::fabs: |
1937 | ISD = ISD::FABS; |
1938 | break; |
1939 | case Intrinsic::canonicalize: |
1940 | ISD = ISD::FCANONICALIZE; |
1941 | break; |
1942 | case Intrinsic::minnum: |
1943 | ISD = ISD::FMINNUM; |
1944 | break; |
1945 | case Intrinsic::maxnum: |
1946 | ISD = ISD::FMAXNUM; |
1947 | break; |
1948 | case Intrinsic::minimum: |
1949 | ISD = ISD::FMINIMUM; |
1950 | break; |
1951 | case Intrinsic::maximum: |
1952 | ISD = ISD::FMAXIMUM; |
1953 | break; |
1954 | case Intrinsic::copysign: |
1955 | ISD = ISD::FCOPYSIGN; |
1956 | break; |
1957 | case Intrinsic::floor: |
1958 | ISD = ISD::FFLOOR; |
1959 | break; |
1960 | case Intrinsic::ceil: |
1961 | ISD = ISD::FCEIL; |
1962 | break; |
1963 | case Intrinsic::trunc: |
1964 | ISD = ISD::FTRUNC; |
1965 | break; |
1966 | case Intrinsic::nearbyint: |
1967 | ISD = ISD::FNEARBYINT; |
1968 | break; |
1969 | case Intrinsic::rint: |
1970 | ISD = ISD::FRINT; |
1971 | break; |
1972 | case Intrinsic::lrint: |
1973 | ISD = ISD::LRINT; |
1974 | break; |
1975 | case Intrinsic::llrint: |
1976 | ISD = ISD::LLRINT; |
1977 | break; |
1978 | case Intrinsic::round: |
1979 | ISD = ISD::FROUND; |
1980 | break; |
1981 | case Intrinsic::roundeven: |
1982 | ISD = ISD::FROUNDEVEN; |
1983 | break; |
1984 | case Intrinsic::pow: |
1985 | ISD = ISD::FPOW; |
1986 | break; |
1987 | case Intrinsic::fma: |
1988 | ISD = ISD::FMA; |
1989 | break; |
1990 | case Intrinsic::fmuladd: |
1991 | ISD = ISD::FMA; |
1992 | break; |
1993 | case Intrinsic::experimental_constrained_fmuladd: |
1994 | ISD = ISD::STRICT_FMA; |
1995 | break; |
1996 | // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. |
1997 | case Intrinsic::lifetime_start: |
1998 | case Intrinsic::lifetime_end: |
1999 | case Intrinsic::sideeffect: |
2000 | case Intrinsic::pseudoprobe: |
2001 | case Intrinsic::arithmetic_fence: |
2002 | return 0; |
2003 | case Intrinsic::masked_store: { |
2004 | Type *Ty = Tys[0]; |
2005 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); |
2006 | return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, |
2007 | CostKind); |
2008 | } |
2009 | case Intrinsic::masked_load: { |
2010 | Type *Ty = RetTy; |
2011 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); |
2012 | return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, |
2013 | CostKind); |
2014 | } |
2015 | case Intrinsic::vector_reduce_add: |
2016 | return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, |
2017 | std::nullopt, CostKind); |
2018 | case Intrinsic::vector_reduce_mul: |
2019 | return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, |
2020 | std::nullopt, CostKind); |
2021 | case Intrinsic::vector_reduce_and: |
2022 | return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, |
2023 | std::nullopt, CostKind); |
2024 | case Intrinsic::vector_reduce_or: |
2025 | return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, |
2026 | std::nullopt, CostKind); |
2027 | case Intrinsic::vector_reduce_xor: |
2028 | return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, |
2029 | std::nullopt, CostKind); |
2030 | case Intrinsic::vector_reduce_fadd: |
2031 | return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, |
2032 | FMF, CostKind); |
2033 | case Intrinsic::vector_reduce_fmul: |
2034 | return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, |
2035 | FMF, CostKind); |
2036 | case Intrinsic::vector_reduce_smax: |
2037 | return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy, |
2038 | ICA.getFlags(), CostKind); |
2039 | case Intrinsic::vector_reduce_smin: |
2040 | return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy, |
2041 | ICA.getFlags(), CostKind); |
2042 | case Intrinsic::vector_reduce_umax: |
2043 | return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy, |
2044 | ICA.getFlags(), CostKind); |
2045 | case Intrinsic::vector_reduce_umin: |
2046 | return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy, |
2047 | ICA.getFlags(), CostKind); |
2048 | case Intrinsic::vector_reduce_fmax: |
2049 | return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy, |
2050 | ICA.getFlags(), CostKind); |
2051 | case Intrinsic::vector_reduce_fmin: |
2052 | return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy, |
2053 | ICA.getFlags(), CostKind); |
2054 | case Intrinsic::vector_reduce_fmaximum: |
2055 | return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy, |
2056 | ICA.getFlags(), CostKind); |
2057 | case Intrinsic::vector_reduce_fminimum: |
2058 | return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy, |
2059 | ICA.getFlags(), CostKind); |
2060 | case Intrinsic::abs: { |
2061 | // abs(X) = select(icmp(X,0),X,sub(0,X)) |
      Type *CondTy = RetTy->getWithNewBitWidth(1);
2063 | CmpInst::Predicate Pred = CmpInst::ICMP_SGT; |
2064 | InstructionCost Cost = 0; |
2065 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
2066 | Pred, CostKind); |
2067 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
2068 | Pred, CostKind); |
2069 | // TODO: Should we add an OperandValueProperties::OP_Zero property? |
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::Sub, RetTy, CostKind,
          {TTI::OK_UniformConstantValue, TTI::OP_None});
2072 | return Cost; |
2073 | } |
2074 | case Intrinsic::smax: |
2075 | case Intrinsic::smin: |
2076 | case Intrinsic::umax: |
2077 | case Intrinsic::umin: { |
2078 | // minmax(X,Y) = select(icmp(X,Y),X,Y) |
      Type *CondTy = RetTy->getWithNewBitWidth(1);
2080 | bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; |
2081 | CmpInst::Predicate Pred = |
2082 | IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; |
2083 | InstructionCost Cost = 0; |
2084 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
2085 | Pred, CostKind); |
2086 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
2087 | Pred, CostKind); |
2088 | return Cost; |
2089 | } |
2090 | case Intrinsic::sadd_sat: |
2091 | case Intrinsic::ssub_sat: { |
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
2095 | Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat |
2096 | ? Intrinsic::sadd_with_overflow |
2097 | : Intrinsic::ssub_with_overflow; |
2098 | CmpInst::Predicate Pred = CmpInst::ICMP_SGT; |
2099 | |
2100 | // SatMax -> Overflow && SumDiff < 0 |
2101 | // SatMin -> Overflow && SumDiff >= 0 |
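      // E.g. for i8: sadd_sat(100, 100) wraps to a negative sum, so it
      // clamps to SatMax = 127 (an illustrative instance of the rule above).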
2102 | InstructionCost Cost = 0; |
2103 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, |
2104 | nullptr, ScalarizationCostPassed); |
2105 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
2106 | Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, |
2107 | Pred, CostKind); |
2108 | Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, |
2109 | CondTy, Pred, CostKind); |
2110 | return Cost; |
2111 | } |
2112 | case Intrinsic::uadd_sat: |
2113 | case Intrinsic::usub_sat: { |
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
2117 | Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat |
2118 | ? Intrinsic::uadd_with_overflow |
2119 | : Intrinsic::usub_with_overflow; |
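      // uadd_sat selects UINT_MAX on overflow and usub_sat selects 0, so the
      // expansion being costed is the overflow intrinsic plus one select
      // (a sketch of the generic lowering).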
2120 | |
2121 | InstructionCost Cost = 0; |
2122 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, |
2123 | nullptr, ScalarizationCostPassed); |
2124 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
2125 | Cost += |
2126 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, |
2127 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
2128 | return Cost; |
2129 | } |
2130 | case Intrinsic::smul_fix: |
2131 | case Intrinsic::umul_fix: { |
2132 | unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; |
      Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2134 | |
2135 | unsigned ExtOp = |
2136 | IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; |
2137 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
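      // A sketch of the expansion being costed (the generic fixed-point
      // lowering): extend both operands to 2x width, multiply, then
      // recombine the scaled halves with lshr/shl/or and truncate back.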
2138 | |
2139 | InstructionCost Cost = 0; |
2140 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); |
2141 | Cost += |
2142 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); |
2143 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, |
2144 | CCH, CostKind); |
      Cost += thisT()->getArithmeticInstrCost(
          Instruction::LShr, RetTy, CostKind,
          {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_UniformConstantValue, TTI::OP_None});
      Cost += thisT()->getArithmeticInstrCost(
          Instruction::Shl, RetTy, CostKind,
          {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_UniformConstantValue, TTI::OP_None});
2152 | Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); |
2153 | return Cost; |
2154 | } |
2155 | case Intrinsic::sadd_with_overflow: |
2156 | case Intrinsic::ssub_with_overflow: { |
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
2159 | unsigned Opcode = IID == Intrinsic::sadd_with_overflow |
2160 | ? BinaryOperator::Add |
2161 | : BinaryOperator::Sub; |
2162 | |
2163 | // Add: |
2164 | // Overflow -> (Result < LHS) ^ (RHS < 0) |
2165 | // Sub: |
2166 | // Overflow -> (Result < LHS) ^ (RHS > 0) |
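      // E.g. for i8: 100 + 100 wraps to -56, so (Result < LHS) is true and
      // (RHS < 0) is false, giving Overflow = true (illustrative).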
2167 | InstructionCost Cost = 0; |
2168 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); |
2169 | Cost += 2 * thisT()->getCmpSelInstrCost( |
2170 | Instruction::ICmp, SumTy, OverflowTy, |
2171 | CmpInst::ICMP_SGT, CostKind); |
2172 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, |
2173 | CostKind); |
2174 | return Cost; |
2175 | } |
2176 | case Intrinsic::uadd_with_overflow: |
2177 | case Intrinsic::usub_with_overflow: { |
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
2180 | unsigned Opcode = IID == Intrinsic::uadd_with_overflow |
2181 | ? BinaryOperator::Add |
2182 | : BinaryOperator::Sub; |
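      // Unsigned add overflows iff the wrapped result is ULT the LHS;
      // unsigned sub overflows iff the result is UGT the LHS, so a single
      // compare suffices.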
2183 | CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow |
2184 | ? CmpInst::ICMP_ULT |
2185 | : CmpInst::ICMP_UGT; |
2186 | |
2187 | InstructionCost Cost = 0; |
2188 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); |
2189 | Cost += |
2190 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, |
2191 | Pred, CostKind); |
2192 | return Cost; |
2193 | } |
2194 | case Intrinsic::smul_with_overflow: |
2195 | case Intrinsic::umul_with_overflow: { |
      Type *MulTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
      Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2200 | bool IsSigned = IID == Intrinsic::smul_with_overflow; |
2201 | |
2202 | unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; |
2203 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
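      // A sketch of the expansion being costed: extend both operands to 2x
      // width, multiply, truncate back, and compare the high half against
      // the (sign-)extension of the low half to detect overflow.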
2204 | |
2205 | InstructionCost Cost = 0; |
2206 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); |
2207 | Cost += |
2208 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); |
2209 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, |
2210 | CCH, CostKind); |
      Cost += thisT()->getArithmeticInstrCost(
          Instruction::LShr, ExtTy, CostKind,
          {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_UniformConstantValue, TTI::OP_None});
2215 | |
2216 | if (IsSigned) |
        Cost += thisT()->getArithmeticInstrCost(
            Instruction::AShr, MulTy, CostKind,
            {TTI::OK_AnyValue, TTI::OP_None},
            {TTI::OK_UniformConstantValue, TTI::OP_None});
2221 | |
2222 | Cost += thisT()->getCmpSelInstrCost( |
2223 | BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); |
2224 | return Cost; |
2225 | } |
2226 | case Intrinsic::fptosi_sat: |
2227 | case Intrinsic::fptoui_sat: { |
2228 | if (Tys.empty()) |
2229 | break; |
2230 | Type *FromTy = Tys[0]; |
2231 | bool IsSigned = IID == Intrinsic::fptosi_sat; |
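      // A sketch of the expansion being costed: clamp the input to the
      // representable range with minnum/maxnum, convert with fptosi/fptoui,
      // and (for the signed case) select 0 when the input is NaN.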
2232 | |
2233 | InstructionCost Cost = 0; |
2234 | IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, |
2235 | {FromTy, FromTy}); |
2236 | Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); |
2237 | IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, |
2238 | {FromTy, FromTy}); |
2239 | Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); |
2240 | Cost += thisT()->getCastInstrCost( |
2241 | IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, |
2242 | TTI::CastContextHint::None, CostKind); |
2243 | if (IsSigned) { |
        Type *CondTy = RetTy->getWithNewBitWidth(1);
2245 | Cost += thisT()->getCmpSelInstrCost( |
2246 | BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); |
2247 | Cost += thisT()->getCmpSelInstrCost( |
2248 | BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); |
2249 | } |
2250 | return Cost; |
2251 | } |
2252 | case Intrinsic::ctpop: |
2253 | ISD = ISD::CTPOP; |
2254 | // In case of legalization use TCC_Expensive. This is cheaper than a |
2255 | // library call but still not a cheap instruction. |
2256 | SingleCallCost = TargetTransformInfo::TCC_Expensive; |
2257 | break; |
2258 | case Intrinsic::ctlz: |
2259 | ISD = ISD::CTLZ; |
2260 | break; |
2261 | case Intrinsic::cttz: |
2262 | ISD = ISD::CTTZ; |
2263 | break; |
2264 | case Intrinsic::bswap: |
2265 | ISD = ISD::BSWAP; |
2266 | break; |
2267 | case Intrinsic::bitreverse: |
2268 | ISD = ISD::BITREVERSE; |
2269 | break; |
2270 | } |
2271 | |
2272 | const TargetLoweringBase *TLI = getTLI(); |
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2274 | |
    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2276 | if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && |
2277 | TLI->isFAbsFree(LT.second)) { |
2278 | return 0; |
2279 | } |
2280 | |
2281 | // The operation is legal. Assume it costs 1. |
2282 | // If the type is split to multiple registers, assume that there is some |
2283 | // overhead to this. |
2284 | // TODO: Once we have extract/insert subvector cost we need to use them. |
2285 | if (LT.first > 1) |
2286 | return (LT.first * 2); |
2287 | else |
2288 | return (LT.first * 1); |
    } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2290 | // If the operation is custom lowered then assume |
2291 | // that the code is twice as expensive. |
2292 | return (LT.first * 2); |
2293 | } |
2294 | |
2295 | // If we can't lower fmuladd into an FMA estimate the cost as a floating |
2296 | // point mul followed by an add. |
2297 | if (IID == Intrinsic::fmuladd) |
2298 | return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, |
2299 | CostKind) + |
2300 | thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, |
2301 | CostKind); |
2302 | if (IID == Intrinsic::experimental_constrained_fmuladd) { |
2303 | IntrinsicCostAttributes FMulAttrs( |
2304 | Intrinsic::experimental_constrained_fmul, RetTy, Tys); |
2305 | IntrinsicCostAttributes FAddAttrs( |
2306 | Intrinsic::experimental_constrained_fadd, RetTy, Tys); |
2307 | return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + |
2308 | thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); |
2309 | } |
2310 | |
2311 | // Else, assume that we need to scalarize this intrinsic. For math builtins |
2312 | // this will emit a costly libcall, adding call overhead and spills. Make it |
2313 | // very expensive. |
    if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2315 | // Scalable vectors cannot be scalarized, so return Invalid. |
      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
            return isa<ScalableVectorType>(Ty);
2318 | })) |
2319 | return InstructionCost::getInvalid(); |
2320 | |
2321 | InstructionCost ScalarizationCost = |
2322 | SkipScalarizationCost |
2323 | ? ScalarizationCostPassed |
2324 | : getScalarizationOverhead(RetVTy, /*Insert*/ true, |
2325 | /*Extract*/ false, CostKind); |
2326 | |
      unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2328 | SmallVector<Type *, 4> ScalarTys; |
2329 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
2330 | Type *Ty = Tys[i]; |
2331 | if (Ty->isVectorTy()) |
2332 | Ty = Ty->getScalarType(); |
        ScalarTys.push_back(Ty);
2334 | } |
2335 | IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); |
2336 | InstructionCost ScalarCost = |
2337 | thisT()->getIntrinsicInstrCost(Attrs, CostKind); |
2338 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { |
        if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
2340 | if (!ICA.skipScalarizationCost()) |
2341 | ScalarizationCost += getScalarizationOverhead( |
2342 | VTy, /*Insert*/ false, /*Extract*/ true, CostKind); |
          ScalarCalls = std::max(ScalarCalls,
                                 cast<FixedVectorType>(VTy)->getNumElements());
2345 | } |
2346 | } |
2347 | return ScalarCalls * ScalarCost + ScalarizationCost; |
2348 | } |
2349 | |
2350 | // This is going to be turned into a library call, make it expensive. |
2351 | return SingleCallCost; |
2352 | } |
2353 | |
  /// Compute the cost of the given call instruction.
2355 | /// |
2356 | /// Compute the cost of calling function F with return type RetTy and |
2357 | /// argument types Tys. F might be nullptr, in this case the cost of an |
2358 | /// arbitrary call with the specified signature will be returned. |
2359 | /// This is used, for instance, when we estimate call of a vector |
2360 | /// counterpart of the given function. |
2361 | /// \param F Called function, might be nullptr. |
2362 | /// \param RetTy Return value types. |
2363 | /// \param Tys Argument types. |
2364 | /// \returns The cost of Call instruction. |
2365 | InstructionCost getCallInstrCost(Function *F, Type *RetTy, |
2366 | ArrayRef<Type *> Tys, |
2367 | TTI::TargetCostKind CostKind) { |
2368 | return 10; |
2369 | } |
2370 | |
2371 | unsigned getNumberOfParts(Type *Tp) { |
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2373 | return LT.first.isValid() ? *LT.first.getValue() : 0; |
2374 | } |
2375 | |
2376 | InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, |
2377 | const SCEV *) { |
2378 | return 0; |
2379 | } |
2380 | |
2381 | /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics. |
  /// We're assuming that reduction operations are performed the following way:
2383 | /// |
2384 | /// %val1 = shufflevector<n x t> %val, <n x t> %undef, |
2385 | /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef> |
2386 | /// \----------------v-------------/ \----------v------------/ |
2387 | /// n/2 elements n/2 elements |
2388 | /// %red1 = op <n x t> %val, <n x t> val1 |
2389 | /// After this operation we have a vector %red1 where only the first n/2 |
2390 | /// elements are meaningful, the second n/2 elements are undefined and can be |
2391 | /// dropped. All other operations are actually working with the vector of |
2392 | /// length n/2, not n, though the real vector length is still n. |
2393 | /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, |
2394 | /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef> |
2395 | /// \----------------v-------------/ \----------v------------/ |
2396 | /// n/4 elements 3*n/4 elements |
2397 | /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of |
2398 | /// length n/2, the resulting vector has length n/4 etc. |
2399 | /// |
2400 | /// The cost model should take into account that the actual length of the |
2401 | /// vector is reduced on each iteration. |
2402 | InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, |
2403 | TTI::TargetCostKind CostKind) { |
2404 | // Targets must implement a default value for the scalable case, since |
2405 | // we don't know how many lanes the vector has. |
    if (isa<ScalableVectorType>(Ty))
2407 | return InstructionCost::getInvalid(); |
2408 | |
2409 | Type *ScalarTy = Ty->getElementType(); |
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2411 | if ((Opcode == Instruction::Or || Opcode == Instruction::And) && |
        ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2413 | NumVecElts >= 2) { |
2414 | // Or reduction for i1 is represented as: |
2415 | // %val = bitcast <ReduxWidth x i1> to iReduxWidth |
2416 | // %res = cmp ne iReduxWidth %val, 0 |
2417 | // And reduction for i1 is represented as: |
2418 | // %val = bitcast <ReduxWidth x i1> to iReduxWidth |
2419 | // %res = cmp eq iReduxWidth %val, 11111 |
      Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2421 | return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, |
2422 | TTI::CastContextHint::None, CostKind) + |
2423 | thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, |
                                         CmpInst::makeCmpResultType(ValTy),
2425 | CmpInst::BAD_ICMP_PREDICATE, CostKind); |
2426 | } |
    unsigned NumReduxLevels = Log2_32(NumVecElts);
2428 | InstructionCost ArithCost = 0; |
2429 | InstructionCost ShuffleCost = 0; |
2430 | std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); |
2431 | unsigned LongVectorCount = 0; |
2432 | unsigned MVTLen = |
2433 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; |
2434 | while (NumVecElts > MVTLen) { |
2435 | NumVecElts /= 2; |
      VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2437 | ShuffleCost += |
2438 | thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, |
2439 | CostKind, NumVecElts, SubTy); |
2440 | ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); |
2441 | Ty = SubTy; |
2442 | ++LongVectorCount; |
2443 | } |
2444 | |
2445 | NumReduxLevels -= LongVectorCount; |
2446 | |
2447 | // The minimal length of the vector is limited by the real length of vector |
2448 | // operations performed on the current platform. That's why several final |
2449 | // reduction operations are performed on the vectors with the same |
2450 | // architecture-dependent length. |
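    // E.g. (illustrative) reducing <16 x float> when v4f32 is the widest
    // legal type: two halving steps (16 -> 8 -> 4) are costed in the loop
    // above, and the remaining log2(4) = 2 levels are costed below as
    // single-source permutes plus arithmetic on the legal type.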
2451 | |
2452 | // By default reductions need one shuffle per reduction level. |
2453 | ShuffleCost += |
2454 | NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, |
2455 | std::nullopt, CostKind, 0, Ty); |
2456 | ArithCost += |
2457 | NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); |
2458 | return ShuffleCost + ArithCost + |
2459 | thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, |
2460 | CostKind, 0, nullptr, nullptr); |
2461 | } |
2462 | |
2463 | /// Try to calculate the cost of performing strict (in-order) reductions, |
2464 | /// which involves doing a sequence of floating point additions in lane |
2465 | /// order, starting with an initial value. For example, consider a scalar |
2466 | /// initial value 'InitVal' of type float and a vector of type <4 x float>: |
2467 | /// |
2468 | /// Vector = <float %v0, float %v1, float %v2, float %v3> |
2469 | /// |
2470 | /// %add1 = %InitVal + %v0 |
2471 | /// %add2 = %add1 + %v1 |
2472 | /// %add3 = %add2 + %v2 |
2473 | /// %add4 = %add3 + %v3 |
2474 | /// |
2475 | /// As a simple estimate we can say the cost of such a reduction is 4 times |
2476 | /// the cost of a scalar FP addition. We can only estimate the costs for |
2477 | /// fixed-width vectors here because for scalable vectors we do not know the |
2478 | /// runtime number of operations. |
2479 | InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, |
2480 | TTI::TargetCostKind CostKind) { |
2481 | // Targets must implement a default value for the scalable case, since |
2482 | // we don't know how many lanes the vector has. |
    if (isa<ScalableVectorType>(Ty))
2484 | return InstructionCost::getInvalid(); |
2485 | |
    auto *VTy = cast<FixedVectorType>(Ty);
    InstructionCost ExtractCost = getScalarizationOverhead(
        VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2489 | InstructionCost ArithCost = thisT()->getArithmeticInstrCost( |
2490 | Opcode, VTy->getElementType(), CostKind); |
2491 | ArithCost *= VTy->getNumElements(); |
2492 | |
2493 | return ExtractCost + ArithCost; |
2494 | } |
2495 | |
2496 | InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, |
2497 | std::optional<FastMathFlags> FMF, |
2498 | TTI::TargetCostKind CostKind) { |
    assert(Ty && "Unknown reduction vector type");
2500 | if (TTI::requiresOrderedReduction(FMF)) |
2501 | return getOrderedReductionCost(Opcode, Ty, CostKind); |
2502 | return getTreeReductionCost(Opcode, Ty, CostKind); |
2503 | } |
2504 | |
2505 | /// Try to calculate op costs for min/max reduction operations. |
2506 | /// \param CondTy Conditional type for the Select instruction. |
2507 | InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
2508 | FastMathFlags FMF, |
2509 | TTI::TargetCostKind CostKind) { |
2510 | // Targets must implement a default value for the scalable case, since |
2511 | // we don't know how many lanes the vector has. |
    if (isa<ScalableVectorType>(Ty))
2513 | return InstructionCost::getInvalid(); |
2514 | |
2515 | Type *ScalarTy = Ty->getElementType(); |
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
2518 | InstructionCost MinMaxCost = 0; |
2519 | InstructionCost ShuffleCost = 0; |
2520 | std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); |
2521 | unsigned LongVectorCount = 0; |
2522 | unsigned MVTLen = |
2523 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; |
2524 | while (NumVecElts > MVTLen) { |
2525 | NumVecElts /= 2; |
      auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2527 | |
2528 | ShuffleCost += |
2529 | thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, |
2530 | CostKind, NumVecElts, SubTy); |
2531 | |
2532 | IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF); |
      MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2534 | Ty = SubTy; |
2535 | ++LongVectorCount; |
2536 | } |
2537 | |
2538 | NumReduxLevels -= LongVectorCount; |
2539 | |
2540 | // The minimal length of the vector is limited by the real length of vector |
2541 | // operations performed on the current platform. That's why several final |
    // reduction operations are performed on the vectors with the same
2543 | // architecture-dependent length. |
2544 | ShuffleCost += |
2545 | NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, |
2546 | std::nullopt, CostKind, 0, Ty); |
2547 | IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF); |
    MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2549 | // The last min/max should be in vector registers and we counted it above. |
2550 | // So just need a single extractelement. |
2551 | return ShuffleCost + MinMaxCost + |
2552 | thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, |
2553 | CostKind, 0, nullptr, nullptr); |
2554 | } |
2555 | |
2556 | InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, |
2557 | Type *ResTy, VectorType *Ty, |
2558 | FastMathFlags FMF, |
2559 | TTI::TargetCostKind CostKind) { |
2560 | // Without any native support, this is equivalent to the cost of |
2561 | // vecreduce.opcode(ext(Ty A)). |
    VectorType *ExtTy = VectorType::get(ResTy, Ty);
2563 | InstructionCost RedCost = |
2564 | thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind); |
2565 | InstructionCost ExtCost = thisT()->getCastInstrCost( |
2566 | IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, |
2567 | TTI::CastContextHint::None, CostKind); |
2568 | |
2569 | return RedCost + ExtCost; |
2570 | } |
2571 | |
2572 | InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, |
2573 | VectorType *Ty, |
2574 | TTI::TargetCostKind CostKind) { |
2575 | // Without any native support, this is equivalent to the cost of |
2576 | // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or |
2577 | // vecreduce.add(mul(A, B)). |
    VectorType *ExtTy = VectorType::get(ResTy, Ty);
2579 | InstructionCost RedCost = thisT()->getArithmeticReductionCost( |
2580 | Instruction::Add, ExtTy, std::nullopt, CostKind); |
2581 | InstructionCost ExtCost = thisT()->getCastInstrCost( |
2582 | IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, |
2583 | TTI::CastContextHint::None, CostKind); |
2584 | |
2585 | InstructionCost MulCost = |
2586 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); |
2587 | |
2588 | return RedCost + MulCost + 2 * ExtCost; |
2589 | } |
2590 | |
2591 | InstructionCost getVectorSplitCost() { return 1; } |
2592 | |
2593 | /// @} |
2594 | }; |
2595 | |
2596 | /// Concrete BasicTTIImpl that can be used if no further customization |
2597 | /// is needed. |
2598 | class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { |
2599 | using BaseT = BasicTTIImplBase<BasicTTIImpl>; |
2600 | |
2601 | friend class BasicTTIImplBase<BasicTTIImpl>; |
2602 | |
2603 | const TargetSubtargetInfo *ST; |
2604 | const TargetLoweringBase *TLI; |
2605 | |
2606 | const TargetSubtargetInfo *getST() const { return ST; } |
2607 | const TargetLoweringBase *getTLI() const { return TLI; } |
2608 | |
2609 | public: |
2610 | explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); |
2611 | }; |
2612 | |
2613 | } // end namespace llvm |
2614 | |
2615 | #endif // LLVM_CODEGEN_BASICTTIIMPL_H |
2616 | |