//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
#include <cmath>
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);

InstructionCost
RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
                                      TTI::TargetCostKind CostKind) {
  // Check if the type is valid for all CostKinds.
  if (!VT.isVector())
    return InstructionCost::getInvalid();
  size_t NumInstr = OpCodes.size();
  if (CostKind == TTI::TCK_CodeSize)
    return NumInstr;
  InstructionCost LMULCost = TLI->getLMULCost(VT);
  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
    return LMULCost * NumInstr;
  InstructionCost Cost = 0;
  for (auto Op : OpCodes) {
    switch (Op) {
    case RISCV::VRGATHER_VI:
      Cost += TLI->getVRGatherVICost(VT);
      break;
    case RISCV::VRGATHER_VV:
      Cost += TLI->getVRGatherVVCost(VT);
      break;
    case RISCV::VSLIDEUP_VI:
    case RISCV::VSLIDEDOWN_VI:
      Cost += TLI->getVSlideVICost(VT);
      break;
    case RISCV::VSLIDEUP_VX:
    case RISCV::VSLIDEDOWN_VX:
      Cost += TLI->getVSlideVXCost(VT);
      break;
    case RISCV::VREDMAX_VS:
    case RISCV::VREDMIN_VS:
    case RISCV::VREDMAXU_VS:
    case RISCV::VREDMINU_VS:
    case RISCV::VREDSUM_VS:
    case RISCV::VREDAND_VS:
    case RISCV::VREDOR_VS:
    case RISCV::VREDXOR_VS:
    case RISCV::VFREDMAX_VS:
    case RISCV::VFREDMIN_VS:
    case RISCV::VFREDUSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += Log2_32_Ceil(VL);
      break;
    }
    case RISCV::VFREDOSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += VL;
      break;
    }
    case RISCV::VMV_X_S:
    case RISCV::VMV_S_X:
    case RISCV::VFMV_F_S:
    case RISCV::VFMV_S_F:
    case RISCV::VMOR_MM:
    case RISCV::VMXOR_MM:
    case RISCV::VMAND_MM:
    case RISCV::VMANDN_MM:
    case RISCV::VMNAND_MM:
    case RISCV::VCPOP_M:
      Cost += 1;
      break;
    default:
      Cost += LMULCost;
    }
  }
  return Cost;
}

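// For illustration (assuming getVScaleForTuning() returns 2): a reduction
// opcode such as VREDSUM_VS over nxv4i32 uses an estimated VL of 4 * 2 = 8,
// so it contributes Log2_32_Ceil(8) = 3 to the throughput/latency cost,
// whereas the ordered VFREDOSUM_VS contributes the full VL of 8.
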
InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
}

// Look for patterns of shift followed by AND that can be turned into a pair of
// shifts. We won't need to materialize an immediate for the AND so these can
// be considered free.
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
  uint64_t Mask = Imm.getZExtValue();
  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
  if (!BO || !BO->hasOneUse())
    return false;

  if (BO->getOpcode() != Instruction::Shl)
    return false;

  if (!isa<ConstantInt>(BO->getOperand(1)))
    return false;

  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
  // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
  // is a mask shifted by c2 bits with c3 leading zeros.
  if (isShiftedMask_64(Mask)) {
    unsigned Trailing = llvm::countr_zero(Mask);
    if (ShAmt == Trailing)
      return true;
  }

  return false;
}

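// For illustration, a hypothetical IR pattern that canUseShiftPair accepts:
//   %s = shl i64 %x, 4
//   %a = and i64 %s, 4080   ; 4080 = 0xFF0, a shifted mask whose 4 trailing
//                           ; zeros match the shift amount
// On RV64 this selects to slli+srli, so the 0xFF0 immediate is treated as
// free by getIntImmCostInst below.
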
InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative, in others the immediate comes from a specific argument index.
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::Store:
    // If the address is a constant, use the materialization cost.
    if (Idx == 1)
      return getIntImmCost(Imm, Ty, CostKind);
    return TTI::TCC_Free;
  case Instruction::Load:
    // If the address is a constant, use the materialization cost.
    return getIntImmCost(Imm, Ty, CostKind);
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    // bclri
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    // bseti/binvi
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
    if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // One more or less than a power of 2 can use SLLI+ADD/SUB.
    if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getSignificantBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

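// For illustration: isLegalAddImmediate accepts simm12 values, so an
// `add i64 %x, 2047` keeps its immediate free here, while `add i64 %x, 2048`
// falls through to the RISCVMatInt-based materialisation cost above
// (assuming no other special case applies).
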
InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
  return ST->hasVInstructions();
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
             ? TTI::PSK_FastHardware
             : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
  // Add a cost of address generation + the cost of the load. The address
  // is expected to be a PC relative offset to a constant pool entry
  // using auipc/addi.
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}

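// For illustration, with the default riscv-v-register-bit-width-lmul of 2 and
// a subtarget whose minimum VLEN is 128 (e.g. +zvl128b): RGK_FixedWidthVector
// reports 2 * 128 = 256 fixed bits (when RVV is used for fixed-length
// vectors), and RGK_ScalableVector reports 2 * RVVBitsPerBlock = 128 scalable
// bits.
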
static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) &&
         "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args,
                                             const Instruction *CxtI) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // First, handle cases where having a fixed length vector enables us to
  // give a more accurate cost than falling back to generic scalable codegen.
  // TODO: Each of these cases hints at a modeling gap around scalable vectors.
  if (isa<FixedVectorType>(Tp)) {
    switch (Kind) {
    default:
      break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        MVT EltTp = LT.second.getVectorElementType();
        // If the size of the element is < ELEN then shuffles of interleaves
        // and deinterleaves of 2 vectors can be lowered into the following
        // sequences.
        if (EltTp.getScalarSizeInBits() < ST->getELen()) {
          // Example sequence:
          // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
          // vwaddu.vv v10, v8, v9
          // li a0, -1 (ignored)
          // vwmaccu.vx v10, a0, v9
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * TLI->getLMULCost(LT.second);

          if (Mask[0] == 0 || Mask[0] == 1) {
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
            // Example sequence:
            // vnsrl.wi v10, v8, 0
            if (equal(DeinterleaveMask, Mask))
              return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
                                                        LT.second, CostKind);
          }
        }
      }
      // vrgather + cost of generating the mask constant.
      // We model this for an unknown mask with a single vrgather.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        VectorType *IdxTy =
            getVRGatherIndexType(LT.second, *ST, Tp->getContext());
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        return IndexCost +
               getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
      }
      [[fallthrough]];
    }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
      // register for the second vrgather. We model this for an unknown
      // (shuffle) mask.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        auto &C = Tp->getContext();
        auto EC = Tp->getElementCount();
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
        return 2 * IndexCost +
               getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
                                       LT.second, CostKind) +
               MaskCost;
      }
      [[fallthrough]];
    }
    case TTI::SK_Select: {
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. Providing an accurate cost only for splits
      // where the element type remains the same.
      if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
          LT.second.isFixedLengthVector() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              Tp->getElementType()->getPrimitiveSizeInBits() &&
          LT.second.getVectorNumElements() <
              cast<FixedVectorType>(Tp)->getNumElements() &&
          divideCeil(Mask.size(),
                     cast<FixedVectorType>(Tp)->getNumElements()) ==
              static_cast<unsigned>(*LT.first.getValue())) {
        unsigned NumRegs = *LT.first.getValue();
        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
        unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);

        InstructionCost Cost = 0;
        for (unsigned I = 0; I < NumRegs; ++I) {
          bool IsSingleVector = true;
          SmallVector<int> SubMask(SubVF, PoisonMaskElem);
          transform(Mask.slice(I * SubVF,
                               I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
                    SubMask.begin(), [&](int I) {
                      bool SingleSubVector = I / VF == 0;
                      IsSingleVector &= SingleSubVector;
                      return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
                    });
          Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                 SubVecTy, SubMask, CostKind, 0, nullptr);
        }
        return Cost;
      }
      break;
    }
    }
  }

  // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
  switch (Kind) {
  default:
    // Fallthrough to generic handling.
    // TODO: Most of these cases will return getInvalid in generic code, and
    // must be implemented here.
    break;
  case TTI::SK_ExtractSubvector:
    // Extract at zero is always a subregister extract.
    if (Index == 0)
      return TTI::TCC_Free;

    // If we're extracting a subvector of at most m1 size at a sub-register
    // boundary - which unfortunately we need exact vlen to identify - this is
    // a subregister extract at worst and thus won't require a vslidedown.
    // TODO: Extend for aligned m2, m4 subvector extracts
    // TODO: Extend for misaligned (but contained) extracts
    // TODO: Extend for scalable subvector types
    if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
        SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
      const unsigned MinVLen = ST->getRealMinVLen();
      const unsigned MaxVLen = ST->getRealMaxVLen();
      if (MinVLen == MaxVLen &&
          SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
          SubLT.second.getSizeInBits() <= MinVLen)
        return TTI::TCC_Free;
    }

    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslidedown.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
  case TTI::SK_InsertSubvector:
    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslideup.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
  case TTI::SK_Select: {
    // Example sequence:
    // li a0, 90
    // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
    // vmv.s.x v0, a0
    // vmerge.vvm v8, v9, v8, v0
    // We use 2 for the cost of the mask materialization as this is the true
    // cost for small masks and most shuffles are small. At worst, this cost
    // should be a very small constant for the constant pool load. As such,
    // we may bias towards large selects slightly more than truly warranted.
    return LT.first *
           (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
                                        LT.second, CostKind));
  }
  case TTI::SK_Broadcast: {
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // Example sequence:
        // andi a0, a0, 1
        // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        // vmv.v.x v8, a0
        // vmsne.vi v0, v8, 0
        return LT.first *
               (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                            LT.second, CostKind));
      }
      // Example sequence:
      // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, 1, v0
      // vmv.x.s a0, v8
      // andi a0, a0, 1
      // vmv.v.x v8, a0
      // vmsne.vi v0, v8, 0
      return LT.first *
             (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
                                           RISCV::VMV_X_S, RISCV::VMV_V_X,
                                           RISCV::VMSNE_VI},
                                          LT.second, CostKind));
    }

    if (HasScalar) {
      // Example sequence:
      // vmv.v.x v8, a0
      return LT.first *
             getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
    }

    // Example sequence:
    // vrgather.vi v9, v8, 0
    return LT.first *
           getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
  }
  case TTI::SK_Splice: {
    // vslidedown+vslideup.
    // TODO: Multiplying by LT.first implies this legalizes into multiple copies
    // of similar code, but I think we expand through memory.
    unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
    if (Index >= 0 && Index < 32)
      Opcodes[0] = RISCV::VSLIDEDOWN_VI;
    else if (Index < 0 && Index > -32)
      Opcodes[1] = RISCV::VSLIDEUP_VI;
    return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }
  case TTI::SK_Reverse: {
    // TODO: Cases to improve here:
    // * Illegal vector types
    // * i64 on RV32
    // * i1 vector
    // At low LMUL, most of the cost is producing the vrgather index register.
    // At high LMUL, the cost of the vrgather itself will dominate.
    // Example sequence:
    // csrr a0, vlenb
    // srli a0, a0, 3
    // addi a0, a0, -1
    // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
    // vid.v v9
    // vrsub.vx v10, v9, a0
    // vrgather.vv v9, v8, v10
    InstructionCost LenCost = 3;
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices.
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
    unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
    if (LT.second.isFixedLengthVector() &&
        isInt<5>(LT.second.getVectorNumElements() - 1))
      Opcodes[1] = RISCV::VRSUB_VI;
    InstructionCost GatherCost =
        getRISCVInstructionCost(Opcodes, LT.second, CostKind);
    // A mask vector additionally requires an extend and a truncate.
    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
    return LT.first * (LenCost + GatherCost + ExtendCost);
  }
  }
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

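// For illustration: extracting the low half of a fixed vector
// (SK_ExtractSubvector at Index 0) is free above, extracting at a non-zero
// index without exact-VLEN knowledge is modelled as one vslidedown.vi per
// legalized register, and a reverse shuffle is modelled as index-vector
// setup (vid.v/vrsub) plus a vrgather.vv whose cost grows with LMUL.
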
InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();
  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load or store followed by a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases we can treat it as if it's just one (legal)
  // memory op.
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
    // Need to make sure the type hasn't been scalarized.
    if (LT.second.isFixedLengthVector()) {
      auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
                                             LT.second.getVectorNumElements());
      // FIXME: We use the memory op cost of the *legalized* type here, because
      // getMemoryOpCost returns a really expensive cost for types like
      // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
      // Should the memory op cost of these be cheaper?
      if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
                                            AddressSpace, DL)) {
        InstructionCost LegalMemCost = getMemoryOpCost(
            Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
        return LT.first + LegalMemCost;
      }
    }
  }

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of a vrgather.
  // An interleaved store for 3 vectors of 4 lanes will look like
  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
  // store <12 x i32> %interleaved.vec, ptr %10, align 4
  if (Factor != 2)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store.
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
  return MemCost + ShuffleCost;
}

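// For illustration, a Factor=2 interleaved store of two <4 x i32> values
// written as one <8 x i32> store is costed above as the <8 x i32> memory op
// plus a single interleave shuffle; when the type and factor are legal for
// vsseg and no masking is required, the whole access is instead costed as
// roughly one legal memory op.
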
InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
       !isLegalStridedLoadStore(DataTy, Alignment)) ||
      (Opcode != Instruction::Load && Opcode != Instruction::Store))
    return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if (CostKind == TTI::TCK_CodeSize)
    return TTI::TCC_Basic;

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

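// For illustration: a gather from <8 x i32> is costed as 8 scalar loads,
// since the cost is the per-element memory op cost times the (estimated)
// element count; for scalable types the element count comes from
// getEstimatedVLFor, which scales the minimum element count by the tuning
// vscale.
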
// Currently, these represent both throughput and codesize costs
// for the respective intrinsics. The costs in this table are simply
// instruction counts with the following adjustments made:
// * One vsetvli is considered free.
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::f32, 9},
    {Intrinsic::floor, MVT::f64, 9},
    {Intrinsic::ceil, MVT::f32, 9},
    {Intrinsic::ceil, MVT::f64, 9},
    {Intrinsic::trunc, MVT::f32, 7},
    {Intrinsic::trunc, MVT::f64, 7},
    {Intrinsic::round, MVT::f32, 9},
    {Intrinsic::round, MVT::f64, 9},
    {Intrinsic::roundeven, MVT::f32, 9},
    {Intrinsic::roundeven, MVT::f64, 9},
    {Intrinsic::rint, MVT::f32, 7},
    {Intrinsic::rint, MVT::f64, 7},
    {Intrinsic::lrint, MVT::i32, 1},
    {Intrinsic::lrint, MVT::i64, 1},
    {Intrinsic::llrint, MVT::i64, 1},
    {Intrinsic::nearbyint, MVT::f32, 9},
    {Intrinsic::nearbyint, MVT::f64, 9},
    {Intrinsic::bswap, MVT::i16, 3},
    {Intrinsic::bswap, MVT::i32, 12},
    {Intrinsic::bswap, MVT::i64, 31},
    {Intrinsic::vp_bswap, MVT::i16, 3},
    {Intrinsic::vp_bswap, MVT::i32, 12},
    {Intrinsic::vp_bswap, MVT::i64, 31},
    {Intrinsic::vp_fshl, MVT::i8, 7},
    {Intrinsic::vp_fshl, MVT::i16, 7},
    {Intrinsic::vp_fshl, MVT::i32, 7},
    {Intrinsic::vp_fshl, MVT::i64, 7},
    {Intrinsic::vp_fshr, MVT::i8, 7},
    {Intrinsic::vp_fshr, MVT::i16, 7},
    {Intrinsic::vp_fshr, MVT::i32, 7},
    {Intrinsic::vp_fshr, MVT::i64, 7},
    {Intrinsic::bitreverse, MVT::i8, 17},
    {Intrinsic::bitreverse, MVT::i16, 24},
    {Intrinsic::bitreverse, MVT::i32, 33},
    {Intrinsic::bitreverse, MVT::i64, 52},
    {Intrinsic::vp_bitreverse, MVT::i8, 17},
    {Intrinsic::vp_bitreverse, MVT::i16, 24},
    {Intrinsic::vp_bitreverse, MVT::i32, 33},
    {Intrinsic::vp_bitreverse, MVT::i64, 52},
    {Intrinsic::ctpop, MVT::i8, 12},
    {Intrinsic::ctpop, MVT::i16, 19},
    {Intrinsic::ctpop, MVT::i32, 20},
    {Intrinsic::ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctpop, MVT::i8, 12},
    {Intrinsic::vp_ctpop, MVT::i16, 19},
    {Intrinsic::vp_ctpop, MVT::i32, 20},
    {Intrinsic::vp_ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctlz, MVT::i8, 19},
    {Intrinsic::vp_ctlz, MVT::i16, 28},
    {Intrinsic::vp_ctlz, MVT::i32, 31},
    {Intrinsic::vp_ctlz, MVT::i64, 35},
    {Intrinsic::vp_cttz, MVT::i8, 16},
    {Intrinsic::vp_cttz, MVT::i16, 23},
    {Intrinsic::vp_cttz, MVT::i32, 24},
    {Intrinsic::vp_cttz, MVT::i64, 25},
};

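// For illustration: a vector @llvm.bswap on an i32 element type resolves to
// the {bswap, MVT::i32, 12} entry above, so a legal <4 x i32> bswap is costed
// at 12 (times LT.first for types that split).
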
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
      return LT.first;

    if (ST->hasVInstructions() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::umin:
        Op = RISCV::VMINU_VV;
        break;
      case Intrinsic::umax:
        Op = RISCV::VMAXU_VV;
        break;
      case Intrinsic::smin:
        Op = RISCV::VMIN_VV;
        break;
      case Intrinsic::smax:
        Op = RISCV::VMAX_VV;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first * 2;
    }
    break;
  }
  case Intrinsic::get_active_lane_mask: {
    if (ST->hasVInstructions()) {
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
      auto LT = getTypeLegalizationCost(ExpRetTy);

      // vid.v v8 // considered hoisted
      // vsaddu.vx v8, v8, a0
      // vmsltu.vx v0, v8, a1
      return LT.first *
             getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
                                     LT.second, CostKind);
    }
    break;
  }
  // TODO: add more intrinsics
  case Intrinsic::experimental_stepvector: {
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal types involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (ST->hasVInstructions())
      return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
             (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
    return 1 + (LT.first - 1);
  }
  case Intrinsic::vp_rint: {
    // The RISC-V target uses at least 5 instructions to lower rounding
    // intrinsics.
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more read and one more write of fflags than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with a static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    if (auto LT = getTypeLegalizationCost(RetTy); LT.second.isVector()) {
      MVT EltTy = LT.second.getVectorElementType();
      if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                              ICA.getID(), EltTy))
        return LT.first * Entry->Cost;
    }
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
  if (!IsVectorType)
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  bool IsTypeLegal = isTypeLegal(Src) && isTypeLegal(Dst) &&
                     (Src->getScalarSizeInBits() <= ST->getELen()) &&
                     (Dst->getScalarSizeInBits() <= ST->getELen());

  // FIXME: Need to compute legalizing cost for illegal types.
  if (!IsTypeLegal)
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
  std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                (int)Log2_32(Src->getScalarSizeInBits());
  switch (ISD) {
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND: {
    const unsigned SrcEltSize = Src->getScalarSizeInBits();
    if (SrcEltSize == 1) {
      // We do not use vsext/vzext to extend from a mask vector.
      // Instead we use the following instructions to extend from a mask vector:
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, -1, v0
      return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
                                     DstLT.second, CostKind);
    }
    if ((PowDiff < 1) || (PowDiff > 3))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
    unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
    unsigned Op =
        (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
    return getRISCVInstructionCost(Op, DstLT.second, CostKind);
  }
  case ISD::TRUNCATE:
    if (Dst->getScalarSizeInBits() == 1) {
      // We do not use several vncvt instructions to truncate to a mask vector,
      // so we cannot use PowDiff to calculate the cost.
      // Instead we use the following instructions to truncate to a mask vector:
      // vand.vi v8, v8, 1
      // vmsne.vi v0, v8, 0
      return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
                                     SrcLT.second, CostKind);
    }
    [[fallthrough]];
  case ISD::FP_EXTEND:
  case ISD::FP_ROUND: {
    // Counts of narrow/widen instructions.
    unsigned SrcEltSize = Src->getScalarSizeInBits();
    unsigned DstEltSize = Dst->getScalarSizeInBits();

    unsigned Op = (ISD == ISD::TRUNCATE)    ? RISCV::VNSRL_WI
                  : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
                                            : RISCV::VFNCVT_F_F_W;
    InstructionCost Cost = 0;
    for (; SrcEltSize != DstEltSize;) {
      MVT ElementMVT = (ISD == ISD::TRUNCATE)
                           ? MVT::getIntegerVT(DstEltSize)
                           : MVT::getFloatingPointVT(DstEltSize);
      MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
      DstEltSize =
          (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
      Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
    }
    return Cost;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
      // The cost of converting from or to a mask vector is different from the
      // other cases, so we cannot use PowDiff to calculate it.
      // For mask vector to fp, we should use the following instructions:
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, -1, v0
      // vfcvt.f.x.v v8, v8

      // And for fp vector to mask, we use:
      // vfncvt.rtz.x.f.w v9, v8
      // vand.vi v8, v9, 1
      // vmsne.vi v0, v8, 0
      return 3;
    }
    if (std::abs(PowDiff) <= 1)
      return 1;
    // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
    // so it only needs two conversions.
    if (Src->isIntOrIntVectorTy())
      return 2;
    // Counts of narrow/widen instructions.
    return std::abs(PowDiff);
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

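// For illustration: a zero-extend from <vscale x 2 x i8> to <vscale x 2 x i32>
// has PowDiff = 2 and is costed above as a single VZEXT_VF4, while an extend
// from a mask (i1) vector is costed as the vmv.v.i + vmerge.vim pair.
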
unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1)) {
    // SelectionDAGBuilder does the following transforms:
    // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
    // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
    if (IID == Intrinsic::umax || IID == Intrinsic::smin)
      return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
    else
      return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
  }

  if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
    SmallVector<unsigned, 3> Opcodes;
    InstructionCost ExtraCost = 0;
    switch (IID) {
    case Intrinsic::maximum:
      if (FMF.noNaNs()) {
        Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
      } else {
        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
                   RISCV::VFMV_F_S};
        // Cost of the canonical NaN + branch:
        // lui a0, 523264
        // fmv.w.x fa0, a0
        Type *DstTy = Ty->getScalarType();
        const unsigned EltTyBits = DstTy->getScalarSizeInBits();
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
      }
      break;

    case Intrinsic::minimum:
      if (FMF.noNaNs()) {
        Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
      } else {
        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
                   RISCV::VFMV_F_S};
        // Cost of the canonical NaN + branch:
        // lui a0, 523264
        // fmv.w.x fa0, a0
        Type *DstTy = Ty->getScalarType();
        const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
      }
      break;
    }
    return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }

  // An IR reduction is composed of two vmvs and one RVV reduction instruction.
  unsigned SplitOp;
  SmallVector<unsigned, 3> Opcodes;
  switch (IID) {
  default:
    llvm_unreachable("Unsupported intrinsic");
  case Intrinsic::smax:
    SplitOp = RISCV::VMAX_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::smin:
    SplitOp = RISCV::VMIN_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::umax:
    SplitOp = RISCV::VMAXU_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::umin:
    SplitOp = RISCV::VMINU_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::maxnum:
    SplitOp = RISCV::VFMAX_VV;
    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
    break;
  case Intrinsic::minnum:
    SplitOp = RISCV::VFMIN_VV;
    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
    break;
  }
  // Add a cost for data larger than LMUL8.
  InstructionCost SplitCost =
      (LT.first > 1) ? (LT.first - 1) *
                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

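// For illustration: an llvm.vector.reduce.umax over nxv4i32 is costed above
// as VMV_S_X + VREDMAXU_VS + VMV_X_S, where the reduction itself contributes
// roughly Log2_32_Ceil(VL) and each scalar<->vector move contributes 1.
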
InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  SmallVector<unsigned, 3> Opcodes;
  Type *ElementTy = Ty->getElementType();
  if (ElementTy->isIntegerTy(1)) {
    if (ISD == ISD::AND) {
      // Example sequence:
      // vsetvli a0, zero, e8, mf8, ta, ma
      // vmnot.m v8, v0
      // vcpop.m a0, v8
      // seqz a0, a0
      Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
      return (LT.first - 1) +
             getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_EQ, CostKind);
    } else {
      // Example sequence:
      // vsetvli a0, zero, e8, mf8, ta, ma
      // vcpop.m a0, v0
      // snez a0, a0
      Opcodes = {RISCV::VCPOP_M};
      return (LT.first - 1) +
             getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_NE, CostKind);
    }
  }

  // An IR reduction is composed of two vmvs and one RVV reduction instruction.
  if (TTI::requiresOrderedReduction(FMF)) {
    Opcodes.push_back(RISCV::VFMV_S_F);
    for (unsigned i = 0; i < LT.first.getValue(); i++)
      Opcodes.push_back(RISCV::VFREDOSUM_VS);
    Opcodes.push_back(RISCV::VFMV_F_S);
    return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }
  unsigned SplitOp;
  switch (ISD) {
  case ISD::ADD:
    SplitOp = RISCV::VADD_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
    break;
  case ISD::OR:
    SplitOp = RISCV::VOR_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::XOR:
    SplitOp = RISCV::VXOR_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::AND:
    SplitOp = RISCV::VAND_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
    break;
  case ISD::FADD:
    SplitOp = RISCV::VFADD_VV;
    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
    break;
  }
  // Add a cost for data larger than LMUL8.
  InstructionCost SplitCost =
      (LT.first > 1) ? (LT.first - 1) *
                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

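// For illustration: an ordered (non-reassociative) float add reduction is
// costed above as VFMV_S_F + one VFREDOSUM_VS per legalized register +
// VFMV_F_S, and VFREDOSUM_VS itself is modelled as linear in VL, so ordered
// reductions come out substantially more expensive than unordered ones.
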
InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if the scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");
  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.x.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
    return 1;

  return getConstantPoolLoadCost(Ty, CostKind);
}

InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Src, true);
  // Type legalization can't handle structs.
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);
  InstructionCost BaseCost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                             CostKind, OpInfo, I);
  // Assume memory op costs scale with the number of vector registers
  // possibly accessed by the instruction. Note that BasicTTI already
  // handles the LT.first term for us.
  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
      LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}

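// For illustration (assuming VLEN=128, so <16 x i32> legalizes to an LMUL=4
// type): the base cost of a <16 x i32> load or store computed by BasicTTI is
// scaled above by roughly the LMUL factor of 4, reflecting the number of
// vector registers the access touches.
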
InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return LT.first *
               getRISCVInstructionCost(
                   {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                   LT.second, CostKind);
      }
      // vselect and max/min are supported natively.
      return LT.first *
             getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
      return LT.first *
                 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                         InterimVT, CostKind) +
             LT.first *
                 getRISCVInstructionCost(
                     {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                     LT.second, CostKind);
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * getRISCVInstructionCost(
                          {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
                          LT.second, CostKind);
  }

  if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
      CmpInst::isIntPredicate(VecPred)) {
    // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
    // provided they incur the same cost across all implementations.
    return LT.first *
           getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind);
  }

  if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
      CmpInst::isFPPredicate(VecPred)) {

    // Use VMXOR_MM and VMXNOR_MM to generate all true/false masks.
    if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
      return getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);

    // If we do not support the input floating point vector type, use the base
    // one which will calculate as:
    // ScalarizeCost + Num * Cost for fixed vector,
    // InvalidCost for scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);

    // Assuming vector fp compare and mask instructions are all the same cost
    // until a need arises to differentiate them.
    switch (VecPred) {
    case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
    case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
    case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
    case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
      return LT.first * getRISCVInstructionCost(
                            {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
                            LT.second, CostKind);

    case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
    case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
      return LT.first *
             getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
                                     LT.second, CostKind);

    case CmpInst::FCMP_OEQ: // vmfeq.vv
    case CmpInst::FCMP_OGT: // vmflt.vv
    case CmpInst::FCMP_OGE: // vmfle.vv
    case CmpInst::FCMP_OLT: // vmflt.vv
    case CmpInst::FCMP_OLE: // vmfle.vv
    case CmpInst::FCMP_UNE: // vmfne.vv
      return LT.first *
             getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
    default:
      break;
    }
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

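// For illustration: an ordered fcmp such as FCMP_OLE on a legal FP vector is
// costed above as a single mask-producing compare (modelled via VMFLT_VV),
// while unordered predicates such as FCMP_UNO need two compares plus a mask
// op, and FCMP_ULE needs a compare plus a mask negation.
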
InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

1453InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1454 TTI::TargetCostKind CostKind,
1455 unsigned Index, Value *Op0,
1456 Value *Op1) {
1457 assert(Val->isVectorTy() && "This must be a vector type");
1458
1459 if (Opcode != Instruction::ExtractElement &&
1460 Opcode != Instruction::InsertElement)
1461 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1462
1463 // Legalize the type.
1464 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);
1465
1466 // This type is legalized to a scalar type.
1467 if (!LT.second.isVector()) {
1468 auto *FixedVecTy = cast<FixedVectorType>(Val);
1469 // If Index is a known constant, cost is zero.
1470 if (Index != -1U)
1471 return 0;
1472 // Extract/InsertElement with non-constant index is very costly when
1473 // scalarized; estimate cost of loads/stores sequence via the stack:
1474 // ExtractElement cost: store vector to stack, load scalar;
1475 // InsertElement cost: store vector to stack, store scalar, load vector.
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For unsupported scalable vector.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Mask vector extract/insert is expanded via e8.
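  // E.g. extracting a bit from a <vscale x 8 x i1> mask is costed as a
  // zero-extension to <vscale x 8 x i8> plus an e8 element extract; an
  // insert additionally pays for truncating the widened vector back to i1.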
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost =
          getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost =
          getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost =
        getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost =
        getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost =
        getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
  // vector and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // When inserting an element, we also need an addi to compute Index + 1 as
  // an input of the vslideup.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We can extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }
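  // E.g. accessing lane 0 needs only the vmv (SlideCost == 0), a non-zero
  // constant lane adds a single vslidedown.vi/vslideup.vi, and an unknown
  // lane of an insertelement pays for both the addi and the vslideup.vx.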

  // Extracting an i64 element on a target with XLEN=32 needs more
  // instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not counted)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not counted)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not counted)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5 bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };
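  // E.g. a uniform constant operand that fits a 5-bit immediate folds into
  // the .vi form (or is splatted from a GPR) and is treated as free above,
  // whereas a non-uniform constant vector is assumed to be loaded from the
  // constant pool.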

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

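  // For the single-instruction ops handled below, the model is roughly
  // ConstantMatCost + LT.first * LMULCost: e.g. an add on a type that
  // legalizes to one LMUL=2 register group costs one instruction scaled by
  // the LMUL=2 throughput factor from getLMULCost.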
  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FNEG: {
    return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
  }
  default:
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }
}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we take into account GEP instructions only (although
  // alloca instructions, values, constants and/or constant expressions, PHIs,
  // bitcasts ... whatever is allowed to be used as a pointer may also appear
  // here). Typically, if Base is not a GEP instruction and all the pointers
  // are relative to the same base address, the rest are either GEP
  // instructions, PHIs, bitcasts or constants. When we have the same base, we
  // just calculate the cost of each non-Base GEP as an ADD operation if any
  // of its indices is non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as a sum of the costs of the GEP instructions.
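  // E.g. for a same-base chain of pointers p, p+4, p+8 feeding i32 accesses,
  // GEPs with all-constant indices are skipped as free below, and only GEPs
  // with a non-constant index are charged as an ADD (unless the offset can be
  // folded into the addressing mode).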
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
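      // E.g. a unit-stride chain of i32 accesses has byte offsets 0, 4, 8, ...
      // and RISC-V loads/stores can fold such small constant offsets into
      // their 12-bit immediate, so those GEPs are also treated as free.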
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     std::nullopt);
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics with changes as needed
  //       would apply to all settings below to enable performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, not dependent upon the
  // conditions below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow at most one exit other than the latch. This acts as an early
  // exit as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
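  // For instance, a two-block loop whose body is only a handful of cheap
  // scalar operations will typically fall under the threshold above and be
  // force-unrolled, avoiding the taken-branch cost of the backedge.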
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }
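  // E.g. a <vscale x 4 x i32> value has a known minimum size of 128 bits, so
  // the scalable case above reports 128 / RVVBitsPerBlock (64) = 2 vector
  // register blocks, matching an LMUL=2 register group.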

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth. This is the
  // same routine as used in LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
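  // E.g. with a 128-bit minimum VLEN and an effective fixed-width register
  // size of 256 bits (VLEN scaled by the LMUL used for getRegisterBitWidth
  // queries), 32-bit elements give a maximum SLP VF of 8.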
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If no vector registers, or absurd element widths, disable
  // vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific choice here is to give instruction count first
  // priority.
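  // E.g. a candidate needing 3 instructions and 5 registers is preferred over
  // one needing 4 instructions and 3 registers, because Insns is compared
  // first in the lexicographic ordering below.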
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
  auto *VTy = dyn_cast<VectorType>(DataTy);
  if (!VTy || VTy->isScalableTy())
    return false;

  if (!isLegalMaskedLoadStore(DataTy, Alignment))
    return false;
  return true;
}

bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
                                       const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
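  // E.g. a callee built with a subset of the caller's extensions can be
  // inlined, but a callee that requires an extension the caller lacks cannot.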
  return (CallerBits & CalleeBits) == CalleeBits;
}
