//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
#include <cmath>
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);

InstructionCost
RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
                                      TTI::TargetCostKind CostKind) {
  // Check if the type is valid for all CostKinds.
  if (!VT.isVector())
    return InstructionCost::getInvalid();
  size_t NumInstr = OpCodes.size();
  if (CostKind == TTI::TCK_CodeSize)
    return NumInstr;
  InstructionCost LMULCost = TLI->getLMULCost(VT);
  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
    return LMULCost * NumInstr;
  InstructionCost Cost = 0;
  for (auto Op : OpCodes) {
    switch (Op) {
    case RISCV::VRGATHER_VI:
      Cost += TLI->getVRGatherVICost(VT);
      break;
    case RISCV::VRGATHER_VV:
      Cost += TLI->getVRGatherVVCost(VT);
      break;
    case RISCV::VSLIDEUP_VI:
    case RISCV::VSLIDEDOWN_VI:
      Cost += TLI->getVSlideVICost(VT);
      break;
    case RISCV::VSLIDEUP_VX:
    case RISCV::VSLIDEDOWN_VX:
      Cost += TLI->getVSlideVXCost(VT);
      break;
    case RISCV::VREDMAX_VS:
    case RISCV::VREDMIN_VS:
    case RISCV::VREDMAXU_VS:
    case RISCV::VREDMINU_VS:
    case RISCV::VREDSUM_VS:
    case RISCV::VREDAND_VS:
    case RISCV::VREDOR_VS:
    case RISCV::VREDXOR_VS:
    case RISCV::VFREDMAX_VS:
    case RISCV::VFREDMIN_VS:
    case RISCV::VFREDUSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += Log2_32_Ceil(VL);
      break;
    }
    case RISCV::VFREDOSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += VL;
      break;
    }
    case RISCV::VMV_X_S:
    case RISCV::VMV_S_X:
    case RISCV::VFMV_F_S:
    case RISCV::VFMV_S_F:
    case RISCV::VMOR_MM:
    case RISCV::VMXOR_MM:
    case RISCV::VMAND_MM:
    case RISCV::VMANDN_MM:
    case RISCV::VMNAND_MM:
    case RISCV::VCPOP_M:
      Cost += 1;
      break;
    default:
      Cost += LMULCost;
    }
  }
  return Cost;
}

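// For illustration (assuming getVScaleForTuning() returns 2): a reduction
// opcode such as VREDSUM_VS over nxv4i32 uses an estimated VL of 4 * 2 = 8,
// so it contributes Log2_32_Ceil(8) = 3 to the throughput/latency cost,
// whereas the ordered VFREDOSUM_VS contributes the full VL of 8.
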
InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
}

// Look for patterns of shift followed by AND that can be turned into a pair of
// shifts. We won't need to materialize an immediate for the AND so these can
// be considered free.
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
  uint64_t Mask = Imm.getZExtValue();
  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
  if (!BO || !BO->hasOneUse())
    return false;

  if (BO->getOpcode() != Instruction::Shl)
    return false;

  if (!isa<ConstantInt>(BO->getOperand(1)))
    return false;

  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
  // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
  // is a mask shifted by c2 bits with c3 leading zeros.
  if (isShiftedMask_64(Mask)) {
    unsigned Trailing = llvm::countr_zero(Mask);
    if (ShAmt == Trailing)
      return true;
  }

  return false;
}

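// For illustration, a hypothetical IR pattern that canUseShiftPair accepts:
//   %s = shl i64 %x, 4
//   %a = and i64 %s, 4080   ; 4080 = 0xFF0, a shifted mask whose 4 trailing
//                           ; zeros match the shift amount
// On RV64 this selects to slli+srli, so the 0xFF0 immediate is treated as
// free by getIntImmCostInst below.
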
InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative, in others the immediate comes from a specific argument index.
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::Store:
    // If the address is a constant, use the materialization cost.
    if (Idx == 1)
      return getIntImmCost(Imm, Ty, CostKind);
    return TTI::TCC_Free;
  case Instruction::Load:
    // If the address is a constant, use the materialization cost.
    return getIntImmCost(Imm, Ty, CostKind);
  case Instruction::And:
    // zext.h
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    // zext.w
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    // bclri
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    // bseti/binvi
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
    if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // One more or less than a power of 2 can use SLLI+ADD/SUB.
    if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getSignificantBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

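// For illustration: isLegalAddImmediate accepts simm12 values, so an
// `add i64 %x, 2047` keeps its immediate free here, while `add i64 %x, 2048`
// falls through to the RISCVMatInt-based materialisation cost above
// (assuming no other special case applies).
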
InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
  return ST->hasVInstructions();
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
             ? TTI::PSK_FastHardware
             : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
  // Add a cost of address generation + the cost of the load. The address
  // is expected to be a PC relative offset to a constant pool entry
  // using auipc/addi.
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}

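// For illustration, with the default riscv-v-register-bit-width-lmul of 2 and
// a subtarget whose minimum VLEN is 128 (e.g. +zvl128b): RGK_FixedWidthVector
// reports 2 * 128 = 256 fixed bits (when RVV is used for fixed-length
// vectors), and RGK_ScalableVector reports 2 * RVVBitsPerBlock = 128 scalable
// bits.
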
static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) &&
         "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args,
                                             const Instruction *CxtI) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // First, handle cases where having a fixed length vector enables us to
  // give a more accurate cost than falling back to generic scalable codegen.
  // TODO: Each of these cases hints at a modeling gap around scalable vectors.
  if (isa<FixedVectorType>(Tp)) {
    switch (Kind) {
    default:
      break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        MVT EltTp = LT.second.getVectorElementType();
        // If the size of the element is < ELEN then shuffles of interleaves
        // and deinterleaves of 2 vectors can be lowered into the following
        // sequences.
        if (EltTp.getScalarSizeInBits() < ST->getELen()) {
          // Example sequence:
          // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
          // vwaddu.vv v10, v8, v9
          // li a0, -1 (ignored)
          // vwmaccu.vx v10, a0, v9
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * TLI->getLMULCost(LT.second);

          if (Mask[0] == 0 || Mask[0] == 1) {
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
            // Example sequence:
            // vnsrl.wi v10, v8, 0
            if (equal(DeinterleaveMask, Mask))
              return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
                                                        LT.second, CostKind);
          }
        }
      }
      // vrgather + cost of generating the mask constant.
      // We model this for an unknown mask with a single vrgather.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        VectorType *IdxTy =
            getVRGatherIndexType(LT.second, *ST, Tp->getContext());
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        return IndexCost +
               getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
      }
      [[fallthrough]];
    }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
      // register for the second vrgather. We model this for an unknown
      // (shuffle) mask.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        auto &C = Tp->getContext();
        auto EC = Tp->getElementCount();
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
        return 2 * IndexCost +
               getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
                                       LT.second, CostKind) +
               MaskCost;
      }
      [[fallthrough]];
    }
    case TTI::SK_Select: {
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. Providing an accurate cost only for splits
      // where the element type remains the same.
      if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
          LT.second.isFixedLengthVector() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              Tp->getElementType()->getPrimitiveSizeInBits() &&
          LT.second.getVectorNumElements() <
              cast<FixedVectorType>(Tp)->getNumElements() &&
          divideCeil(Mask.size(),
                     cast<FixedVectorType>(Tp)->getNumElements()) ==
              static_cast<unsigned>(*LT.first.getValue())) {
        unsigned NumRegs = *LT.first.getValue();
        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
        unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);

        InstructionCost Cost = 0;
        for (unsigned I = 0; I < NumRegs; ++I) {
          bool IsSingleVector = true;
          SmallVector<int> SubMask(SubVF, PoisonMaskElem);
          transform(Mask.slice(I * SubVF,
                               I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
                    SubMask.begin(), [&](int I) {
                      bool SingleSubVector = I / VF == 0;
                      IsSingleVector &= SingleSubVector;
                      return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
                    });
          Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                 SubVecTy, SubMask, CostKind, 0, nullptr);
        }
        return Cost;
      }
      break;
    }
    }
  }

  // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
  switch (Kind) {
  default:
    // Fallthrough to generic handling.
    // TODO: Most of these cases will return getInvalid in generic code, and
    // must be implemented here.
    break;
  case TTI::SK_ExtractSubvector:
    // Extract at zero is always a subregister extract.
    if (Index == 0)
      return TTI::TCC_Free;

    // If we're extracting a subvector of at most m1 size at a sub-register
    // boundary - which unfortunately we need exact vlen to identify - this is
    // a subregister extract at worst and thus won't require a vslidedown.
    // TODO: Extend for aligned m2, m4 subvector extracts
    // TODO: Extend for misaligned (but contained) extracts
    // TODO: Extend for scalable subvector types
    if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
        SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
      const unsigned MinVLen = ST->getRealMinVLen();
      const unsigned MaxVLen = ST->getRealMaxVLen();
      if (MinVLen == MaxVLen &&
          SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
          SubLT.second.getSizeInBits() <= MinVLen)
        return TTI::TCC_Free;
    }

    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslidedown.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
  case TTI::SK_InsertSubvector:
    // Example sequence:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslideup.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
  case TTI::SK_Select: {
    // Example sequence:
    // li a0, 90
    // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
    // vmv.s.x v0, a0
    // vmerge.vvm v8, v9, v8, v0
    // We use 2 for the cost of the mask materialization as this is the true
    // cost for small masks and most shuffles are small. At worst, this cost
    // should be a very small constant for the constant pool load. As such,
    // we may bias towards large selects slightly more than truly warranted.
    return LT.first *
           (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
                                        LT.second, CostKind));
  }
  case TTI::SK_Broadcast: {
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // Example sequence:
        // andi a0, a0, 1
        // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        // vmv.v.x v8, a0
        // vmsne.vi v0, v8, 0
        return LT.first *
               (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                            LT.second, CostKind));
      }
      // Example sequence:
      // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, 1, v0
      // vmv.x.s a0, v8
      // andi a0, a0, 1
      // vmv.v.x v8, a0
      // vmsne.vi v0, v8, 0
      return LT.first *
             (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
                                           RISCV::VMV_X_S, RISCV::VMV_V_X,
                                           RISCV::VMSNE_VI},
                                          LT.second, CostKind));
    }

    if (HasScalar) {
      // Example sequence:
      // vmv.v.x v8, a0
      return LT.first *
             getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
    }

    // Example sequence:
    // vrgather.vi v9, v8, 0
    return LT.first *
           getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
  }
  case TTI::SK_Splice: {
    // vslidedown+vslideup.
    // TODO: Multiplying by LT.first implies this legalizes into multiple copies
    // of similar code, but I think we expand through memory.
    unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
    if (Index >= 0 && Index < 32)
      Opcodes[0] = RISCV::VSLIDEDOWN_VI;
    else if (Index < 0 && Index > -32)
      Opcodes[1] = RISCV::VSLIDEUP_VI;
    return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }
  case TTI::SK_Reverse: {
    // TODO: Cases to improve here:
    // * Illegal vector types
    // * i64 on RV32
    // * i1 vector
    // At low LMUL, most of the cost is producing the vrgather index register.
    // At high LMUL, the cost of the vrgather itself will dominate.
    // Example sequence:
    // csrr a0, vlenb
    // srli a0, a0, 3
    // addi a0, a0, -1
    // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
    // vid.v v9
    // vrsub.vx v10, v9, a0
    // vrgather.vv v9, v8, v10
    InstructionCost LenCost = 3;
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices.
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
    unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
    if (LT.second.isFixedLengthVector() &&
        isInt<5>(LT.second.getVectorNumElements() - 1))
      Opcodes[1] = RISCV::VRSUB_VI;
    InstructionCost GatherCost =
        getRISCVInstructionCost(Opcodes, LT.second, CostKind);
    // A mask vector additionally requires an extend and a truncate.
    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
    return LT.first * (LenCost + GatherCost + ExtendCost);
  }
  }
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

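// For illustration: extracting the low half of a fixed vector
// (SK_ExtractSubvector at Index 0) is free above, extracting at a non-zero
// index without exact-VLEN knowledge is modelled as one vslidedown.vi per
// legalized register, and a reverse shuffle is modelled as index-vector
// setup (vid.v/vrsub) plus a vrgather.vv whose cost grows with LMUL.
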
InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();
  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load or store followed by a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases we can treat it as if it's just one (legal)
  // memory op.
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
    // Need to make sure the type hasn't been scalarized.
    if (LT.second.isFixedLengthVector()) {
      auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
                                             LT.second.getVectorNumElements());
      // FIXME: We use the memory op cost of the *legalized* type here, because
      // getMemoryOpCost returns a really expensive cost for types like
      // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
      // Should the memory op cost of these be cheaper?
      if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
                                            AddressSpace, DL)) {
        InstructionCost LegalMemCost = getMemoryOpCost(
            Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
        return LT.first + LegalMemCost;
      }
    }
  }

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of a vrgather.
  // An interleaved store for 3 vectors of 4 lanes will look like
  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
  // store <12 x i32> %interleaved.vec, ptr %10, align 4
  if (Factor != 2)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store.
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
  return MemCost + ShuffleCost;
}

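// For illustration, a Factor=2 interleaved store of two <4 x i32> values
// written as one <8 x i32> store is costed above as the <8 x i32> memory op
// plus a single interleave shuffle; when the type and factor are legal for
// vsseg and no masking is required, the whole access is instead costed as
// roughly one legal memory op.
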
InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
       !isLegalStridedLoadStore(DataTy, Alignment)) ||
      (Opcode != Instruction::Load && Opcode != Instruction::Store))
    return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if (CostKind == TTI::TCK_CodeSize)
    return TTI::TCC_Basic;

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

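// For illustration: a gather from <8 x i32> is costed as 8 scalar loads,
// since the cost is the per-element memory op cost times the (estimated)
// element count; for scalable types the element count comes from
// getEstimatedVLFor, which scales the minimum element count by the tuning
// vscale.
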
// Currently, these represent both throughput and codesize costs
// for the respective intrinsics. The costs in this table are simply
// instruction counts with the following adjustments made:
// * One vsetvli is considered free.
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::f32, 9},
    {Intrinsic::floor, MVT::f64, 9},
    {Intrinsic::ceil, MVT::f32, 9},
    {Intrinsic::ceil, MVT::f64, 9},
    {Intrinsic::trunc, MVT::f32, 7},
    {Intrinsic::trunc, MVT::f64, 7},
    {Intrinsic::round, MVT::f32, 9},
    {Intrinsic::round, MVT::f64, 9},
    {Intrinsic::roundeven, MVT::f32, 9},
    {Intrinsic::roundeven, MVT::f64, 9},
    {Intrinsic::rint, MVT::f32, 7},
    {Intrinsic::rint, MVT::f64, 7},
    {Intrinsic::lrint, MVT::i32, 1},
    {Intrinsic::lrint, MVT::i64, 1},
    {Intrinsic::llrint, MVT::i64, 1},
    {Intrinsic::nearbyint, MVT::f32, 9},
    {Intrinsic::nearbyint, MVT::f64, 9},
    {Intrinsic::bswap, MVT::i16, 3},
    {Intrinsic::bswap, MVT::i32, 12},
    {Intrinsic::bswap, MVT::i64, 31},
    {Intrinsic::vp_bswap, MVT::i16, 3},
    {Intrinsic::vp_bswap, MVT::i32, 12},
    {Intrinsic::vp_bswap, MVT::i64, 31},
    {Intrinsic::vp_fshl, MVT::i8, 7},
    {Intrinsic::vp_fshl, MVT::i16, 7},
    {Intrinsic::vp_fshl, MVT::i32, 7},
    {Intrinsic::vp_fshl, MVT::i64, 7},
    {Intrinsic::vp_fshr, MVT::i8, 7},
    {Intrinsic::vp_fshr, MVT::i16, 7},
    {Intrinsic::vp_fshr, MVT::i32, 7},
    {Intrinsic::vp_fshr, MVT::i64, 7},
    {Intrinsic::bitreverse, MVT::i8, 17},
    {Intrinsic::bitreverse, MVT::i16, 24},
    {Intrinsic::bitreverse, MVT::i32, 33},
    {Intrinsic::bitreverse, MVT::i64, 52},
    {Intrinsic::vp_bitreverse, MVT::i8, 17},
    {Intrinsic::vp_bitreverse, MVT::i16, 24},
    {Intrinsic::vp_bitreverse, MVT::i32, 33},
    {Intrinsic::vp_bitreverse, MVT::i64, 52},
    {Intrinsic::ctpop, MVT::i8, 12},
    {Intrinsic::ctpop, MVT::i16, 19},
    {Intrinsic::ctpop, MVT::i32, 20},
    {Intrinsic::ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctpop, MVT::i8, 12},
    {Intrinsic::vp_ctpop, MVT::i16, 19},
    {Intrinsic::vp_ctpop, MVT::i32, 20},
    {Intrinsic::vp_ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctlz, MVT::i8, 19},
    {Intrinsic::vp_ctlz, MVT::i16, 28},
    {Intrinsic::vp_ctlz, MVT::i32, 31},
    {Intrinsic::vp_ctlz, MVT::i64, 35},
    {Intrinsic::vp_cttz, MVT::i8, 16},
    {Intrinsic::vp_cttz, MVT::i16, 23},
    {Intrinsic::vp_cttz, MVT::i32, 24},
    {Intrinsic::vp_cttz, MVT::i64, 25},
};

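// For illustration: a vector @llvm.bswap on an i32 element type resolves to
// the {bswap, MVT::i32, 12} entry above, so a legal <4 x i32> bswap is costed
// at 12 (times LT.first for types that split).
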
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
      return LT.first;

    if (ST->hasVInstructions() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::umin:
        Op = RISCV::VMINU_VV;
        break;
      case Intrinsic::umax:
        Op = RISCV::VMAXU_VV;
        break;
      case Intrinsic::smin:
        Op = RISCV::VMIN_VV;
        break;
      case Intrinsic::smax:
        Op = RISCV::VMAX_VV;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first * 2;
    }
    break;
  }
  case Intrinsic::get_active_lane_mask: {
    if (ST->hasVInstructions()) {
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
      auto LT = getTypeLegalizationCost(ExpRetTy);

      // vid.v v8 // considered hoisted
      // vsaddu.vx v8, v8, a0
      // vmsltu.vx v0, v8, a1
      return LT.first *
             getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
                                     LT.second, CostKind);
    }
    break;
  }
  // TODO: add more intrinsics
  case Intrinsic::experimental_stepvector: {
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal types involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (ST->hasVInstructions())
      return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
             (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
    return 1 + (LT.first - 1);
  }
  case Intrinsic::vp_rint: {
    // The RISC-V target uses at least 5 instructions to lower rounding
    // intrinsics.
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more read and one more write of fflags than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with a static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    if (auto LT = getTypeLegalizationCost(RetTy); LT.second.isVector()) {
      MVT EltTy = LT.second.getVectorElementType();
      if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                              ICA.getID(), EltTy))
        return LT.first * Entry->Cost;
    }
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
  if (!IsVectorType)
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  bool IsTypeLegal = isTypeLegal(Src) && isTypeLegal(Dst) &&
                     (Src->getScalarSizeInBits() <= ST->getELen()) &&
                     (Dst->getScalarSizeInBits() <= ST->getELen());

  // FIXME: Need to compute legalizing cost for illegal types.
  if (!IsTypeLegal)
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
  std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                (int)Log2_32(Src->getScalarSizeInBits());
  switch (ISD) {
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND: {
    const unsigned SrcEltSize = Src->getScalarSizeInBits();
    if (SrcEltSize == 1) {
      // We do not use vsext/vzext to extend from a mask vector.
      // Instead we use the following instructions to extend from a mask vector:
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, -1, v0
      return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
                                     DstLT.second, CostKind);
    }
    if ((PowDiff < 1) || (PowDiff > 3))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
    unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
    unsigned Op =
        (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
    return getRISCVInstructionCost(Op, DstLT.second, CostKind);
  }
  case ISD::TRUNCATE:
    if (Dst->getScalarSizeInBits() == 1) {
      // We do not use several vncvt instructions to truncate to a mask vector,
      // so we cannot use PowDiff to calculate the cost.
      // Instead we use the following instructions to truncate to a mask vector:
      // vand.vi v8, v8, 1
      // vmsne.vi v0, v8, 0
      return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
                                     SrcLT.second, CostKind);
    }
    [[fallthrough]];
  case ISD::FP_EXTEND:
  case ISD::FP_ROUND: {
    // Counts of narrow/widen instructions.
    unsigned SrcEltSize = Src->getScalarSizeInBits();
    unsigned DstEltSize = Dst->getScalarSizeInBits();

    unsigned Op = (ISD == ISD::TRUNCATE)    ? RISCV::VNSRL_WI
                  : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
                                            : RISCV::VFNCVT_F_F_W;
    InstructionCost Cost = 0;
    for (; SrcEltSize != DstEltSize;) {
      MVT ElementMVT = (ISD == ISD::TRUNCATE)
                           ? MVT::getIntegerVT(DstEltSize)
                           : MVT::getFloatingPointVT(DstEltSize);
      MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
      DstEltSize =
          (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
      Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
    }
    return Cost;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
      // The cost of converting from or to a mask vector is different from the
      // other cases, so we cannot use PowDiff to calculate it.
      // For mask vector to fp, we should use the following instructions:
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, -1, v0
      // vfcvt.f.x.v v8, v8

      // And for fp vector to mask, we use:
      // vfncvt.rtz.x.f.w v9, v8
      // vand.vi v8, v9, 1
      // vmsne.vi v0, v8, 0
      return 3;
    }
    if (std::abs(PowDiff) <= 1)
      return 1;
    // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
    // so it only needs two conversions.
    if (Src->isIntOrIntVectorTy())
      return 2;
    // Counts of narrow/widen instructions.
    return std::abs(PowDiff);
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

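// For illustration: a zero-extend from <vscale x 2 x i8> to <vscale x 2 x i32>
// has PowDiff = 2 and is costed above as a single VZEXT_VF4, while an extend
// from a mask (i1) vector is costed as the vmv.v.i + vmerge.vim pair.
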
unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1)) {
    // SelectionDAGBuilder does the following transforms:
    // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
    // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
    if (IID == Intrinsic::umax || IID == Intrinsic::smin)
      return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
    else
      return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
  }

  if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
    SmallVector<unsigned, 3> Opcodes;
    InstructionCost ExtraCost = 0;
    switch (IID) {
    case Intrinsic::maximum:
      if (FMF.noNaNs()) {
        Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
      } else {
        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
                   RISCV::VFMV_F_S};
        // Cost of the canonical NaN + branch:
        // lui a0, 523264
        // fmv.w.x fa0, a0
        Type *DstTy = Ty->getScalarType();
        const unsigned EltTyBits = DstTy->getScalarSizeInBits();
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
      }
      break;

    case Intrinsic::minimum:
      if (FMF.noNaNs()) {
        Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
      } else {
        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
                   RISCV::VFMV_F_S};
        // Cost of the canonical NaN + branch:
        // lui a0, 523264
        // fmv.w.x fa0, a0
        Type *DstTy = Ty->getScalarType();
        const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
      }
      break;
    }
    return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }

  // An IR reduction is composed of two vmvs and one RVV reduction instruction.
  unsigned SplitOp;
  SmallVector<unsigned, 3> Opcodes;
  switch (IID) {
  default:
    llvm_unreachable("Unsupported intrinsic");
  case Intrinsic::smax:
    SplitOp = RISCV::VMAX_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::smin:
    SplitOp = RISCV::VMIN_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::umax:
    SplitOp = RISCV::VMAXU_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::umin:
    SplitOp = RISCV::VMINU_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::maxnum:
    SplitOp = RISCV::VFMAX_VV;
    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
    break;
  case Intrinsic::minnum:
    SplitOp = RISCV::VFMIN_VV;
    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
    break;
  }
  // Add a cost for data larger than LMUL8.
  InstructionCost SplitCost =
      (LT.first > 1) ? (LT.first - 1) *
                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

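// For illustration: an llvm.vector.reduce.umax over nxv4i32 is costed above
// as VMV_S_X + VREDMAXU_VS + VMV_X_S, where the reduction itself contributes
// roughly Log2_32_Ceil(VL) and each scalar<->vector move contributes 1.
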
InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  SmallVector<unsigned, 3> Opcodes;
  Type *ElementTy = Ty->getElementType();
  if (ElementTy->isIntegerTy(1)) {
    if (ISD == ISD::AND) {
      // Example sequence:
      // vsetvli a0, zero, e8, mf8, ta, ma
      // vmnot.m v8, v0
      // vcpop.m a0, v8
      // seqz a0, a0
      Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
      return (LT.first - 1) +
             getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_EQ, CostKind);
    } else {
      // Example sequence:
      // vsetvli a0, zero, e8, mf8, ta, ma
      // vcpop.m a0, v0
      // snez a0, a0
      Opcodes = {RISCV::VCPOP_M};
      return (LT.first - 1) +
             getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_NE, CostKind);
    }
  }

  // An IR reduction is composed of two vmvs and one RVV reduction instruction.
  if (TTI::requiresOrderedReduction(FMF)) {
    Opcodes.push_back(RISCV::VFMV_S_F);
    for (unsigned i = 0; i < LT.first.getValue(); i++)
      Opcodes.push_back(RISCV::VFREDOSUM_VS);
    Opcodes.push_back(RISCV::VFMV_F_S);
    return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }
  unsigned SplitOp;
  switch (ISD) {
  case ISD::ADD:
    SplitOp = RISCV::VADD_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
    break;
  case ISD::OR:
    SplitOp = RISCV::VOR_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::XOR:
    SplitOp = RISCV::VXOR_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::AND:
    SplitOp = RISCV::VAND_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
    break;
  case ISD::FADD:
    SplitOp = RISCV::VFADD_VV;
    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
    break;
  }
  // Add a cost for data larger than LMUL8.
  InstructionCost SplitCost =
      (LT.first > 1) ? (LT.first - 1) *
                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

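// For illustration: an ordered (non-reassociative) float add reduction is
// costed above as VFMV_S_F + one VFREDOSUM_VS per legalized register +
// VFMV_F_S, and VFREDOSUM_VS itself is modelled as linear in VL, so ordered
// reductions come out substantially more expensive than unordered ones.
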
InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if the scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");
  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.x.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
    return 1;

  return getConstantPoolLoadCost(Ty, CostKind);
}

InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Src, true);
  // Type legalization can't handle structs.
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);
  InstructionCost BaseCost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                             CostKind, OpInfo, I);
  // Assume memory op costs scale with the number of vector registers
  // possibly accessed by the instruction. Note that BasicTTI already
  // handles the LT.first term for us.
  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
      LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}

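// For illustration (assuming VLEN=128, so <16 x i32> legalizes to an LMUL=4
// type): the base cost of a <16 x i32> load or store computed by BasicTTI is
// scaled above by roughly the LMUL factor of 4, reflecting the number of
// vector registers the access touches.
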
InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return LT.first *
               getRISCVInstructionCost(
                   {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                   LT.second, CostKind);
      }
      // vselect and max/min are supported natively.
      return LT.first *
             getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
      return LT.first *
                 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                         InterimVT, CostKind) +
             LT.first *
                 getRISCVInstructionCost(
                     {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                     LT.second, CostKind);
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * getRISCVInstructionCost(
                          {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
                          LT.second, CostKind);
  }

  if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
      CmpInst::isIntPredicate(VecPred)) {
    // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
    // provided they incur the same cost across all implementations.
    return LT.first *
           getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind);
  }

  if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
      CmpInst::isFPPredicate(VecPred)) {

    // Use VMXOR_MM and VMXNOR_MM to generate all true/false masks.
    if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
      return getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);

    // If we do not support the input floating point vector type, use the base
    // one which will calculate as:
    // ScalarizeCost + Num * Cost for fixed vector,
    // InvalidCost for scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);

    // Assuming vector fp compare and mask instructions are all the same cost
    // until a need arises to differentiate them.
    switch (VecPred) {
    case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
    case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
    case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
    case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
      return LT.first * getRISCVInstructionCost(
                            {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
                            LT.second, CostKind);

    case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
    case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
      return LT.first *
             getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
                                     LT.second, CostKind);

    case CmpInst::FCMP_OEQ: // vmfeq.vv
    case CmpInst::FCMP_OGT: // vmflt.vv
    case CmpInst::FCMP_OGE: // vmfle.vv
    case CmpInst::FCMP_OLT: // vmflt.vv
    case CmpInst::FCMP_OLE: // vmfle.vv
    case CmpInst::FCMP_UNE: // vmfne.vv
      return LT.first *
             getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
    default:
      break;
    }
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

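// For illustration: an ordered fcmp such as FCMP_OLE on a legal FP vector is
// costed above as a single mask-producing compare (modelled via VMFLT_VV),
// while unordered predicates such as FCMP_UNO need two compares plus a mask
// op, and FCMP_ULE needs a compare plus a mask negation.
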
InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

1453InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1454 TTI::TargetCostKind CostKind,
1455 unsigned Index, Value *Op0,
1456 Value *Op1) {
1457 assert(Val->isVectorTy() && "This must be a vector type");
1458
1459 if (Opcode != Instruction::ExtractElement &&
1460 Opcode != Instruction::InsertElement)
1461 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1462
1463 // Legalize the type.
1464 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);
1465
1466 // This type is legalized to a scalar type.
1467 if (!LT.second.isVector()) {
1468 auto *FixedVecTy = cast<FixedVectorType>(Val);
1469 // If Index is a known constant, cost is zero.
1470 if (Index != -1U)
1471 return 0;
1472 // Extract/InsertElement with non-constant index is very costly when
1473 // scalarized; estimate cost of loads/stores sequence via the stack:
1474 // ExtractElement cost: store vector to stack, load scalar;
1475 // InsertElement cost: store vector to stack, store scalar, load vector.
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For unsupported scalable vector.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Mask vector extract/insert is expanded via e8.
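  // E.g. extracting a bit from a <vscale x 8 x i1> mask is costed as a
  // zero-extension to <vscale x 8 x i8> plus an e8 element extract; an
  // insert additionally pays for truncating the widened vector back to i1.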
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost =
          getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost =
          getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost =
        getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost =
        getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost =
        getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
  // vector and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // When inserting an element, we also need an addi to compute Index + 1 as
  // an input of the vslideup.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We can extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }
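  // E.g. accessing lane 0 needs only the vmv (SlideCost == 0), a non-zero
  // constant lane adds a single vslidedown.vi/vslideup.vi, and an unknown
  // lane of an insertelement pays for both the addi and the vslideup.vx.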

  // Extracting an i64 element on a target with XLEN=32 needs more
  // instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not counted)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not counted)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not counted)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5 bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };
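  // E.g. a uniform constant operand that fits a 5-bit immediate folds into
  // the .vi form (or is splatted from a GPR) and is treated as free above,
  // whereas a non-uniform constant vector is assumed to be loaded from the
  // constant pool.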

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

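  // For the single-instruction ops handled below, the model is roughly
  // ConstantMatCost + LT.first * LMULCost: e.g. an add on a type that
  // legalizes to one LMUL=2 register group costs one instruction scaled by
  // the LMUL=2 throughput factor from getLMULCost.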
  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FNEG: {
    return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
  }
  default:
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }
}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we take into account GEP instructions only (although
  // alloca instructions, values, constants and/or constant expressions, PHIs,
  // bitcasts ... whatever is allowed to be used as a pointer may also appear
  // here). Typically, if Base is not a GEP instruction and all the pointers
  // are relative to the same base address, the rest are either GEP
  // instructions, PHIs, bitcasts or constants. When we have the same base, we
  // just calculate the cost of each non-Base GEP as an ADD operation if any
  // of its indices is non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as a sum of the costs of the GEP instructions.
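  // E.g. for a same-base chain of pointers p, p+4, p+8 feeding i32 accesses,
  // GEPs with all-constant indices are skipped as free below, and only GEPs
  // with a non-constant index are charged as an ADD (unless the offset can be
  // folded into the addressing mode).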
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
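      // E.g. a unit-stride chain of i32 accesses has byte offsets 0, 4, 8, ...
      // and RISC-V loads/stores can fold such small constant offsets into
      // their 12-bit immediate, so those GEPs are also treated as free.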
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     std::nullopt);
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics with changes as needed
  //       would apply to all settings below to enable performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, not dependent upon the
  // conditions below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow at most one exit other than the latch. This acts as an early
  // exit as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
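  // For instance, a two-block loop whose body is only a handful of cheap
  // scalar operations will typically fall under the threshold above and be
  // force-unrolled, avoiding the taken-branch cost of the backedge.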
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }
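  // E.g. a <vscale x 4 x i32> value has a known minimum size of 128 bits, so
  // the scalable case above reports 128 / RVVBitsPerBlock (64) = 2 vector
  // register blocks, matching an LMUL=2 register group.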

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth. This is the
  // same routine as used in LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
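  // E.g. with a 128-bit minimum VLEN and an effective fixed-width register
  // size of 256 bits (VLEN scaled by the LMUL used for getRegisterBitWidth
  // queries), 32-bit elements give a maximum SLP VF of 8.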
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If no vector registers, or absurd element widths, disable
  // vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific choice here is to give instruction count first
  // priority.
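  // E.g. a candidate needing 3 instructions and 5 registers is preferred over
  // one needing 4 instructions and 3 registers, because Insns is compared
  // first in the lexicographic ordering below.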
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
  auto *VTy = dyn_cast<VectorType>(DataTy);
  if (!VTy || VTy->isScalableTy())
    return false;

  if (!isLegalMaskedLoadStore(DataTy, Alignment))
    return false;
  return true;
}

bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
                                       const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
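  // E.g. a callee built with a subset of the caller's extensions can be
  // inlined, but a callee that requires an extension the caller lacks cannot.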
  return (CallerBits & CalleeBits) == CalleeBits;
}
