//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
    cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}

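// Pick the addressing mode the loop optimizer should favour for this loop:
// post-indexed when MVE is available, nothing special when optimizing for
// size, and pre-indexed for simple single-block Thumb2 M-class loops.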
TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  if (ST->isMClass() && ST->isThumb2() &&
      L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

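// Target-specific InstCombine folds for ARM intrinsics: improve the alignment
// argument of NEON vld/vst intrinsics, remove MVE predicate i2v/v2i
// round-trips, simplify the demanded carry bit of VADC, and merge a following
// add into the accumulator operand of a VMLDAVA.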
std::optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    unsigned AlignArg = II.arg_size() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnes()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown, 0)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    if (!II.getMetadata(LLVMContext::MD_range)) {
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      Metadata *M[] = {
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
      II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
      II.setMetadata(LLVMContext::MD_noundef,
                     MDNode::get(II.getContext(), std::nullopt));
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return std::nullopt;
  }
  }
  return std::nullopt;
}

std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is the
  // operand index of the constant selecting a Top/Bottom instruction, which
  // can differ between intrinsics.
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // Only the odd or even lanes of operand 0 are demanded, depending on
    // whether this is a top or a bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                                : APInt::getHighBitsSet(2, 1));
    return std::nullopt;
  };

  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return std::nullopt;
}

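// Estimate the number of instructions needed to materialize the immediate Imm
// in a register for the current subtarget (ARM, Thumb2 or Thumb1 encodings).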
InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constantpool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Returns the instruction being
// saturated, or null if no saturation pattern was found.
static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {

    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    if (isSSatMin(Inst->getOperand(1)))
      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
    if (Inst->hasNUses(2) &&
        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
      return Inst->getOperand(1);
  }
  return nullptr;
}

// Look for a FP Saturation pattern, where the instruction can be simplified to
// a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  if (Imm.getBitWidth() != 64 ||
      Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
    return false;
  Value *FP = isSSATMinMaxPattern(Inst, Imm);
  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
    FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
  if (!FP)
    return false;
  return isa<FPToSIInst>(FP);
}

InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for CodeGenPrepare, which will do a better job at
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN.
  if (Opcode == Instruction::Xor && Imm.isAllOnes())
    return 0;

  // Ensure that negative constants in min(max()) or max(min()) patterns that
  // match to SSAT instructions don't get hoisted.
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
    return 0;

  // We can convert <= -1 to < 0, which is generally quite cheap.
  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
      return std::min(getIntImmCost(Imm, Ty, CostKind),
                      getIntImmCost(Imm + 1, Ty, CostKind));
  }

  return getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

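// Cost of cast instructions. Extends and truncates that fold into (masked)
// loads and stores, and the NEON/MVE conversion instructions, are modelled
// with the target cost tables below; everything else falls back to the base
// implementation, scaled by the MVE beat factor for vector types.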
InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
           (EltVT == MVT::f64 && ST->hasFP64()) ||
           (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked loads and truncating masked stores are expensive because
  // we currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extend of other kinds of load is free.
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following extend from a legal type to an illegal type, so need
        // to split the load. This introduces an extra load operation, but the
        // extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of extends above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost 1 vcvt for each lane, so long as the instruction is
    // available. If not it will become a series of function calls.
    const InstructionCost CallCost =
        getCallInstrCost(nullptr, Dst, {Src}, CostKind);
    int Lanes = 1;
    if (SrcTy.isFixedLengthVector())
      Lanes = SrcTy.getVectorNumElements();

    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
      return Lanes;
    else
      return Lanes * CallCost;
  }

  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
      SrcTy.isFixedLengthVector()) {
    // Treat a truncate with a larger than legal source (128 bits for MVE) as
    // expensive, 2 instructions per lane.
    if ((SrcTy.getScalarType() == MVT::i8 ||
         SrcTy.getScalarType() == MVT::i16 ||
         SrcTy.getScalarType() == MVT::i32) &&
        SrcTy.getSizeInBits() > 128 &&
        SrcTy.getSizeInBits() > DstTy.getSizeInBits())
      return SrcTy.getVectorNumElements() * 2;
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return AdjustCost(
      BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}

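// Cost of insertelement/extractelement. These are penalized on NEON, where
// they imply GPR<->vector transfers or mixing of NEON and VFP code, and on
// MVE for integer lane moves that go through GPRs.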
InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max<InstructionCost>(
          BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
          2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integer moves involve being passed to GPR
    // registers, causing more of a delay.
    std::pair<InstructionCost, MVT> LT =
        getTypeLegalizationCost(ValTy->getScalarType());
    return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
}

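// Cost of compare/select instructions. Vector min/max/abs select patterns are
// costed as the corresponding intrinsic, NEON vector selects use a small
// table of known-bad cases, and MVE compares account for the extra shuffles
// needed when the predicate type has to be split.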
InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Thumb scalar code size cost for select.
  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
      ST->isThumb() && !ValTy->isVectorTy()) {
    // Assume expensive structs.
    if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
      return TTI::TCC_Expensive;

    // Select costs can vary because they:
    // - may require one or more conditional mov (including an IT),
    // - can't operate directly on immediates,
    // - require live flags, which we can't copy around easily.
    InstructionCost Cost = getTypeLegalizationCost(ValTy).first;

    // Possible IT instruction for Thumb2, or more for Thumb1.
    ++Cost;

    // i1 values may need rematerialising by using mov immediates and/or
    // flag setting instructions.
    if (ValTy->isIntegerTy(1))
      ++Cost;

    return Cost;
  }

  // If this is a vector min/max/abs, use the cost of that intrinsic directly
  // instead. Hopefully when min/max intrinsics are more prevalent this code
  // will not be needed.
  const Instruction *Sel = I;
  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
      Sel->hasOneUse())
    Sel = cast<Instruction>(Sel->user_back());
  if (Sel && ValTy->isVectorTy() &&
      (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
    const Value *LHS, *RHS;
    SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
    unsigned IID = 0;
    switch (SPF) {
    case SPF_ABS:
      IID = Intrinsic::abs;
      break;
    case SPF_SMIN:
      IID = Intrinsic::smin;
      break;
    case SPF_SMAX:
      IID = Intrinsic::smax;
      break;
    case SPF_UMIN:
      IID = Intrinsic::umin;
      break;
    case SPF_UMAX:
      IID = Intrinsic::umax;
      break;
    case SPF_FMINNUM:
      IID = Intrinsic::minnum;
      break;
    case SPF_FMAXNUM:
      IID = Intrinsic::maxnum;
      break;
    default:
      break;
    }
    if (IID) {
      // The ICmp is free, the select gets the cost of the min/max/etc.
      if (Sel != I)
        return 0;
      IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
      return getIntrinsicInstrCost(CostAttrs, CostKind);
    }
  }

  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    return LT.first;
  }

  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
      (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
    FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
    FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
    if (!VecCondTy)
      VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));

    // If we don't have mve.fp any fp operations will need to be scalarized.
    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
      return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
                                             /*Extract*/ true, CostKind) +
             BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
                                             /*Extract*/ false, CostKind) +
             VecValTy->getNumElements() *
                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
                                    VecCondTy->getScalarType(), VecPred,
                                    CostKind, I);
    }

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    int BaseCost = ST->getMVEVectorCostFactor(CostKind);
    // There are two types - the input that specifies the type of the compare
    // and the output vXi1 type. Because we don't know how the output will be
    // split, we may need an expensive shuffle to get two in sync. This has the
    // effect of making larger than legal compares (v8i32 for example)
    // expensive.
    if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
      if (LT.first > 1)
        return LT.first * BaseCost +
               BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
                                               /*Extract*/ false, CostKind);
      return BaseCost;
    }
  }

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  return BaseCost *
         BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
                                                      ScalarEvolution *SE,
                                                      const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (ST->hasNEON()) {
    if (Ty->isVectorTy() && SE &&
        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
      return NumVectorInstToHideOverhead;

    // In many cases the address computation is not merged into the instruction
    // addressing mode.
    return 1;
  }
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    // If a VCTP is part of a chain, it's already profitable and shouldn't be
    // optimized, else LSR may block tail-predication.
    switch (II->getIntrinsicID()) {
    case Intrinsic::arm_mve_vctp8:
    case Intrinsic::arm_mve_vctp16:
    case Intrinsic::arm_mve_vctp32:
    case Intrinsic::arm_mve_vctp64:
      return true;
    default:
      break;
    }
  }
  return false;
}

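// Masked loads are only legal with MVE, for 8-, 16- and 32-bit elements with
// sufficient alignment; v2i1 predicates and extending (non-128-bit)
// floating-point vector types are not supported.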
bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
    return false;

  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
    // Don't support v2i1 yet.
    if (VecTy->getNumElements() == 2)
      return false;

    // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
      return false;
  }

  unsigned EltWidth = DataTy->getScalarSizeInBits();
  return (EltWidth == 32 && Alignment >= 4) ||
         (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
}

bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
    return false;

  unsigned EltWidth = Ty->getScalarSizeInBits();
  return ((EltWidth == 32 && Alignment >= 4) ||
          (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
}

/// Given a memcpy/memset/memmove instruction, return the number of memory
/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if
/// a call is used.
int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
  MemOp MOp;
  unsigned DstAddrSpace = ~0u;
  unsigned SrcAddrSpace = ~0u;
  const Function *F = I->getParent()->getParent();

  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MC->getDestAlign();
    const Align SrcAlign = *MC->getSourceAlign();

    MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                      /*IsVolatile*/ false);
    DstAddrSpace = MC->getDestAddressSpace();
    SrcAddrSpace = MC->getSourceAddressSpace();
  } else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MS->getDestAlign();

    MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
                     /*IsZeroMemset*/ false, /*IsVolatile*/ false);
    DstAddrSpace = MS->getDestAddressSpace();
  } else
    llvm_unreachable("Expected a memcpy/move or memset!");

  unsigned Limit, Factor = 2;
  switch (I->getIntrinsicID()) {
  case Intrinsic::memcpy:
    Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
    break;
  case Intrinsic::memmove:
    Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
    break;
  case Intrinsic::memset:
    Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
    Factor = 1;
    break;
  default:
    llvm_unreachable("Expected a memcpy/move or memset!");
  }

  // MemOps will be populated with a list of data types that need to be loaded
  // and stored. That's why we multiply the number of elements by 2 to get the
  // cost for this memcpy.
  std::vector<EVT> MemOps;
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit, MOp, DstAddrSpace,
          SrcAddrSpace, F->getAttributes()))
    return MemOps.size() * Factor;

  // If we can't find an optimal memop lowering, return the default cost.
  return -1;
}

InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));

  // To model the cost of a library call, we assume 1 for the call, and
  // 3 for the argument setup.
  if (NumOps == -1)
    return 4;
  return NumOps;
}

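// Shuffle costs. Broadcasts, reverses and select-style shuffles have NEON
// table entries; MVE costs broadcasts and VREV-style masks directly and
// scales everything else by the vector beat factor.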
InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *Tp, ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
  // Treat extractsubvector as single op permutation.
  bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
  if (IsExtractSubvector)
    Kind = TTI::SK_PermuteSingleSrc;
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.

          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost *
               ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
    }

    if (!Mask.empty()) {
      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
      if (LT.second.isVector() &&
          Mask.size() <= LT.second.getVectorNumElements() &&
          (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
           isVREVMask(Mask, LT.second, 64)))
        return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
    }
  }

  // Restore optimal kind.
  if (IsExtractSubvector)
    Kind = TTI::SK_ExtractSubvector;
  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                     ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
                     : 1;
  return BaseCost *
         BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

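// Arithmetic instruction costs. i1 logic is made relatively expensive for
// Thumb code size, NEON integer division is costed as library calls, shifts
// that fold into a following ALU instruction are free, and MVE vector
// operations are scaled by the beat factor.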
1326InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1327 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1328 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1329 ArrayRef<const Value *> Args,
1330 const Instruction *CxtI) {
1331 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1332 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(Bitwidth: 1)) {
1333 // Make operations on i1 relatively expensive as this often involves
1334 // combining predicates. AND and XOR should be easier to handle with IT
1335 // blocks.
1336 switch (ISDOpcode) {
1337 default:
1338 break;
1339 case ISD::AND:
1340 case ISD::XOR:
1341 return 2;
1342 case ISD::OR:
1343 return 3;
1344 }
1345 }
1346
1347 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1348
1349 if (ST->hasNEON()) {
1350 const unsigned FunctionCallDivCost = 20;
1351 const unsigned ReciprocalDivCost = 10;
1352 static const CostTblEntry CostTbl[] = {
1353 // Division.
1354 // These costs are somewhat random. Choose a cost of 20 to indicate that
1355 // vectorizing devision (added function call) is going to be very expensive.
1356 // Double registers types.
1357 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1358 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1359 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1360 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1361 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1362 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1363 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1364 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1365 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1366 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1367 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1368 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1369 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1370 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1371 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1372 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1373 // Quad register types.
1374 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1375 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1376 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1377 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1378 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1379 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1380 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1381 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1382 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1383 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1384 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1385 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1386 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1387 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1388 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1389 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1390 // Multiplication.
1391 };
1392
1393 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1394 return LT.first * Entry->Cost;
1395
1396 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1397 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
1398
1399 // This is somewhat of a hack. The problem that we are facing is that SROA
1400 // creates a sequence of shift, and, or instructions to construct values.
1401 // These sequences are recognized by the ISel and have zero-cost. Not so for
1402 // the vectorized code. Because we have support for v2i64 but not i64 those
1403 // sequences look particularly beneficial to vectorize.
1404 // To work around this we increase the cost of v2i64 operations to make them
1405 // seem less beneficial.
1406 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1407 Cost += 4;
1408
1409 return Cost;
1410 }
1411
1412 // If this operation is a shift on arm/thumb2, it might well be folded into
1413 // the following instruction, hence having a cost of 0.
1414 auto LooksLikeAFreeShift = [&]() {
1415 if (ST->isThumb1Only() || Ty->isVectorTy())
1416 return false;
1417
1418 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1419 return false;
1420 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1421 return false;
1422
1423 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1424 switch (cast<Instruction>(Val: CxtI->user_back())->getOpcode()) {
1425 case Instruction::Add:
1426 case Instruction::Sub:
1427 case Instruction::And:
1428 case Instruction::Xor:
1429 case Instruction::Or:
1430 case Instruction::ICmp:
1431 return true;
1432 default:
1433 return false;
1434 }
1435 };
1436 if (LooksLikeAFreeShift())
1437 return 0;
1438
1439 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1440 // for "multiple beats" potentially needed by MVE instructions.
1441 int BaseCost = 1;
1442 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1443 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1444
1445 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1446 // without treating floats as more expensive than scalars or increasing the
1447 // costs for custom operations. The result is also multiplied by the
1448 // MVEVectorCostFactor where appropriate.
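// e.g. a legal v4i32 add under MVE is costed as
// LT.first * getMVEVectorCostFactor(CostKind), whereas the same op on a
// scalar or NEON-only target keeps BaseCost = 1.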
1449 if (TLI->isOperationLegalOrCustomOrPromote(Op: ISDOpcode, VT: LT.second))
1450 return LT.first * BaseCost;
1451
1452 // Else this is expand, assume that we need to scalarize this op.
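// e.g. an expanded op on <4 x i32> is costed as 4 scalar ops plus the
// insert/extract overhead of building and deconstructing the vector.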
1453 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
1454 unsigned Num = VTy->getNumElements();
1455 InstructionCost Cost =
1456 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind);
1457 // Return the cost of multiple scalar invocations plus the cost of
1458 // inserting and extracting the values.
1459 SmallVector<Type *> Tys(Args.size(), Ty);
1460 return BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind) +
1461 Num * Cost;
1462 }
1463
1464 return BaseCost;
1465}
1466
1467InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1468 MaybeAlign Alignment,
1469 unsigned AddressSpace,
1470 TTI::TargetCostKind CostKind,
1471 TTI::OperandValueInfo OpInfo,
1472 const Instruction *I) {
1473 // TODO: Handle other cost kinds.
1474 if (CostKind != TTI::TCK_RecipThroughput)
1475 return 1;
1476
1477 // Type legalization can't handle structs
1478 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1479 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1480 CostKind);
1481
1482 if (ST->hasNEON() && Src->isVectorTy() &&
1483 (Alignment && *Alignment != Align(16)) &&
1484 cast<VectorType>(Val: Src)->getElementType()->isDoubleTy()) {
1485 // Unaligned loads/stores are extremely inefficient.
1486 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
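// e.g. a <2 x double> access that is not 16-byte aligned is costed as
// LT.first * 4 here instead of as a single vldr/vstr.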
1487 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
1488 return LT.first * 4;
1489 }
1490
1491 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1492 // Same for stores.
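// e.g. a 'load <4 x half>' whose only user is an 'fpext ... to <4 x float>'
// (or the matching fptrunc+store) is costed as one factor-weighted operation.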
1493 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Val: Src) && I &&
1494 ((Opcode == Instruction::Load && I->hasOneUse() &&
1495 isa<FPExtInst>(Val: *I->user_begin())) ||
1496 (Opcode == Instruction::Store && isa<FPTruncInst>(Val: I->getOperand(i: 0))))) {
1497 FixedVectorType *SrcVTy = cast<FixedVectorType>(Val: Src);
1498 Type *DstTy =
1499 Opcode == Instruction::Load
1500 ? (*I->user_begin())->getType()
1501 : cast<Instruction>(Val: I->getOperand(i: 0))->getOperand(i: 0)->getType();
1502 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1503 DstTy->getScalarType()->isFloatTy())
1504 return ST->getMVEVectorCostFactor(CostKind);
1505 }
1506
1507 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1508 ? ST->getMVEVectorCostFactor(CostKind)
1509 : 1;
1510 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1511 CostKind, OpInfo, I);
1512}
1513
1514InstructionCost
1515ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1516 unsigned AddressSpace,
1517 TTI::TargetCostKind CostKind) {
1518 if (ST->hasMVEIntegerOps()) {
1519 if (Opcode == Instruction::Load && isLegalMaskedLoad(DataTy: Src, Alignment))
1520 return ST->getMVEVectorCostFactor(CostKind);
1521 if (Opcode == Instruction::Store && isLegalMaskedStore(DataTy: Src, Alignment))
1522 return ST->getMVEVectorCostFactor(CostKind);
1523 }
1524 if (!isa<FixedVectorType>(Val: Src))
1525 return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace,
1526 CostKind);
1527 // Scalar cost, which is currently very high due to the inefficiency of the
1528 // generated code.
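// e.g. a masked access on <4 x i32> that cannot use an MVE masked load/store
// is costed as 4 * 8 = 32.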
1529 return cast<FixedVectorType>(Val: Src)->getNumElements() * 8;
1530}
1531
1532InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1533 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1534 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1535 bool UseMaskForCond, bool UseMaskForGaps) {
1536 assert(Factor >= 2 && "Invalid interleave factor");
1537 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1538
1539 // vldN/vstN doesn't support vector types of i64/f64 element.
1540 bool EltIs64Bits = DL.getTypeSizeInBits(Ty: VecTy->getScalarType()) == 64;
1541
1542 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1543 !UseMaskForCond && !UseMaskForGaps) {
1544 unsigned NumElts = cast<FixedVectorType>(Val: VecTy)->getNumElements();
1545 auto *SubVecTy =
1546 FixedVectorType::get(ElementType: VecTy->getScalarType(), NumElts: NumElts / Factor);
1547
1548 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1549 // Accesses having vector types that are a multiple of 128 bits can be
1550 // matched to more than one vldN/vstN instruction.
1551 int BaseCost =
1552 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1553 if (NumElts % Factor == 0 &&
1554 TLI->isLegalInterleavedAccessType(Factor, VecTy: SubVecTy, Alignment, DL))
1555 return Factor * BaseCost * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL);
1556
1557 // Some smaller than legal interleaved patterns are cheap as we can make
1558 // use of the vmovn or vrev patterns to interleave a standard load. This is
1559 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1560 // promoted differently). The cost of 2 here is then a load and vrev or
1561 // vmovn.
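// e.g. under MVE, a factor-2 access on <8 x i8> (a v4i8 sub-vector, 32 bits)
// is costed as 2 * BaseCost: one load/store plus one vrev/vmovn.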
1562 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1563 VecTy->isIntOrIntVectorTy() &&
1564 DL.getTypeSizeInBits(Ty: SubVecTy).getFixedValue() <= 64)
1565 return 2 * BaseCost;
1566 }
1567
1568 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1569 Alignment, AddressSpace, CostKind,
1570 UseMaskForCond, UseMaskForGaps);
1571}
1572
1573InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1574 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1575 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1576 using namespace PatternMatch;
1577 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1578 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1579 Alignment, CostKind, I);
1580
1581 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1582 auto *VTy = cast<FixedVectorType>(Val: DataTy);
1583
1584 // TODO: Splitting, once we do that.
1585
1586 unsigned NumElems = VTy->getNumElements();
1587 unsigned EltSize = VTy->getScalarSizeInBits();
1588 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: DataTy);
1589
1590 // For now, it is assumed that for the MVE gather instructions the loads are
1591 // all effectively serialised. This means the cost is the scalar cost
1592 // multiplied by the number of elements being loaded. This is possibly very
1593 // conservative, but even so we still end up vectorising loops because the
1594 // cost per iteration for many loops is lower than for scalar loops.
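// e.g. a <4 x i32> gather that legalizes to a single v4i32 has
// VectorCost = NumElems (4) * LT.first * getMVEVectorCostFactor(CostKind).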
1595 InstructionCost VectorCost =
1596 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1597 // The scalarization cost should be a lot higher. We use the number of vector
1598 // elements plus the scalarization overhead. If masking is required then a lot
1599 // of little blocks will be needed and potentially a scalarized p0 mask,
1600 // greatly increasing the cost.
1601 InstructionCost ScalarCost =
1602 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1603 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ true, /*Extract*/ false,
1604 CostKind) +
1605 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ false, /*Extract*/ true,
1606 CostKind);
1607
1608 if (EltSize < 8 || Alignment < EltSize / 8)
1609 return ScalarCost;
1610
1611 unsigned ExtSize = EltSize;
1612 // Check whether there's a single user that asks for an extended type
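// e.g. a <4 x i8> gather whose single user is a zext to <4 x i32> sets
// ExtSize to 32 (4 * 32 == 128), so it can later be treated as an extending
// gather rather than being scalarised.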
1613 if (I != nullptr) {
1614 // Depending on the caller of this function, a gather instruction will
1615 // either have opcode Instruction::Load or be a call to the masked_gather
1616 // intrinsic
1617 if ((I->getOpcode() == Instruction::Load ||
1618 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1619 I->hasOneUse()) {
1620 const User *Us = *I->users().begin();
1621 if (isa<ZExtInst>(Val: Us) || isa<SExtInst>(Val: Us)) {
1622 // only allow valid type combinations
1623 unsigned TypeSize =
1624 cast<Instruction>(Val: Us)->getType()->getScalarSizeInBits();
1625 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1626 (TypeSize == 16 && EltSize == 8)) &&
1627 TypeSize * NumElems == 128) {
1628 ExtSize = TypeSize;
1629 }
1630 }
1631 }
1632 // Check whether the input data needs to be truncated
1633 TruncInst *T;
1634 if ((I->getOpcode() == Instruction::Store ||
1635 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1636 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1637 // Only allow valid type combinations
1638 unsigned TypeSize = T->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits();
1639 if (((EltSize == 16 && TypeSize == 32) ||
1640 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1641 TypeSize * NumElems == 128)
1642 ExtSize = TypeSize;
1643 }
1644 }
1645
1646 if (ExtSize * NumElems != 128 || NumElems < 4)
1647 return ScalarCost;
1648
1649 // Any (aligned) i32 gather will not need to be scalarised.
1650 if (ExtSize == 32)
1651 return VectorCost;
1652 // For smaller types, we need to ensure that the gep's inputs are correctly
1653 // extended from a small enough value. Other sizes (including i64) are
1654 // scalarized for now.
1655 if (ExtSize != 8 && ExtSize != 16)
1656 return ScalarCost;
1657
1658 if (const auto *BC = dyn_cast<BitCastInst>(Val: Ptr))
1659 Ptr = BC->getOperand(i_nocapture: 0);
1660 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr)) {
1661 if (GEP->getNumOperands() != 2)
1662 return ScalarCost;
1663 unsigned Scale = DL.getTypeAllocSize(Ty: GEP->getResultElementType());
1664 // Scale needs to be correct (which is only relevant for i16s).
1665 if (Scale != 1 && Scale * 8 != ExtSize)
1666 return ScalarCost;
1667 // And we need to zext (not sext) the indexes from a small enough type.
1668 if (const auto *ZExt = dyn_cast<ZExtInst>(Val: GEP->getOperand(i_nocapture: 1))) {
1669 if (ZExt->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <= ExtSize)
1670 return VectorCost;
1671 }
1672 return ScalarCost;
1673 }
1674 return ScalarCost;
1675}
1676
1677InstructionCost
1678ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1679 std::optional<FastMathFlags> FMF,
1680 TTI::TargetCostKind CostKind) {
1681
1682 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1683 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1684 unsigned EltSize = ValVT.getScalarSizeInBits();
1685
1686 // In general floating point reductions are a series of elementwise
1687 // operations, with free extracts on each step. These are either in-order or
1688 // treewise depending on whether that is allowed by the fast math flags.
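// For example, a fast-math v8f32 fadd reduction with MVE is costed as one
// v4f32 vector fadd (halving 8 -> 4 elements to reach the 128-bit limit)
// plus 4 scalar fadds for the remaining elements.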
1689 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1690 ((EltSize == 32 && ST->hasVFP2Base()) ||
1691 (EltSize == 64 && ST->hasFP64()) ||
1692 (EltSize == 16 && ST->hasFullFP16()))) {
1693 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1694 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1695 InstructionCost VecCost = 0;
1696 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(Value: NumElts) &&
1697 NumElts * EltSize > VecLimit) {
1698 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1699 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1700 NumElts /= 2;
1701 }
1702
1703 // For fp16 we need to extract the upper lane elements. MVE can add a
1704 // VREV+FMIN/MAX to perform another vector step instead.
1705 InstructionCost ExtractCost = 0;
1706 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1707 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1708 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1709 NumElts /= 2;
1710 } else if (ValVT.getVectorElementType() == MVT::f16)
1711 ExtractCost = NumElts / 2;
1712
1713 return VecCost + ExtractCost +
1714 NumElts *
1715 getArithmeticInstrCost(Opcode, Ty: ValTy->getElementType(), CostKind);
1716 }
1717
1718 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1719 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1720 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1721 unsigned VecLimit =
1722 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1723 InstructionCost VecCost = 0;
1724 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1725 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1726 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1727 NumElts /= 2;
1728 }
1729 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1730 // step.
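// e.g. a v8i8 and-reduction on MVE takes one such 64-bit vector step (the
// MVE factor plus a v8i8 'and'), then 4 extracts and 3 scalar 'and's.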
1731 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1732 NumElts * EltSize == 64) {
1733 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts);
1734 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1735 getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1736 NumElts /= 2;
1737 }
1738
1739 // From here we extract the elements and perform the and/or/xor.
1740 InstructionCost ExtractCost = NumElts;
1741 return VecCost + ExtractCost +
1742 (NumElts - 1) * getArithmeticInstrCost(
1743 Opcode, Ty: ValTy->getElementType(), CostKind);
1744 }
1745
1746 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1747 TTI::requiresOrderedReduction(FMF))
1748 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1749
1750 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1751
1752 static const CostTblEntry CostTblAdd[]{
1753 {ISD::ADD, MVT::v16i8, 1},
1754 {ISD::ADD, MVT::v8i16, 1},
1755 {ISD::ADD, MVT::v4i32, 1},
1756 };
1757 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1758 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1759
1760 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1761}
1762
1763InstructionCost ARMTTIImpl::getExtendedReductionCost(
1764 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1765 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1766 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1767 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1768
1769 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1770
1771 switch (ISD) {
1772 case ISD::ADD:
1773 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1774 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1775
1776 // The legal cases are:
1777 // VADDV u/s 8/16/32
1778 // VADDLV u/s 32
1779 // Codegen currently cannot always handle larger than legal vectors very
1780 // well, especially for predicated reductions where the mask needs to be
1781 // split, so restrict to 128bit or smaller input types.
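// e.g. vecreduce.add(zext <16 x i8> to <16 x i32>) with an i32 result maps
// onto VADDV.u8 and is costed as MVEVectorCostFactor * LT.first below.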
1782 unsigned RevVTSize = ResVT.getSizeInBits();
1783 if (ValVT.getSizeInBits() <= 128 &&
1784 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1785 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1786 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1787 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1788 }
1789 break;
1790 default:
1791 break;
1792 }
1793 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy, FMF,
1794 CostKind);
1795}
1796
1797InstructionCost
1798ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1799 VectorType *ValTy,
1800 TTI::TargetCostKind CostKind) {
1801 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1802 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1803
1804 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1805 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1806
1807 // The legal cases are:
1808 // VMLAV u/s 8/16/32
1809 // VMLALV u/s 16/32
1810 // Codegen currently cannot always handle larger than legal vectors very
1811 // well, especially for predicated reductions where the mask needs to be
1812 // split, so restrict to 128bit or smaller input types.
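// e.g. a multiply-accumulate reduction of <8 x i16> operands into an i64
// result maps onto VMLALV.u16/s16 (v8i16 with a 64-bit result).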
1813 unsigned RevVTSize = ResVT.getSizeInBits();
1814 if (ValVT.getSizeInBits() <= 128 &&
1815 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1816 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1817 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1818 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1819 }
1820
1821 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, Ty: ValTy, CostKind);
1822}
1823
1824InstructionCost
1825ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1826 FastMathFlags FMF,
1827 TTI::TargetCostKind CostKind) {
1828 EVT ValVT = TLI->getValueType(DL, Ty);
1829
1830 // In general floating point reductions are a series of elementwise
1831 // operations, with free extracts on each step. These are either in-order or
1832 // treewise depending on whether that is allowed by the fast math flags.
1833 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1834 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1835 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1836 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1837 unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
1838 unsigned EltSize = ValVT.getScalarSizeInBits();
1839 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1840 InstructionCost VecCost;
1841 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1842 Type *VecTy = FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElts/2);
1843 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1844 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1845 NumElts /= 2;
1846 }
1847
1848 // For fp16 we need to extract the upper lane elements. MVE can add a
1849 // VREV+FMIN/MAX to perform another vector step instead.
1850 InstructionCost ExtractCost = 0;
1851 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1852 NumElts == 8) {
1853 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1854 NumElts /= 2;
1855 } else if (ValVT.getVectorElementType() == MVT::f16)
1856 ExtractCost = cast<FixedVectorType>(Val: Ty)->getNumElements() / 2;
1857
1858 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
1859 {Ty->getElementType(), Ty->getElementType()},
1860 FMF);
1861 return VecCost + ExtractCost +
1862 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1863 }
1864
1865 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1866 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1867 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1868
1869 // All costs are the same for u/s min/max. These lower to vminv, which are
1870 // given a slightly higher cost as they tend to take multiple cycles for
1871 // smaller type sizes.
1872 static const CostTblEntry CostTblAdd[]{
1873 {ISD::SMIN, MVT::v16i8, 4},
1874 {ISD::SMIN, MVT::v8i16, 3},
1875 {ISD::SMIN, MVT::v4i32, 2},
1876 };
1877 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
1878 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1879 }
1880
1881 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1882}
1883
1884InstructionCost
1885ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1886 TTI::TargetCostKind CostKind) {
1887 switch (ICA.getID()) {
1888 case Intrinsic::get_active_lane_mask:
1889 // Currently we make a somewhat optimistic assumption that
1890 // active_lane_masks are always free. In reality it may be freely folded
1891 // into a tail predicated loop, expanded into a VCTP or expanded into a lot
1892 // of add/icmp code. We may need to improve this in the future, but being
1893 // able to detect if it is free or not involves looking at a lot of other
1894 // code. We currently assume that the vectorizer inserted these, and knew
1895 // what it was doing in adding one.
1896 if (ST->hasMVEIntegerOps())
1897 return 0;
1898 break;
1899 case Intrinsic::sadd_sat:
1900 case Intrinsic::ssub_sat:
1901 case Intrinsic::uadd_sat:
1902 case Intrinsic::usub_sat: {
1903 if (!ST->hasMVEIntegerOps())
1904 break;
1905 Type *VT = ICA.getReturnType();
1906
1907 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
1908 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1909 LT.second == MVT::v16i8) {
1910 // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1911 // need to extend the type, as it uses shr(qadd(shl, shl)).
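// e.g. <8 x i16> uadd.sat maps straight onto a vqadd (Instrs = 1), while a
// type that must first be extended to a legal width uses the 4-instruction
// shl/shl + vqadd + shr sequence.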
1912 unsigned Instrs =
1913 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1914 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1915 }
1916 break;
1917 }
1918 case Intrinsic::abs:
1919 case Intrinsic::smin:
1920 case Intrinsic::smax:
1921 case Intrinsic::umin:
1922 case Intrinsic::umax: {
1923 if (!ST->hasMVEIntegerOps())
1924 break;
1925 Type *VT = ICA.getReturnType();
1926
1927 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
1928 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1929 LT.second == MVT::v16i8)
1930 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1931 break;
1932 }
1933 case Intrinsic::minnum:
1934 case Intrinsic::maxnum: {
1935 if (!ST->hasMVEFloatOps())
1936 break;
1937 Type *VT = ICA.getReturnType();
1938 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
1939 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1940 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1941 break;
1942 }
1943 case Intrinsic::fptosi_sat:
1944 case Intrinsic::fptoui_sat: {
1945 if (ICA.getArgTypes().empty())
1946 break;
1947 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1948 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
1949 EVT MTy = TLI->getValueType(DL, Ty: ICA.getReturnType());
1950 // Check for the legal types, with the correct subtarget features.
1951 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1952 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1953 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1954 return LT.first;
1955
1956 // Equally for MVE vector types
1957 if (ST->hasMVEFloatOps() &&
1958 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1959 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1960 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1961
1962 // Otherwise we use a legal convert followed by a min+max
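// e.g. llvm.fptosi.sat.i16.f32 with VFP2: the f32 convert itself is legal
// (cost 1), and the saturation is modelled as an i32 smin plus an i32 smax,
// all scaled by LT.first.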
1963 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1964 (ST->hasFP64() && LT.second == MVT::f64) ||
1965 (ST->hasFullFP16() && LT.second == MVT::f16) ||
1966 (ST->hasMVEFloatOps() &&
1967 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1968 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1969 Type *LegalTy = Type::getIntNTy(C&: ICA.getReturnType()->getContext(),
1970 N: LT.second.getScalarSizeInBits());
1971 InstructionCost Cost =
1972 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1973 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1974 : Intrinsic::umin,
1975 LegalTy, {LegalTy, LegalTy});
1976 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
1977 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1978 : Intrinsic::umax,
1979 LegalTy, {LegalTy, LegalTy});
1980 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
1981 return LT.first * Cost;
1982 }
1983 break;
1984 }
1985 }
1986
1987 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1988}
1989
1990bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1991 if (!F->isIntrinsic())
1992 return BaseT::isLoweredToCall(F);
1993
1994 // Assume all Arm-specific intrinsics map to an instruction.
1995 if (F->getName().starts_with(Prefix: "llvm.arm"))
1996 return false;
1997
1998 switch (F->getIntrinsicID()) {
1999 default: break;
2000 case Intrinsic::powi:
2001 case Intrinsic::sin:
2002 case Intrinsic::cos:
2003 case Intrinsic::pow:
2004 case Intrinsic::log:
2005 case Intrinsic::log10:
2006 case Intrinsic::log2:
2007 case Intrinsic::exp:
2008 case Intrinsic::exp2:
2009 return true;
2010 case Intrinsic::sqrt:
2011 case Intrinsic::fabs:
2012 case Intrinsic::copysign:
2013 case Intrinsic::floor:
2014 case Intrinsic::ceil:
2015 case Intrinsic::trunc:
2016 case Intrinsic::rint:
2017 case Intrinsic::nearbyint:
2018 case Intrinsic::round:
2019 case Intrinsic::canonicalize:
2020 case Intrinsic::lround:
2021 case Intrinsic::llround:
2022 case Intrinsic::lrint:
2023 case Intrinsic::llrint:
2024 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2025 return true;
2026 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2027 return true;
2028 // Some operations can be handled by vector instructions; assume that
2029 // unsupported vectors will be expanded into supported scalar ones.
2030 // TODO: Handle scalar operations properly.
2031 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2032 case Intrinsic::masked_store:
2033 case Intrinsic::masked_load:
2034 case Intrinsic::masked_gather:
2035 case Intrinsic::masked_scatter:
2036 return !ST->hasMVEIntegerOps();
2037 case Intrinsic::sadd_with_overflow:
2038 case Intrinsic::uadd_with_overflow:
2039 case Intrinsic::ssub_with_overflow:
2040 case Intrinsic::usub_with_overflow:
2041 case Intrinsic::sadd_sat:
2042 case Intrinsic::uadd_sat:
2043 case Intrinsic::ssub_sat:
2044 case Intrinsic::usub_sat:
2045 return false;
2046 }
2047
2048 return BaseT::isLoweredToCall(F);
2049}
2050
2051bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
2052 unsigned ISD = TLI->InstructionOpcodeToISD(Opcode: I.getOpcode());
2053 EVT VT = TLI->getValueType(DL, Ty: I.getType(), AllowUnknown: true);
2054 if (TLI->getOperationAction(Op: ISD, VT) == TargetLowering::LibCall)
2055 return true;
2056
2057 // Check if an intrinsic will be lowered to a call and assume that any
2058 // other CallInst will generate a bl.
2059 if (auto *Call = dyn_cast<CallInst>(Val: &I)) {
2060 if (auto *II = dyn_cast<IntrinsicInst>(Val: Call)) {
2061 switch(II->getIntrinsicID()) {
2062 case Intrinsic::memcpy:
2063 case Intrinsic::memset:
2064 case Intrinsic::memmove:
2065 return getNumMemOps(I: II) == -1;
2066 default:
2067 if (const Function *F = Call->getCalledFunction())
2068 return isLoweredToCall(F);
2069 }
2070 }
2071 return true;
2072 }
2073
2074 // FPv5 provides conversions between integer, double-precision,
2075 // single-precision, and half-precision formats.
2076 switch (I.getOpcode()) {
2077 default:
2078 break;
2079 case Instruction::FPToSI:
2080 case Instruction::FPToUI:
2081 case Instruction::SIToFP:
2082 case Instruction::UIToFP:
2083 case Instruction::FPTrunc:
2084 case Instruction::FPExt:
2085 return !ST->hasFPARMv8Base();
2086 }
2087
2088 // FIXME: Unfortunately the approach of checking the Operation Action does
2089 // not catch all cases of Legalization that use library calls. Our
2090 // Legalization step categorizes some transformations into library calls as
2091 // Custom, Expand or even Legal when doing type legalization. So for now
2092 // we have to special-case, for instance, the SDIV of 64-bit integers and
2093 // the use of floating-point emulation.
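// e.g. a 64-bit sdiv/srem is typically lowered to a runtime call such as
// __aeabi_ldivmod on AEABI targets, so it is treated as a call here.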
2094 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2095 switch (ISD) {
2096 default:
2097 break;
2098 case ISD::SDIV:
2099 case ISD::UDIV:
2100 case ISD::SREM:
2101 case ISD::UREM:
2102 case ISD::SDIVREM:
2103 case ISD::UDIVREM:
2104 return true;
2105 }
2106 }
2107
2108 // Assume all other non-float operations are supported.
2109 if (!VT.isFloatingPoint())
2110 return false;
2111
2112 // We'll need a library call to handle most floats when using soft-float.
2113 if (TLI->useSoftFloat()) {
2114 switch (I.getOpcode()) {
2115 default:
2116 return true;
2117 case Instruction::Alloca:
2118 case Instruction::Load:
2119 case Instruction::Store:
2120 case Instruction::Select:
2121 case Instruction::PHI:
2122 return false;
2123 }
2124 }
2125
2126 // We'll need a libcall to perform double precision operations on a single
2127 // precision only FPU.
2128 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2129 return true;
2130
2131 // Likewise for half precision arithmetic.
2132 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2133 return true;
2134
2135 return false;
2136}
2137
2138bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2139 AssumptionCache &AC,
2140 TargetLibraryInfo *LibInfo,
2141 HardwareLoopInfo &HWLoopInfo) {
2142 // Low-overhead branches are only supported in the 'low-overhead branch'
2143 // extension of v8.1-m.
2144 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2145 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2146 return false;
2147 }
2148
2149 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2150 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2151 return false;
2152 }
2153
2154 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2155 if (isa<SCEVCouldNotCompute>(Val: BackedgeTakenCount)) {
2156 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2157 return false;
2158 }
2159
2160 const SCEV *TripCountSCEV =
2161 SE.getAddExpr(LHS: BackedgeTakenCount,
2162 RHS: SE.getOne(Ty: BackedgeTakenCount->getType()));
2163
2164 // We need to store the trip count in LR, a 32-bit register.
2165 if (SE.getUnsignedRangeMax(S: TripCountSCEV).getBitWidth() > 32) {
2166 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2167 return false;
2168 }
2169
2170 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2171 // point in generating a hardware loop if that's going to happen.
2172
2173 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2174 if (auto *Call = dyn_cast<IntrinsicInst>(Val: &I)) {
2175 switch (Call->getIntrinsicID()) {
2176 default:
2177 break;
2178 case Intrinsic::start_loop_iterations:
2179 case Intrinsic::test_start_loop_iterations:
2180 case Intrinsic::loop_decrement:
2181 case Intrinsic::loop_decrement_reg:
2182 return true;
2183 }
2184 }
2185 return false;
2186 };
2187
2188 // Scan the instructions to see if there's any that we know will turn into a
2189 // call or if this loop is already a low-overhead loop or will become a tail
2190 // predicated loop.
2191 bool IsTailPredLoop = false;
2192 auto ScanLoop = [&](Loop *L) {
2193 for (auto *BB : L->getBlocks()) {
2194 for (auto &I : *BB) {
2195 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2196 isa<InlineAsm>(Val: I)) {
2197 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2198 return false;
2199 }
2200 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2201 IsTailPredLoop |=
2202 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2203 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2204 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2205 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2206 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2207 }
2208 }
2209 return true;
2210 };
2211
2212 // Visit inner loops.
2213 for (auto *Inner : *L)
2214 if (!ScanLoop(Inner))
2215 return false;
2216
2217 if (!ScanLoop(L))
2218 return false;
2219
2220 // TODO: Check whether the trip count calculation is expensive. If L is the
2221 // inner loop but we know it has a low trip count, calculating that trip
2222 // count (in the parent loop) may be detrimental.
2223
2224 LLVMContext &C = L->getHeader()->getContext();
2225 HWLoopInfo.CounterInReg = true;
2226 HWLoopInfo.IsNestingLegal = false;
2227 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2228 HWLoopInfo.CountType = Type::getInt32Ty(C);
2229 HWLoopInfo.LoopDecrement = ConstantInt::get(Ty: HWLoopInfo.CountType, V: 1);
2230 return true;
2231}
2232
2233static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2234 // We don't allow icmps, and because we only look at single block loops,
2235 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2236 if (isa<ICmpInst>(Val: &I) && ++ICmpCount > 1)
2237 return false;
2238 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2239 // not currently canonical, but soon will be. Code without them uses icmp, and
2240 // so is not tail predicated as per the condition above. In order to get the
2241 // same performance we treat min and max the same as an icmp for tailpred
2242 // purposes for the moment (we often rely on non-tailpred and higher VFs to
2243 // pick more optimal instructions like VQDMULH. They need to be recognized
2244 // directly by the vectorizer).
2245 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2246 if ((II->getIntrinsicID() == Intrinsic::smin ||
2247 II->getIntrinsicID() == Intrinsic::smax ||
2248 II->getIntrinsicID() == Intrinsic::umin ||
2249 II->getIntrinsicID() == Intrinsic::umax) &&
2250 ++ICmpCount > 1)
2251 return false;
2252
2253 if (isa<FCmpInst>(Val: &I))
2254 return false;
2255
2256 // We could allow extending/narrowing FP loads/stores, but codegen is
2257 // too inefficient so reject this for now.
2258 if (isa<FPExtInst>(Val: &I) || isa<FPTruncInst>(Val: &I))
2259 return false;
2260
2261 // Extends have to be extending-loads
2262 if (isa<SExtInst>(Val: &I) || isa<ZExtInst>(Val: &I) )
2263 if (!I.getOperand(i: 0)->hasOneUse() || !isa<LoadInst>(Val: I.getOperand(i: 0)))
2264 return false;
2265
2266 // Truncs have to be narrowing-stores
2267 if (isa<TruncInst>(Val: &I) )
2268 if (!I.hasOneUse() || !isa<StoreInst>(Val: *I.user_begin()))
2269 return false;
2270
2271 return true;
2272}
2273
2274// To set up a tail-predicated loop, we need to know the total number of
2275// elements processed by that loop. Thus, we need to determine the element
2276// size and:
2277// 1) it should be uniform for all operations in the vector loop, so we
2278// e.g. don't want any widening/narrowing operations.
2279// 2) it should be smaller than i64s because we don't have vector operations
2280// that work on i64s.
2281// 3) we don't want elements to be reversed or shuffled, to make sure the
2282// tail-predication masks/predicates the right lanes.
2283//
2284static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2285 const DataLayout &DL,
2286 const LoopAccessInfo *LAI) {
2287 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2288
2289 // If there are live-out values, it is probably a reduction. We can predicate
2290 // most reduction operations freely under MVE using a combination of
2291 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2292 // floating point and integer reductions, but don't check for operators
2293 // specifically here. If the value ends up not being a reduction (and so the
2294 // vectorizer cannot tailfold the loop), we should fall back to standard
2295 // vectorization automatically.
2296 SmallVector< Instruction *, 8 > LiveOuts;
2297 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2298 bool ReductionsDisabled =
2299 EnableTailPredication == TailPredication::EnabledNoReductions ||
2300 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2301
2302 for (auto *I : LiveOuts) {
2303 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2304 !I->getType()->isHalfTy()) {
2305 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2306 "live-out value\n");
2307 return false;
2308 }
2309 if (ReductionsDisabled) {
2310 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2311 return false;
2312 }
2313 }
2314
2315 // Next, check that all instructions can be tail-predicated.
2316 PredicatedScalarEvolution PSE = LAI->getPSE();
2317 SmallVector<Instruction *, 16> LoadStores;
2318 int ICmpCount = 0;
2319
2320 for (BasicBlock *BB : L->blocks()) {
2321 for (Instruction &I : BB->instructionsWithoutDebug()) {
2322 if (isa<PHINode>(Val: &I))
2323 continue;
2324 if (!canTailPredicateInstruction(I, ICmpCount)) {
2325 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2326 return false;
2327 }
2328
2329 Type *T = I.getType();
2330 if (T->getScalarSizeInBits() > 32) {
2331 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2332 return false;
2333 }
2334 if (isa<StoreInst>(Val: I) || isa<LoadInst>(Val: I)) {
2335 Value *Ptr = getLoadStorePointerOperand(V: &I);
2336 Type *AccessTy = getLoadStoreType(I: &I);
2337 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, Lp: L).value_or(u: 0);
2338 if (NextStride == 1) {
2339 // TODO: for now only allow consecutive strides of 1. We could support
2340 // other strides as long as they are uniform, but let's keep it simple
2341 // for now.
2342 continue;
2343 } else if (NextStride == -1 ||
2344 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2345 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2346 LLVM_DEBUG(dbgs()
2347 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2348 "be tail-predicated\n.");
2349 return false;
2350 // TODO: don't tail predicate if there is a reversed load?
2351 } else if (EnableMaskedGatherScatters) {
2352 // Gather/scatters do allow loading from arbitrary strides, at
2353 // least if they are loop invariant.
2354 // TODO: Loop variant strides should in theory work, too, but
2355 // this requires further testing.
2356 const SCEV *PtrScev = PSE.getSE()->getSCEV(V: Ptr);
2357 if (auto AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev)) {
2358 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
2359 if (PSE.getSE()->isLoopInvariant(S: Step, L))
2360 continue;
2361 }
2362 }
2363 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2364 "tail-predicate\n.");
2365 return false;
2366 }
2367 }
2368 }
2369
2370 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2371 return true;
2372}
2373
2374bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
2375 if (!EnableTailPredication) {
2376 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2377 return false;
2378 }
2379
2380 // Creating a predicated vector loop is the first step for generating a
2381 // tail-predicated hardware loop, for which we need the MVE masked
2382 // load/stores instructions:
2383 if (!ST->hasMVEIntegerOps())
2384 return false;
2385
2386 LoopVectorizationLegality *LVL = TFI->LVL;
2387 Loop *L = LVL->getLoop();
2388
2389 // For now, restrict this to single block loops.
2390 if (L->getNumBlocks() > 1) {
2391 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2392 "loop.\n");
2393 return false;
2394 }
2395
2396 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2397
2398 LoopInfo *LI = LVL->getLoopInfo();
2399 HardwareLoopInfo HWLoopInfo(L);
2400 if (!HWLoopInfo.canAnalyze(LI&: *LI)) {
2401 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2402 "analyzable.\n");
2403 return false;
2404 }
2405
2406 AssumptionCache *AC = LVL->getAssumptionCache();
2407 ScalarEvolution *SE = LVL->getScalarEvolution();
2408
2409 // This checks if we have the low-overhead branch architecture
2410 // extension, and if we will create a hardware-loop:
2411 if (!isHardwareLoopProfitable(L, SE&: *SE, AC&: *AC, LibInfo: TFI->TLI, HWLoopInfo)) {
2412 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2413 "profitable.\n");
2414 return false;
2415 }
2416
2417 DominatorTree *DT = LVL->getDominatorTree();
2418 if (!HWLoopInfo.isHardwareLoopCandidate(SE&: *SE, LI&: *LI, DT&: *DT)) {
2419 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2420 "a candidate.\n");
2421 return false;
2422 }
2423
2424 return canTailPredicateLoop(L, LI, SE&: *SE, DL, LAI: LVL->getLAI());
2425}
2426
2427TailFoldingStyle
2428ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2429 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2430 return TailFoldingStyle::DataWithoutLaneMask;
2431
2432 // Intrinsic @llvm.get.active.lane.mask is supported.
2433 // It is used in the MVETailPredication pass, which requires the number of
2434 // elements processed by this vector loop to set up the tail-predicated
2435 // loop.
2436 return TailFoldingStyle::Data;
2437}
2438void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2439 TTI::UnrollingPreferences &UP,
2440 OptimizationRemarkEmitter *ORE) {
2441 // Enable upper-bound unrolling universally, provided that we do not see an
2442 // active lane mask, which will be better kept as a loop to become tail
2443 // predicated than to be conditionally unrolled.
2444 UP.UpperBound =
2445 !ST->hasMVEIntegerOps() || !any_of(Range&: *L->getHeader(), P: [](Instruction &I) {
2446 return isa<IntrinsicInst>(I) &&
2447 cast<IntrinsicInst>(I).getIntrinsicID() ==
2448 Intrinsic::get_active_lane_mask;
2449 });
2450
2451 // Only currently enable these preferences for M-Class cores.
2452 if (!ST->isMClass())
2453 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2454
2455 // Disable loop unrolling for Oz and Os.
2456 UP.OptSizeThreshold = 0;
2457 UP.PartialOptSizeThreshold = 0;
2458 if (L->getHeader()->getParent()->hasOptSize())
2459 return;
2460
2461 SmallVector<BasicBlock*, 4> ExitingBlocks;
2462 L->getExitingBlocks(ExitingBlocks);
2463 LLVM_DEBUG(dbgs() << "Loop has:\n"
2464 << "Blocks: " << L->getNumBlocks() << "\n"
2465 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2466
2467 // Only allow another exit other than the latch. This acts as an early exit
2468 // as it mirrors the profitability calculation of the runtime unroller.
2469 if (ExitingBlocks.size() > 2)
2470 return;
2471
2472 // Limit the CFG of the loop body for targets with a branch predictor.
2473 // Allowing 4 blocks permits if-then-else diamonds in the body.
2474 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2475 return;
2476
2477 // Don't unroll vectorized loops, including the remainder loop
2478 if (getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized"))
2479 return;
2480
2481 // Scan the loop: don't unroll loops with calls as this could prevent
2482 // inlining.
2483 InstructionCost Cost = 0;
2484 for (auto *BB : L->getBlocks()) {
2485 for (auto &I : *BB) {
2486 // Don't unroll vectorised loops. MVE does not benefit from it as much as
2487 // scalar code.
2488 if (I.getType()->isVectorTy())
2489 return;
2490
2491 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
2492 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
2493 if (!isLoweredToCall(F))
2494 continue;
2495 }
2496 return;
2497 }
2498
2499 SmallVector<const Value*, 4> Operands(I.operand_values());
2500 Cost += getInstructionCost(U: &I, Operands,
2501 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
2502 }
2503 }
2504
2505 // On v6m cores, there are very few registers available. We can easily end up
2506 // spilling and reloading more registers in an unrolled loop. Look at the
2507 // number of LCSSA phis as a rough measure of how many registers will need to
2508 // be live out of the loop, reducing the default unroll count if more than 1
2509 // value is needed. In the long run, all of this should be learnt by a
2510 // machine.
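// e.g. with 2 live-out values the default count of 4 is halved to 2; with 3
// or more live-outs, runtime unrolling is skipped entirely.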
2511 unsigned UnrollCount = 4;
2512 if (ST->isThumb1Only()) {
2513 unsigned ExitingValues = 0;
2514 SmallVector<BasicBlock *, 4> ExitBlocks;
2515 L->getExitBlocks(ExitBlocks);
2516 for (auto *Exit : ExitBlocks) {
2517 // Count the number of LCSSA phis. Exclude values coming from GEPs as
2518 // only the last is expected to be needed for address operands.
2519 unsigned LiveOuts = count_if(Range: Exit->phis(), P: [](auto &PH) {
2520 return PH.getNumOperands() != 1 ||
2521 !isa<GetElementPtrInst>(PH.getOperand(0));
2522 });
2523 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2524 }
2525 if (ExitingValues)
2526 UnrollCount /= ExitingValues;
2527 if (UnrollCount <= 1)
2528 return;
2529 }
2530
2531 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2532 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2533
2534 UP.Partial = true;
2535 UP.Runtime = true;
2536 UP.UnrollRemainder = true;
2537 UP.DefaultUnrollRuntimeCount = UnrollCount;
2538 UP.UnrollAndJam = true;
2539 UP.UnrollAndJamInnerLoopThreshold = 60;
2540
2541 // Force-unrolling small loops can be very useful because of the
2542 // branch-taken cost of the backedge.
2543 if (Cost < 12)
2544 UP.Force = true;
2545}
2546
2547void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2548 TTI::PeelingPreferences &PP) {
2549 BaseT::getPeelingPreferences(L, SE, PP);
2550}
2551
2552bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2553 TTI::ReductionFlags Flags) const {
2554 if (!ST->hasMVEIntegerOps())
2555 return false;
2556
2557 unsigned ScalarBits = Ty->getScalarSizeInBits();
2558 switch (Opcode) {
2559 case Instruction::Add:
2560 return ScalarBits <= 64;
2561 default:
2562 return false;
2563 }
2564}
2565
2566bool ARMTTIImpl::preferPredicatedReductionSelect(
2567 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2568 if (!ST->hasMVEIntegerOps())
2569 return false;
2570 return true;
2571}
2572
2573InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2574 int64_t BaseOffset,
2575 bool HasBaseReg, int64_t Scale,
2576 unsigned AddrSpace) const {
2577 TargetLoweringBase::AddrMode AM;
2578 AM.BaseGV = BaseGV;
2579 AM.BaseOffs = BaseOffset;
2580 AM.HasBaseReg = HasBaseReg;
2581 AM.Scale = Scale;
2582 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) {
2583 if (ST->hasFPAO())
2584 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2585 return 0;
2586 }
2587 return -1;
2588}
2589
2590bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2591 if (Thumb) {
2592 // B.W is available in any Thumb2-supporting target, and also in every
2593 // version of Armv8-M, even Baseline which does not include the rest of
2594 // Thumb2.
2595 return ST->isThumb2() || ST->hasV8MBaselineOps();
2596 } else {
2597 // B is available in all versions of the Arm ISA, so the only question is
2598 // whether that ISA is available at all.
2599 return ST->hasARMOps();
2600 }
2601}
2602
