//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
    cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}

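// Pick the addressing mode the loop optimizer should favour for this loop:
// post-indexed when MVE is available, nothing special when optimizing for
// size, and pre-indexed for simple single-block Thumb2 M-class loops.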
TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  if (ST->isMClass() && ST->isThumb2() &&
      L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

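// Target-specific InstCombine folds for ARM intrinsics: improve the alignment
// argument of NEON vld/vst intrinsics, remove MVE predicate i2v/v2i
// round-trips, simplify the demanded carry bit of VADC, and merge a following
// add into the accumulator operand of a VMLDAVA.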
std::optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    unsigned AlignArg = II.arg_size() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnes()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown, 0)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    if (!II.getMetadata(LLVMContext::MD_range)) {
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      Metadata *M[] = {
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
      II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
      II.setMetadata(LLVMContext::MD_noundef,
                     MDNode::get(II.getContext(), std::nullopt));
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return std::nullopt;
  }
  }
  return std::nullopt;
}

std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is the
  // operand index of the constant selecting a Top/Bottom instruction, which
  // can differ between intrinsics.
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // Only the odd or even lanes of operand 0 are demanded, depending on
    // whether this is a top or a bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                                : APInt::getHighBitsSet(2, 1));
    return std::nullopt;
  };

  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return std::nullopt;
}

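// Estimate the number of instructions needed to materialize the immediate Imm
// in a register for the current subtarget (ARM, Thumb2 or Thumb1 encodings).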
InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constantpool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Returns the instruction being
// saturated, or null if no saturation pattern was found.
static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {

    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    if (isSSatMin(Inst->getOperand(1)))
      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
    if (Inst->hasNUses(2) &&
        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
      return Inst->getOperand(1);
  }
  return nullptr;
}

// Look for a FP Saturation pattern, where the instruction can be simplified to
// a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  if (Imm.getBitWidth() != 64 ||
      Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
    return false;
  Value *FP = isSSATMinMaxPattern(Inst, Imm);
  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
    FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
  if (!FP)
    return false;
  return isa<FPToSIInst>(FP);
}

InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for CodeGenPrepare, which will do a better job at
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN.
  if (Opcode == Instruction::Xor && Imm.isAllOnes())
    return 0;

  // Ensure that negative constants in min(max()) or max(min()) patterns that
  // match to SSAT instructions don't get hoisted.
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
    return 0;

  // We can convert <= -1 to < 0, which is generally quite cheap.
  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
      return std::min(getIntImmCost(Imm, Ty, CostKind),
                      getIntImmCost(Imm + 1, Ty, CostKind));
  }

  return getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

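// Cost of cast instructions. Extends and truncates that fold into (masked)
// loads and stores, and the NEON/MVE conversion instructions, are modelled
// with the target cost tables below; everything else falls back to the base
// implementation, scaled by the MVE beat factor for vector types.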
InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
           (EltVT == MVT::f64 && ST->hasFP64()) ||
           (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked loads and truncating masked stores are expensive because
  // we currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extend of other kinds of load is free.
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following extend from a legal type to an illegal type, so need
        // to split the load. This introduces an extra load operation, but the
        // extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of extends above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost 1 vcvt for each lane, so long as the instruction is
    // available. If not it will become a series of function calls.
    const InstructionCost CallCost =
        getCallInstrCost(nullptr, Dst, {Src}, CostKind);
    int Lanes = 1;
    if (SrcTy.isFixedLengthVector())
      Lanes = SrcTy.getVectorNumElements();

    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
      return Lanes;
    else
      return Lanes * CallCost;
  }

  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
      SrcTy.isFixedLengthVector()) {
    // Treat a truncate with a larger than legal source (128 bits for MVE) as
    // expensive, 2 instructions per lane.
    if ((SrcTy.getScalarType() == MVT::i8 ||
         SrcTy.getScalarType() == MVT::i16 ||
         SrcTy.getScalarType() == MVT::i32) &&
        SrcTy.getSizeInBits() > 128 &&
        SrcTy.getSizeInBits() > DstTy.getSizeInBits())
      return SrcTy.getVectorNumElements() * 2;
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
    { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return AdjustCost(
      BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}

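// Cost of insertelement/extractelement. These are penalized on NEON, where
// they imply GPR<->vector transfers or mixing of NEON and VFP code, and on
// MVE for integer lane moves that go through GPRs.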
InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  // Penalize inserting into a D-subregister. We end up with a three times
  // lower estimated throughput on Swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max<InstructionCost>(
          BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
          2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integer moves involve being passed to GPR
    // registers, causing more of a delay.
    std::pair<InstructionCost, MVT> LT =
        getTypeLegalizationCost(ValTy->getScalarType());
    return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
}

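// Cost of compare/select instructions. Vector min/max/abs select patterns are
// costed as the corresponding intrinsic, NEON vector selects use a small
// table of known-bad cases, and MVE compares account for the extra shuffles
// needed when the predicate type has to be split.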
InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Thumb scalar code size cost for select.
  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
      ST->isThumb() && !ValTy->isVectorTy()) {
    // Assume expensive structs.
    if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
      return TTI::TCC_Expensive;

    // Select costs can vary because they:
    // - may require one or more conditional mov (including an IT),
    // - can't operate directly on immediates,
    // - require live flags, which we can't copy around easily.
    InstructionCost Cost = getTypeLegalizationCost(ValTy).first;

    // Possible IT instruction for Thumb2, or more for Thumb1.
    ++Cost;

    // i1 values may need rematerialising by using mov immediates and/or
    // flag setting instructions.
    if (ValTy->isIntegerTy(1))
      ++Cost;

    return Cost;
  }

  // If this is a vector min/max/abs, use the cost of that intrinsic directly
  // instead. Hopefully when min/max intrinsics are more prevalent this code
  // will not be needed.
  const Instruction *Sel = I;
  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
      Sel->hasOneUse())
    Sel = cast<Instruction>(Sel->user_back());
  if (Sel && ValTy->isVectorTy() &&
      (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
    const Value *LHS, *RHS;
    SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
    unsigned IID = 0;
    switch (SPF) {
    case SPF_ABS:
      IID = Intrinsic::abs;
      break;
    case SPF_SMIN:
      IID = Intrinsic::smin;
      break;
    case SPF_SMAX:
      IID = Intrinsic::smax;
      break;
    case SPF_UMIN:
      IID = Intrinsic::umin;
      break;
    case SPF_UMAX:
      IID = Intrinsic::umax;
      break;
    case SPF_FMINNUM:
      IID = Intrinsic::minnum;
      break;
    case SPF_FMAXNUM:
      IID = Intrinsic::maxnum;
      break;
    default:
      break;
    }
    if (IID) {
      // The ICmp is free, the select gets the cost of the min/max/etc.
      if (Sel != I)
        return 0;
      IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
      return getIntrinsicInstrCost(CostAttrs, CostKind);
    }
  }

  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    return LT.first;
  }

  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
      (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
    FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
    FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
    if (!VecCondTy)
      VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));

    // If we don't have mve.fp any fp operations will need to be scalarized.
    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
      return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
                                             /*Extract*/ true, CostKind) +
             BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
                                             /*Extract*/ false, CostKind) +
             VecValTy->getNumElements() *
                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
                                    VecCondTy->getScalarType(), VecPred,
                                    CostKind, I);
    }

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    int BaseCost = ST->getMVEVectorCostFactor(CostKind);
    // There are two types - the input that specifies the type of the compare
    // and the output vXi1 type. Because we don't know how the output will be
    // split, we may need an expensive shuffle to get two in sync. This has the
    // effect of making larger than legal compares (v8i32 for example)
    // expensive.
    if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
      if (LT.first > 1)
        return LT.first * BaseCost +
               BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
                                               /*Extract*/ false, CostKind);
      return BaseCost;
    }
  }

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  return BaseCost *
         BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
                                                      ScalarEvolution *SE,
                                                      const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;
  int MaxMergeDistance = 64;

  if (ST->hasNEON()) {
    if (Ty->isVectorTy() && SE &&
        !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
      return NumVectorInstToHideOverhead;

    // In many cases the address computation is not merged into the instruction
    // addressing mode.
    return 1;
  }
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    // If a VCTP is part of a chain, it's already profitable and shouldn't be
    // optimized, else LSR may block tail-predication.
    switch (II->getIntrinsicID()) {
    case Intrinsic::arm_mve_vctp8:
    case Intrinsic::arm_mve_vctp16:
    case Intrinsic::arm_mve_vctp32:
    case Intrinsic::arm_mve_vctp64:
      return true;
    default:
      break;
    }
  }
  return false;
}

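// Masked loads are only legal with MVE, for 8-, 16- and 32-bit elements with
// sufficient alignment; v2i1 predicates and extending (non-128-bit)
// floating-point vector types are not supported.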
bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
    return false;

  if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
    // Don't support v2i1 yet.
    if (VecTy->getNumElements() == 2)
      return false;

    // We don't support extending fp types.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
    if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
      return false;
  }

  unsigned EltWidth = DataTy->getScalarSizeInBits();
  return (EltWidth == 32 && Alignment >= 4) ||
         (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
}

bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
    return false;

  unsigned EltWidth = Ty->getScalarSizeInBits();
  return ((EltWidth == 32 && Alignment >= 4) ||
          (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
}

/// Given a memcpy/memset/memmove instruction, return the number of memory
/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if
/// a call is used.
int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
  MemOp MOp;
  unsigned DstAddrSpace = ~0u;
  unsigned SrcAddrSpace = ~0u;
  const Function *F = I->getParent()->getParent();

  if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MC->getDestAlign();
    const Align SrcAlign = *MC->getSourceAlign();

    MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
                      /*IsVolatile*/ false);
    DstAddrSpace = MC->getDestAddressSpace();
    SrcAddrSpace = MC->getSourceAddressSpace();
  } else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
    ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
    // If 'size' is not a constant, a library call will be generated.
    if (!C)
      return -1;

    const unsigned Size = C->getValue().getZExtValue();
    const Align DstAlign = *MS->getDestAlign();

    MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
                     /*IsZeroMemset*/ false, /*IsVolatile*/ false);
    DstAddrSpace = MS->getDestAddressSpace();
  } else
    llvm_unreachable("Expected a memcpy/move or memset!");

  unsigned Limit, Factor = 2;
  switch (I->getIntrinsicID()) {
  case Intrinsic::memcpy:
    Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
    break;
  case Intrinsic::memmove:
    Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
    break;
  case Intrinsic::memset:
    Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
    Factor = 1;
    break;
  default:
    llvm_unreachable("Expected a memcpy/move or memset!");
  }

  // MemOps will be populated with a list of data types that need to be loaded
  // and stored. That's why we multiply the number of elements by 2 to get the
  // cost for this memcpy.
  std::vector<EVT> MemOps;
  if (getTLI()->findOptimalMemOpLowering(
          MemOps, Limit, MOp, DstAddrSpace,
          SrcAddrSpace, F->getAttributes()))
    return MemOps.size() * Factor;

  // If we can't find an optimal memop lowering, return the default cost.
  return -1;
}

InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
  int NumOps = getNumMemOps(cast<IntrinsicInst>(I));

  // To model the cost of a library call, we assume 1 for the call, and
  // 3 for the argument setup.
  if (NumOps == -1)
    return 4;
  return NumOps;
}

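// Shuffle costs. Broadcasts, reverses and select-style shuffles have NEON
// table entries; MVE costs broadcasts and VREV-style masks directly and
// scales everything else by the vector beat factor.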
InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *Tp, ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
  // Treat extractsubvector as single op permutation.
  bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
  if (IsExtractSubvector)
    Kind = TTI::SK_PermuteSingleSrc;
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.

          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost *
               ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
    }

    if (!Mask.empty()) {
      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
      if (LT.second.isVector() &&
          Mask.size() <= LT.second.getVectorNumElements() &&
          (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
           isVREVMask(Mask, LT.second, 64)))
        return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
    }
  }

  // Restore optimal kind.
  if (IsExtractSubvector)
    Kind = TTI::SK_ExtractSubvector;
  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                     ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
                     : 1;
  return BaseCost *
         BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

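// Arithmetic instruction costs. i1 logic is made relatively expensive for
// Thumb code size, NEON integer division is costed as library calls, shifts
// that fold into a following ALU instruction are free, and MVE vector
// operations are scaled by the beat factor.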
1326InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1327 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1328 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1329 ArrayRef<const Value *> Args,
1330 const Instruction *CxtI) {
1331 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1332 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(Bitwidth: 1)) {
1333 // Make operations on i1 relatively expensive as this often involves
1334 // combining predicates. AND and XOR should be easier to handle with IT
1335 // blocks.
1336 switch (ISDOpcode) {
1337 default:
1338 break;
1339 case ISD::AND:
1340 case ISD::XOR:
1341 return 2;
1342 case ISD::OR:
1343 return 3;
1344 }
1345 }
1346
1347 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1348
1349 if (ST->hasNEON()) {
1350 const unsigned FunctionCallDivCost = 20;
1351 const unsigned ReciprocalDivCost = 10;
1352 static const CostTblEntry CostTbl[] = {
1353 // Division.
1354 // These costs are somewhat random. Choose a cost of 20 to indicate that
1355 // vectorizing devision (added function call) is going to be very expensive.
1356 // Double registers types.
1357 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1358 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1359 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1360 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1361 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1362 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1363 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1364 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1365 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1366 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1367 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1368 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1369 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1370 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1371 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1372 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1373 // Quad register types.
1374 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1375 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1376 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1377 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1378 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1379 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1380 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1381 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1382 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1383 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1384 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1385 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1386 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1387 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1388 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1389 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1390 // Multiplication.
1391 };
1392
1393 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1394 return LT.first * Entry->Cost;
1395
1396 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1397 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
1398
1399 // This is somewhat of a hack. The problem that we are facing is that SROA
1400 // creates a sequence of shift, and, or instructions to construct values.
1401 // These sequences are recognized by the ISel and have zero-cost. Not so for
1402 // the vectorized code. Because we have support for v2i64 but not i64 those
1403 // sequences look particularly beneficial to vectorize.
1404 // To work around this we increase the cost of v2i64 operations to make them
1405 // seem less beneficial.
1406 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1407 Cost += 4;
1408
1409 return Cost;
1410 }
1411
1412 // If this operation is a shift on arm/thumb2, it might well be folded into
1413 // the following instruction, hence having a cost of 0.
1414 auto LooksLikeAFreeShift = [&]() {
1415 if (ST->isThumb1Only() || Ty->isVectorTy())
1416 return false;
1417
1418 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1419 return false;
1420 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1421 return false;
1422
1423 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1424 switch (cast<Instruction>(Val: CxtI->user_back())->getOpcode()) {
1425 case Instruction::Add:
1426 case Instruction::Sub:
1427 case Instruction::And:
1428 case Instruction::Xor:
1429 case Instruction::Or:
1430 case Instruction::ICmp:
1431 return true;
1432 default:
1433 return false;
1434 }
1435 };
1436 if (LooksLikeAFreeShift())
1437 return 0;
1438
1439 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1440 // for "multiple beats" potentially needed by MVE instructions.
1441 int BaseCost = 1;
1442 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1443 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1444
1445 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1446 // without treating floats as more expensive than scalars or increasing the
1447 // costs for custom operations. The result is also multiplied by the
1448 // MVEVectorCostFactor where appropriate.
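// e.g. a legal v4i32 add under MVE is costed as
// LT.first * getMVEVectorCostFactor(CostKind), whereas the same op on a
// scalar or NEON-only target keeps BaseCost = 1.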
1449 if (TLI->isOperationLegalOrCustomOrPromote(Op: ISDOpcode, VT: LT.second))
1450 return LT.first * BaseCost;
1451
1452 // Else this is expand, assume that we need to scalarize this op.
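// e.g. an expanded op on <4 x i32> is costed as 4 scalar ops plus the
// insert/extract overhead of building and deconstructing the vector.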
1453 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
1454 unsigned Num = VTy->getNumElements();
1455 InstructionCost Cost =
1456 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind);
1457 // Return the cost of multiple scalar invocations plus the cost of
1458 // inserting and extracting the values.
1459 SmallVector<Type *> Tys(Args.size(), Ty);
1460 return BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind) +
1461 Num * Cost;
1462 }
1463
1464 return BaseCost;
1465}
1466
1467InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1468 MaybeAlign Alignment,
1469 unsigned AddressSpace,
1470 TTI::TargetCostKind CostKind,
1471 TTI::OperandValueInfo OpInfo,
1472 const Instruction *I) {
1473 // TODO: Handle other cost kinds.
1474 if (CostKind != TTI::TCK_RecipThroughput)
1475 return 1;
1476
1477 // Type legalization can't handle structs
1478 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1479 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1480 CostKind);
1481
1482 if (ST->hasNEON() && Src->isVectorTy() &&
1483 (Alignment && *Alignment != Align(16)) &&
1484 cast<VectorType>(Val: Src)->getElementType()->isDoubleTy()) {
1485 // Unaligned loads/stores are extremely inefficient.
1486 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
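// e.g. a <2 x double> access that is not 16-byte aligned is costed as
// LT.first * 4 here instead of as a single vldr/vstr.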
1487 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
1488 return LT.first * 4;
1489 }
1490
1491 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1492 // Same for stores.
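// e.g. a 'load <4 x half>' whose only user is an 'fpext ... to <4 x float>'
// (or the matching fptrunc+store) is costed as one factor-weighted operation.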
1493 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Val: Src) && I &&
1494 ((Opcode == Instruction::Load && I->hasOneUse() &&
1495 isa<FPExtInst>(Val: *I->user_begin())) ||
1496 (Opcode == Instruction::Store && isa<FPTruncInst>(Val: I->getOperand(i: 0))))) {
1497 FixedVectorType *SrcVTy = cast<FixedVectorType>(Val: Src);
1498 Type *DstTy =
1499 Opcode == Instruction::Load
1500 ? (*I->user_begin())->getType()
1501 : cast<Instruction>(Val: I->getOperand(i: 0))->getOperand(i: 0)->getType();
1502 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1503 DstTy->getScalarType()->isFloatTy())
1504 return ST->getMVEVectorCostFactor(CostKind);
1505 }
1506
1507 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1508 ? ST->getMVEVectorCostFactor(CostKind)
1509 : 1;
1510 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1511 CostKind, OpInfo, I);
1512}
1513
1514InstructionCost
1515ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1516 unsigned AddressSpace,
1517 TTI::TargetCostKind CostKind) {
1518 if (ST->hasMVEIntegerOps()) {
1519 if (Opcode == Instruction::Load && isLegalMaskedLoad(DataTy: Src, Alignment))
1520 return ST->getMVEVectorCostFactor(CostKind);
1521 if (Opcode == Instruction::Store && isLegalMaskedStore(DataTy: Src, Alignment))
1522 return ST->getMVEVectorCostFactor(CostKind);
1523 }
1524 if (!isa<FixedVectorType>(Val: Src))
1525 return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace,
1526 CostKind);
1527 // Scalar cost, which is currently very high due to the inefficiency of the
1528 // generated code.
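// e.g. a masked access on <4 x i32> that cannot use an MVE masked load/store
// is costed as 4 * 8 = 32.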
1529 return cast<FixedVectorType>(Val: Src)->getNumElements() * 8;
1530}
1531
1532InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1533 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1534 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1535 bool UseMaskForCond, bool UseMaskForGaps) {
1536 assert(Factor >= 2 && "Invalid interleave factor");
1537 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1538
1539 // vldN/vstN doesn't support vector types of i64/f64 element.
1540 bool EltIs64Bits = DL.getTypeSizeInBits(Ty: VecTy->getScalarType()) == 64;
1541
1542 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1543 !UseMaskForCond && !UseMaskForGaps) {
1544 unsigned NumElts = cast<FixedVectorType>(Val: VecTy)->getNumElements();
1545 auto *SubVecTy =
1546 FixedVectorType::get(ElementType: VecTy->getScalarType(), NumElts: NumElts / Factor);
1547
1548 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1549 // Accesses having vector types that are a multiple of 128 bits can be
1550 // matched to more than one vldN/vstN instruction.
1551 int BaseCost =
1552 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1553 if (NumElts % Factor == 0 &&
1554 TLI->isLegalInterleavedAccessType(Factor, VecTy: SubVecTy, Alignment, DL))
1555 return Factor * BaseCost * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL);
1556
1557 // Some smaller than legal interleaved patterns are cheap as we can make
1558 // use of the vmovn or vrev patterns to interleave a standard load. This is
1559 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1560 // promoted differently). The cost of 2 here is then a load and vrev or
1561 // vmovn.
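// e.g. under MVE, a factor-2 access on <8 x i8> (a v4i8 sub-vector, 32 bits)
// is costed as 2 * BaseCost: one load/store plus one vrev/vmovn.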
1562 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1563 VecTy->isIntOrIntVectorTy() &&
1564 DL.getTypeSizeInBits(Ty: SubVecTy).getFixedValue() <= 64)
1565 return 2 * BaseCost;
1566 }
1567
1568 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1569 Alignment, AddressSpace, CostKind,
1570 UseMaskForCond, UseMaskForGaps);
1571}
1572
1573InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1574 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1575 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1576 using namespace PatternMatch;
1577 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1578 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1579 Alignment, CostKind, I);
1580
1581 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1582 auto *VTy = cast<FixedVectorType>(Val: DataTy);
1583
1584 // TODO: Splitting, once we do that.
1585
1586 unsigned NumElems = VTy->getNumElements();
1587 unsigned EltSize = VTy->getScalarSizeInBits();
1588 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: DataTy);
1589
1590 // For now, it is assumed that for the MVE gather instructions the loads are
1591 // all effectively serialised. This means the cost is the scalar cost
1592 // multiplied by the number of elements being loaded. This is possibly very
1593 // conservative, but even so we still end up vectorising loops because the
1594 // cost per iteration for many loops is lower than for scalar loops.
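// e.g. a <4 x i32> gather that legalizes to a single v4i32 has
// VectorCost = NumElems (4) * LT.first * getMVEVectorCostFactor(CostKind).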
1595 InstructionCost VectorCost =
1596 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1597 // The scalarization cost should be a lot higher. We use the number of vector
1598 // elements plus the scalarization overhead. If masking is required then a lot
1599 // of little blocks will be needed and potentially a scalarized p0 mask,
1600 // greatly increasing the cost.
1601 InstructionCost ScalarCost =
1602 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1603 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ true, /*Extract*/ false,
1604 CostKind) +
1605 BaseT::getScalarizationOverhead(InTy: VTy, /*Insert*/ false, /*Extract*/ true,
1606 CostKind);
1607
1608 if (EltSize < 8 || Alignment < EltSize / 8)
1609 return ScalarCost;
1610
1611 unsigned ExtSize = EltSize;
1612 // Check whether there's a single user that asks for an extended type
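// e.g. a <4 x i8> gather whose single user is a zext to <4 x i32> sets
// ExtSize to 32 (4 * 32 == 128), so it can later be treated as an extending
// gather rather than being scalarised.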
1613 if (I != nullptr) {
1614 // Depending on the caller of this function, a gather instruction will
1615 // either have opcode Instruction::Load or be a call to the masked_gather
1616 // intrinsic
1617 if ((I->getOpcode() == Instruction::Load ||
1618 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1619 I->hasOneUse()) {
1620 const User *Us = *I->users().begin();
1621 if (isa<ZExtInst>(Val: Us) || isa<SExtInst>(Val: Us)) {
1622 // only allow valid type combinations
1623 unsigned TypeSize =
1624 cast<Instruction>(Val: Us)->getType()->getScalarSizeInBits();
1625 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1626 (TypeSize == 16 && EltSize == 8)) &&
1627 TypeSize * NumElems == 128) {
1628 ExtSize = TypeSize;
1629 }
1630 }
1631 }
1632 // Check whether the input data needs to be truncated
1633 TruncInst *T;
1634 if ((I->getOpcode() == Instruction::Store ||
1635 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1636 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1637 // Only allow valid type combinations
1638 unsigned TypeSize = T->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits();
1639 if (((EltSize == 16 && TypeSize == 32) ||
1640 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1641 TypeSize * NumElems == 128)
1642 ExtSize = TypeSize;
1643 }
1644 }
1645
1646 if (ExtSize * NumElems != 128 || NumElems < 4)
1647 return ScalarCost;
1648
1649 // Any (aligned) i32 gather will not need to be scalarised.
1650 if (ExtSize == 32)
1651 return VectorCost;
1652 // For smaller types, we need to ensure that the gep's inputs are correctly
1653 // extended from a small enough value. Other sizes (including i64) are
1654 // scalarized for now.
1655 if (ExtSize != 8 && ExtSize != 16)
1656 return ScalarCost;
1657
1658 if (const auto *BC = dyn_cast<BitCastInst>(Val: Ptr))
1659 Ptr = BC->getOperand(i_nocapture: 0);
1660 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr)) {
1661 if (GEP->getNumOperands() != 2)
1662 return ScalarCost;
1663 unsigned Scale = DL.getTypeAllocSize(Ty: GEP->getResultElementType());
1664 // Scale needs to be correct (which is only relevant for i16s).
1665 if (Scale != 1 && Scale * 8 != ExtSize)
1666 return ScalarCost;
1667 // And we need to zext (not sext) the indexes from a small enough type.
1668 if (const auto *ZExt = dyn_cast<ZExtInst>(Val: GEP->getOperand(i_nocapture: 1))) {
1669 if (ZExt->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <= ExtSize)
1670 return VectorCost;
1671 }
1672 return ScalarCost;
1673 }
1674 return ScalarCost;
1675}
1676
1677InstructionCost
1678ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1679 std::optional<FastMathFlags> FMF,
1680 TTI::TargetCostKind CostKind) {
1681
1682 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1683 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1684 unsigned EltSize = ValVT.getScalarSizeInBits();
1685
1686 // In general floating point reductions are a series of elementwise
1687 // operations, with free extracts on each step. These are either in-order or
1688 // treewise depending on whether that is allowed by the fast math flags.
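// For example, a fast-math v8f32 fadd reduction with MVE is costed as one
// v4f32 vector fadd (halving 8 -> 4 elements to reach the 128-bit limit)
// plus 4 scalar fadds for the remaining elements.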
1689 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1690 ((EltSize == 32 && ST->hasVFP2Base()) ||
1691 (EltSize == 64 && ST->hasFP64()) ||
1692 (EltSize == 16 && ST->hasFullFP16()))) {
1693 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1694 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1695 InstructionCost VecCost = 0;
1696 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(Value: NumElts) &&
1697 NumElts * EltSize > VecLimit) {
1698 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1699 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1700 NumElts /= 2;
1701 }
1702
1703 // For fp16 we need to extract the upper lane elements. MVE can add a
1704 // VREV+FMIN/MAX to perform another vector step instead.
1705 InstructionCost ExtractCost = 0;
1706 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1707 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1708 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1709 NumElts /= 2;
1710 } else if (ValVT.getVectorElementType() == MVT::f16)
1711 ExtractCost = NumElts / 2;
1712
1713 return VecCost + ExtractCost +
1714 NumElts *
1715 getArithmeticInstrCost(Opcode, Ty: ValTy->getElementType(), CostKind);
1716 }
1717
1718 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1719 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1720 unsigned NumElts = cast<FixedVectorType>(Val: ValTy)->getNumElements();
1721 unsigned VecLimit =
1722 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1723 InstructionCost VecCost = 0;
1724 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1725 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts: NumElts / 2);
1726 VecCost += getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1727 NumElts /= 2;
1728 }
1729 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1730 // step.
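// e.g. a v8i8 and-reduction on MVE takes one such 64-bit vector step (the
// MVE factor plus a v8i8 'and'), then 4 extracts and 3 scalar 'and's.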
1731 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1732 NumElts * EltSize == 64) {
1733 Type *VecTy = FixedVectorType::get(ElementType: ValTy->getElementType(), NumElts);
1734 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1735 getArithmeticInstrCost(Opcode, Ty: VecTy, CostKind);
1736 NumElts /= 2;
1737 }
1738
1739 // From here we extract the elements and perform the and/or/xor.
1740 InstructionCost ExtractCost = NumElts;
1741 return VecCost + ExtractCost +
1742 (NumElts - 1) * getArithmeticInstrCost(
1743 Opcode, Ty: ValTy->getElementType(), CostKind);
1744 }
1745
1746 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1747 TTI::requiresOrderedReduction(FMF))
1748 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1749
1750 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1751
1752 static const CostTblEntry CostTblAdd[]{
1753 {ISD::ADD, MVT::v16i8, 1},
1754 {ISD::ADD, MVT::v8i16, 1},
1755 {ISD::ADD, MVT::v4i32, 1},
1756 };
1757 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1758 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1759
1760 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1761}
1762
1763InstructionCost ARMTTIImpl::getExtendedReductionCost(
1764 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1765 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1766 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1767 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1768
1769 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1770
1771 switch (ISD) {
1772 case ISD::ADD:
1773 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1774 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1775
1776 // The legal cases are:
1777 // VADDV u/s 8/16/32
1778 // VADDLV u/s 32
1779 // Codegen currently cannot always handle larger than legal vectors very
1780 // well, especially for predicated reductions where the mask needs to be
1781 // split, so restrict to 128bit or smaller input types.
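// e.g. vecreduce.add(zext <16 x i8> to <16 x i32>) with an i32 result maps
// onto VADDV.u8 and is costed as MVEVectorCostFactor * LT.first below.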
1782 unsigned RevVTSize = ResVT.getSizeInBits();
1783 if (ValVT.getSizeInBits() <= 128 &&
1784 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1785 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1786 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1787 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1788 }
1789 break;
1790 default:
1791 break;
1792 }
1793 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy, FMF,
1794 CostKind);
1795}
1796
1797InstructionCost
1798ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1799 VectorType *ValTy,
1800 TTI::TargetCostKind CostKind) {
1801 EVT ValVT = TLI->getValueType(DL, Ty: ValTy);
1802 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
1803
1804 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1805 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1806
1807 // The legal cases are:
1808 // VMLAV u/s 8/16/32
1809 // VMLALV u/s 16/32
1810 // Codegen currently cannot always handle larger than legal vectors very
1811 // well, especially for predicated reductions where the mask needs to be
1812 // split, so restrict to 128bit or smaller input types.
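// e.g. a multiply-accumulate reduction of <8 x i16> operands into an i64
// result maps onto VMLALV.u16/s16 (v8i16 with a 64-bit result).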
1813 unsigned RevVTSize = ResVT.getSizeInBits();
1814 if (ValVT.getSizeInBits() <= 128 &&
1815 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1816 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1817 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1818 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1819 }
1820
1821 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, Ty: ValTy, CostKind);
1822}
1823
1824InstructionCost
1825ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1826 FastMathFlags FMF,
1827 TTI::TargetCostKind CostKind) {
1828 EVT ValVT = TLI->getValueType(DL, Ty);
1829
1830 // In general floating point reductions are a series of elementwise
1831 // operations, with free extracts on each step. These are either in-order or
1832 // treewise depending on whether that is allowed by the fast math flags.
1833 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1834 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1835 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1836 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1837 unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
1838 unsigned EltSize = ValVT.getScalarSizeInBits();
1839 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1840 InstructionCost VecCost;
1841 while (isPowerOf2_32(Value: NumElts) && NumElts * EltSize > VecLimit) {
1842 Type *VecTy = FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElts/2);
1843 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1844 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1845 NumElts /= 2;
1846 }
1847
1848 // For fp16 we need to extract the upper lane elements. MVE can add a
1849 // VREV+FMIN/MAX to perform another vector step instead.
1850 InstructionCost ExtractCost = 0;
1851 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1852 NumElts == 8) {
1853 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1854 NumElts /= 2;
1855 } else if (ValVT.getVectorElementType() == MVT::f16)
1856 ExtractCost = cast<FixedVectorType>(Val: Ty)->getNumElements() / 2;
1857
1858 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
1859 {Ty->getElementType(), Ty->getElementType()},
1860 FMF);
1861 return VecCost + ExtractCost +
1862 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1863 }
1864
1865 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1866 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1867 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1868
1869 // All costs are the same for u/s min/max. These lower to vminv, which are
1870 // given a slightly higher cost as they tend to take multiple cycles for
1871 // smaller type sizes.
1872 static const CostTblEntry CostTblAdd[]{
1873 {ISD::SMIN, MVT::v16i8, 4},
1874 {ISD::SMIN, MVT::v8i16, 3},
1875 {ISD::SMIN, MVT::v4i32, 2},
1876 };
1877 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
1878 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1879 }
1880
1881 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1882}
1883
1884InstructionCost
1885ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1886 TTI::TargetCostKind CostKind) {
1887 switch (ICA.getID()) {
1888 case Intrinsic::get_active_lane_mask:
1889 // Currently we make a somewhat optimistic assumption that
1890 // active_lane_masks are always free. In reality it may be freely folded
1891 // into a tail predicated loop, expanded into a VCTP or expanded into a lot
1892 // of add/icmp code. We may need to improve this in the future, but being
1893 // able to detect if it is free or not involves looking at a lot of other
1894 // code. We currently assume that the vectorizer inserted these, and knew
1895 // what it was doing in adding one.
1896 if (ST->hasMVEIntegerOps())
1897 return 0;
1898 break;
1899 case Intrinsic::sadd_sat:
1900 case Intrinsic::ssub_sat:
1901 case Intrinsic::uadd_sat:
1902 case Intrinsic::usub_sat: {
1903 if (!ST->hasMVEIntegerOps())
1904 break;
1905 Type *VT = ICA.getReturnType();
1906
1907 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
1908 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1909 LT.second == MVT::v16i8) {
1910 // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1911 // need to extend the type, as it uses shr(qadd(shl, shl)).
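// e.g. <8 x i16> uadd.sat maps straight onto a vqadd (Instrs = 1), while a
// type that must first be extended to a legal width uses the 4-instruction
// shl/shl + vqadd + shr sequence.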
1912 unsigned Instrs =
1913 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1914 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1915 }
1916 break;
1917 }
1918 case Intrinsic::abs:
1919 case Intrinsic::smin:
1920 case Intrinsic::smax:
1921 case Intrinsic::umin:
1922 case Intrinsic::umax: {
1923 if (!ST->hasMVEIntegerOps())
1924 break;
1925 Type *VT = ICA.getReturnType();
1926
1927 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
1928 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1929 LT.second == MVT::v16i8)
1930 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1931 break;
1932 }
1933 case Intrinsic::minnum:
1934 case Intrinsic::maxnum: {
1935 if (!ST->hasMVEFloatOps())
1936 break;
1937 Type *VT = ICA.getReturnType();
1938 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VT);
1939 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1940 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1941 break;
1942 }
1943 case Intrinsic::fptosi_sat:
1944 case Intrinsic::fptoui_sat: {
1945 if (ICA.getArgTypes().empty())
1946 break;
1947 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1948 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
1949 EVT MTy = TLI->getValueType(DL, Ty: ICA.getReturnType());
1950 // Check for the legal types, with the correct subtarget features.
1951 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1952 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1953 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1954 return LT.first;
1955
1956 // Equally for MVE vector types
1957 if (ST->hasMVEFloatOps() &&
1958 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1959 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1960 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1961
1962 // Otherwise we use a legal convert followed by a min+max
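// e.g. llvm.fptosi.sat.i16.f32 with VFP2: the f32 convert itself is legal
// (cost 1), and the saturation is modelled as an i32 smin plus an i32 smax,
// all scaled by LT.first.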
1963 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1964 (ST->hasFP64() && LT.second == MVT::f64) ||
1965 (ST->hasFullFP16() && LT.second == MVT::f16) ||
1966 (ST->hasMVEFloatOps() &&
1967 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1968 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1969 Type *LegalTy = Type::getIntNTy(C&: ICA.getReturnType()->getContext(),
1970 N: LT.second.getScalarSizeInBits());
1971 InstructionCost Cost =
1972 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1973 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1974 : Intrinsic::umin,
1975 LegalTy, {LegalTy, LegalTy});
1976 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
1977 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1978 : Intrinsic::umax,
1979 LegalTy, {LegalTy, LegalTy});
1980 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
1981 return LT.first * Cost;
1982 }
1983 break;
1984 }
1985 }
1986
1987 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1988}
1989
1990bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1991 if (!F->isIntrinsic())
1992 return BaseT::isLoweredToCall(F);
1993
1994 // Assume all Arm-specific intrinsics map to an instruction.
1995 if (F->getName().starts_with(Prefix: "llvm.arm"))
1996 return false;
1997
1998 switch (F->getIntrinsicID()) {
1999 default: break;
2000 case Intrinsic::powi:
2001 case Intrinsic::sin:
2002 case Intrinsic::cos:
2003 case Intrinsic::pow:
2004 case Intrinsic::log:
2005 case Intrinsic::log10:
2006 case Intrinsic::log2:
2007 case Intrinsic::exp:
2008 case Intrinsic::exp2:
2009 return true;
2010 case Intrinsic::sqrt:
2011 case Intrinsic::fabs:
2012 case Intrinsic::copysign:
2013 case Intrinsic::floor:
2014 case Intrinsic::ceil:
2015 case Intrinsic::trunc:
2016 case Intrinsic::rint:
2017 case Intrinsic::nearbyint:
2018 case Intrinsic::round:
2019 case Intrinsic::canonicalize:
2020 case Intrinsic::lround:
2021 case Intrinsic::llround:
2022 case Intrinsic::lrint:
2023 case Intrinsic::llrint:
2024 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2025 return true;
2026 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2027 return true;
2028 // Some operations can be handled by vector instructions; assume that
2029 // unsupported vectors will be expanded into supported scalar ones.
2030 // TODO: Handle scalar operations properly.
2031 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2032 case Intrinsic::masked_store:
2033 case Intrinsic::masked_load:
2034 case Intrinsic::masked_gather:
2035 case Intrinsic::masked_scatter:
2036 return !ST->hasMVEIntegerOps();
2037 case Intrinsic::sadd_with_overflow:
2038 case Intrinsic::uadd_with_overflow:
2039 case Intrinsic::ssub_with_overflow:
2040 case Intrinsic::usub_with_overflow:
2041 case Intrinsic::sadd_sat:
2042 case Intrinsic::uadd_sat:
2043 case Intrinsic::ssub_sat:
2044 case Intrinsic::usub_sat:
2045 return false;
2046 }
2047
2048 return BaseT::isLoweredToCall(F);
2049}
2050
2051bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
2052 unsigned ISD = TLI->InstructionOpcodeToISD(Opcode: I.getOpcode());
2053 EVT VT = TLI->getValueType(DL, Ty: I.getType(), AllowUnknown: true);
2054 if (TLI->getOperationAction(Op: ISD, VT) == TargetLowering::LibCall)
2055 return true;
2056
2057 // Check if an intrinsic will be lowered to a call and assume that any
2058 // other CallInst will generate a bl.
2059 if (auto *Call = dyn_cast<CallInst>(Val: &I)) {
2060 if (auto *II = dyn_cast<IntrinsicInst>(Val: Call)) {
2061 switch(II->getIntrinsicID()) {
2062 case Intrinsic::memcpy:
2063 case Intrinsic::memset:
2064 case Intrinsic::memmove:
2065 return getNumMemOps(I: II) == -1;
2066 default:
2067 if (const Function *F = Call->getCalledFunction())
2068 return isLoweredToCall(F);
2069 }
2070 }
2071 return true;
2072 }
2073
2074 // FPv5 provides conversions between integer, double-precision,
2075 // single-precision, and half-precision formats.
2076 switch (I.getOpcode()) {
2077 default:
2078 break;
2079 case Instruction::FPToSI:
2080 case Instruction::FPToUI:
2081 case Instruction::SIToFP:
2082 case Instruction::UIToFP:
2083 case Instruction::FPTrunc:
2084 case Instruction::FPExt:
2085 return !ST->hasFPARMv8Base();
2086 }
2087
2088 // FIXME: Unfortunately the approach of checking the Operation Action does
2089 // not catch all cases of Legalization that use library calls. Our
2090 // Legalization step categorizes some transformations into library calls as
2091 // Custom, Expand or even Legal when doing type legalization. So for now
2092 // we have to special-case, for instance, the SDIV of 64-bit integers and
2093 // the use of floating-point emulation.
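// e.g. a 64-bit sdiv/srem is typically lowered to a runtime call such as
// __aeabi_ldivmod on AEABI targets, so it is treated as a call here.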
2094 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2095 switch (ISD) {
2096 default:
2097 break;
2098 case ISD::SDIV:
2099 case ISD::UDIV:
2100 case ISD::SREM:
2101 case ISD::UREM:
2102 case ISD::SDIVREM:
2103 case ISD::UDIVREM:
2104 return true;
2105 }
2106 }
2107
2108 // Assume all other non-float operations are supported.
2109 if (!VT.isFloatingPoint())
2110 return false;
2111
2112 // We'll need a library call to handle most floats when using soft-float.
2113 if (TLI->useSoftFloat()) {
2114 switch (I.getOpcode()) {
2115 default:
2116 return true;
2117 case Instruction::Alloca:
2118 case Instruction::Load:
2119 case Instruction::Store:
2120 case Instruction::Select:
2121 case Instruction::PHI:
2122 return false;
2123 }
2124 }
2125
2126 // We'll need a libcall to perform double precision operations on a single
2127 // precision only FPU.
2128 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2129 return true;
2130
2131 // Likewise for half precision arithmetic.
2132 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2133 return true;
2134
2135 return false;
2136}
2137
2138bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2139 AssumptionCache &AC,
2140 TargetLibraryInfo *LibInfo,
2141 HardwareLoopInfo &HWLoopInfo) {
2142 // Low-overhead branches are only supported in the 'low-overhead branch'
2143 // extension of v8.1-m.
2144 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2145 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2146 return false;
2147 }
2148
2149 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2150 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2151 return false;
2152 }
2153
2154 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2155 if (isa<SCEVCouldNotCompute>(Val: BackedgeTakenCount)) {
2156 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2157 return false;
2158 }
2159
2160 const SCEV *TripCountSCEV =
2161 SE.getAddExpr(LHS: BackedgeTakenCount,
2162 RHS: SE.getOne(Ty: BackedgeTakenCount->getType()));
2163
2164 // We need to store the trip count in LR, a 32-bit register.
2165 if (SE.getUnsignedRangeMax(S: TripCountSCEV).getBitWidth() > 32) {
2166 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2167 return false;
2168 }
2169
2170 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2171 // point in generating a hardware loop if that's going to happen.
2172
2173 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2174 if (auto *Call = dyn_cast<IntrinsicInst>(Val: &I)) {
2175 switch (Call->getIntrinsicID()) {
2176 default:
2177 break;
2178 case Intrinsic::start_loop_iterations:
2179 case Intrinsic::test_start_loop_iterations:
2180 case Intrinsic::loop_decrement:
2181 case Intrinsic::loop_decrement_reg:
2182 return true;
2183 }
2184 }
2185 return false;
2186 };
2187
2188 // Scan the instructions to see if there's any that we know will turn into a
2189 // call or if this loop is already a low-overhead loop or will become a tail
2190 // predicated loop.
2191 bool IsTailPredLoop = false;
2192 auto ScanLoop = [&](Loop *L) {
2193 for (auto *BB : L->getBlocks()) {
2194 for (auto &I : *BB) {
2195 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2196 isa<InlineAsm>(Val: I)) {
2197 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2198 return false;
2199 }
2200 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2201 IsTailPredLoop |=
2202 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2203 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2204 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2205 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2206 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2207 }
2208 }
2209 return true;
2210 };
2211
2212 // Visit inner loops.
2213 for (auto *Inner : *L)
2214 if (!ScanLoop(Inner))
2215 return false;
2216
2217 if (!ScanLoop(L))
2218 return false;
2219
2220 // TODO: Check whether the trip count calculation is expensive. If L is the
2221 // inner loop but we know it has a low trip count, calculating that trip
2222 // count (in the parent loop) may be detrimental.
2223
2224 LLVMContext &C = L->getHeader()->getContext();
2225 HWLoopInfo.CounterInReg = true;
2226 HWLoopInfo.IsNestingLegal = false;
2227 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2228 HWLoopInfo.CountType = Type::getInt32Ty(C);
2229 HWLoopInfo.LoopDecrement = ConstantInt::get(Ty: HWLoopInfo.CountType, V: 1);
2230 return true;
2231}
2232
2233static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2234 // We don't allow icmps, and because we only look at single block loops,
2235 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2236 if (isa<ICmpInst>(Val: &I) && ++ICmpCount > 1)
2237 return false;
2238 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2239 // not currently canonical, but soon will be. Code without them uses icmp, and
2240 // so is not tail predicated as per the condition above. In order to get the
2241 // same performance we treat min and max the same as an icmp for tailpred
2242 // purposes for the moment (we often rely on non-tailpred and higher VFs to
2243 // pick more optimal instructions like VQDMULH. They need to be recognized
2244 // directly by the vectorizer).
2245 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
2246 if ((II->getIntrinsicID() == Intrinsic::smin ||
2247 II->getIntrinsicID() == Intrinsic::smax ||
2248 II->getIntrinsicID() == Intrinsic::umin ||
2249 II->getIntrinsicID() == Intrinsic::umax) &&
2250 ++ICmpCount > 1)
2251 return false;
2252
2253 if (isa<FCmpInst>(Val: &I))
2254 return false;
2255
2256 // We could allow extending/narrowing FP loads/stores, but codegen is
2257 // too inefficient so reject this for now.
2258 if (isa<FPExtInst>(Val: &I) || isa<FPTruncInst>(Val: &I))
2259 return false;
2260
2261 // Extends have to be extending-loads
2262 if (isa<SExtInst>(Val: &I) || isa<ZExtInst>(Val: &I) )
2263 if (!I.getOperand(i: 0)->hasOneUse() || !isa<LoadInst>(Val: I.getOperand(i: 0)))
2264 return false;
2265
2266 // Truncs have to be narrowing-stores
2267 if (isa<TruncInst>(Val: &I) )
2268 if (!I.hasOneUse() || !isa<StoreInst>(Val: *I.user_begin()))
2269 return false;
2270
2271 return true;
2272}
2273
2274// To set up a tail-predicated loop, we need to know the total number of
2275// elements processed by that loop. Thus, we need to determine the element
2276// size and:
2277// 1) it should be uniform for all operations in the vector loop, so we
2278// e.g. don't want any widening/narrowing operations.
2279// 2) it should be smaller than i64s because we don't have vector operations
2280// that work on i64s.
2281// 3) we don't want elements to be reversed or shuffled, to make sure the
2282// tail-predication masks/predicates the right lanes.
2283//
2284static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2285 const DataLayout &DL,
2286 const LoopAccessInfo *LAI) {
2287 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2288
2289 // If there are live-out values, it is probably a reduction. We can predicate
2290 // most reduction operations freely under MVE using a combination of
2291 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2292 // floating point and integer reductions, but don't check for operators
2293 // specifically here. If the value ends up not being a reduction (and so the
2294 // vectorizer cannot tailfold the loop), we should fall back to standard
2295 // vectorization automatically.
2296 SmallVector< Instruction *, 8 > LiveOuts;
2297 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2298 bool ReductionsDisabled =
2299 EnableTailPredication == TailPredication::EnabledNoReductions ||
2300 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2301
2302 for (auto *I : LiveOuts) {
2303 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2304 !I->getType()->isHalfTy()) {
2305 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2306 "live-out value\n");
2307 return false;
2308 }
2309 if (ReductionsDisabled) {
2310 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2311 return false;
2312 }
2313 }
2314
2315 // Next, check that all instructions can be tail-predicated.
2316 PredicatedScalarEvolution PSE = LAI->getPSE();
2317 SmallVector<Instruction *, 16> LoadStores;
2318 int ICmpCount = 0;
2319
2320 for (BasicBlock *BB : L->blocks()) {
2321 for (Instruction &I : BB->instructionsWithoutDebug()) {
2322 if (isa<PHINode>(Val: &I))
2323 continue;
2324 if (!canTailPredicateInstruction(I, ICmpCount)) {
2325 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2326 return false;
2327 }
2328
2329 Type *T = I.getType();
2330 if (T->getScalarSizeInBits() > 32) {
2331 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2332 return false;
2333 }
2334 if (isa<StoreInst>(Val: I) || isa<LoadInst>(Val: I)) {
2335 Value *Ptr = getLoadStorePointerOperand(V: &I);
2336 Type *AccessTy = getLoadStoreType(I: &I);
2337 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, Lp: L).value_or(u: 0);
2338 if (NextStride == 1) {
2339 // TODO: for now only allow consecutive strides of 1. We could support
2340 // other strides as long as they are uniform, but let's keep it simple
2341 // for now.
2342 continue;
2343 } else if (NextStride == -1 ||
2344 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2345 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2346 LLVM_DEBUG(dbgs()
2347 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2348 "be tail-predicated\n.");
2349 return false;
2350 // TODO: don't tail predicate if there is a reversed load?
2351 } else if (EnableMaskedGatherScatters) {
2352 // Gather/scatters do allow loading from arbitrary strides, at
2353 // least if they are loop invariant.
2354 // TODO: Loop variant strides should in theory work, too, but
2355 // this requires further testing.
2356 const SCEV *PtrScev = PSE.getSE()->getSCEV(V: Ptr);
2357 if (auto AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev)) {
2358 const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE());
2359 if (PSE.getSE()->isLoopInvariant(S: Step, L))
2360 continue;
2361 }
2362 }
2363 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2364 "tail-predicate\n.");
2365 return false;
2366 }
2367 }
2368 }
2369
2370 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2371 return true;
2372}
2373
2374bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
2375 if (!EnableTailPredication) {
2376 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2377 return false;
2378 }
2379
2380 // Creating a predicated vector loop is the first step for generating a
2381 // tail-predicated hardware loop, for which we need the MVE masked
2382 // load/stores instructions:
2383 if (!ST->hasMVEIntegerOps())
2384 return false;
2385
2386 LoopVectorizationLegality *LVL = TFI->LVL;
2387 Loop *L = LVL->getLoop();
2388
2389 // For now, restrict this to single block loops.
2390 if (L->getNumBlocks() > 1) {
2391 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2392 "loop.\n");
2393 return false;
2394 }
2395
2396 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2397
2398 LoopInfo *LI = LVL->getLoopInfo();
2399 HardwareLoopInfo HWLoopInfo(L);
2400 if (!HWLoopInfo.canAnalyze(LI&: *LI)) {
2401 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2402 "analyzable.\n");
2403 return false;
2404 }
2405
2406 AssumptionCache *AC = LVL->getAssumptionCache();
2407 ScalarEvolution *SE = LVL->getScalarEvolution();
2408
2409 // This checks if we have the low-overhead branch architecture
2410 // extension, and if we will create a hardware-loop:
2411 if (!isHardwareLoopProfitable(L, SE&: *SE, AC&: *AC, LibInfo: TFI->TLI, HWLoopInfo)) {
2412 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2413 "profitable.\n");
2414 return false;
2415 }
2416
2417 DominatorTree *DT = LVL->getDominatorTree();
2418 if (!HWLoopInfo.isHardwareLoopCandidate(SE&: *SE, LI&: *LI, DT&: *DT)) {
2419 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2420 "a candidate.\n");
2421 return false;
2422 }
2423
2424 return canTailPredicateLoop(L, LI, SE&: *SE, DL, LAI: LVL->getLAI());
2425}
2426
2427TailFoldingStyle
2428ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2429 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2430 return TailFoldingStyle::DataWithoutLaneMask;
2431
2432 // Intrinsic @llvm.get.active.lane.mask is supported.
2433 // It is used in the MVETailPredication pass, which requires the number of
2434 // elements processed by this vector loop to set up the tail-predicated
2435 // loop.
2436 return TailFoldingStyle::Data;
2437}
2438void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2439 TTI::UnrollingPreferences &UP,
2440 OptimizationRemarkEmitter *ORE) {
2441 // Enable upper-bound unrolling universally, provided that we do not see an
2442 // active lane mask, which will be better kept as a loop to become tail
2443 // predicated than to be conditionally unrolled.
2444 UP.UpperBound =
2445 !ST->hasMVEIntegerOps() || !any_of(Range&: *L->getHeader(), P: [](Instruction &I) {
2446 return isa<IntrinsicInst>(I) &&
2447 cast<IntrinsicInst>(I).getIntrinsicID() ==
2448 Intrinsic::get_active_lane_mask;
2449 });
2450
2451 // Only currently enable these preferences for M-Class cores.
2452 if (!ST->isMClass())
2453 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2454
2455 // Disable loop unrolling for Oz and Os.
2456 UP.OptSizeThreshold = 0;
2457 UP.PartialOptSizeThreshold = 0;
2458 if (L->getHeader()->getParent()->hasOptSize())
2459 return;
2460
2461 SmallVector<BasicBlock*, 4> ExitingBlocks;
2462 L->getExitingBlocks(ExitingBlocks);
2463 LLVM_DEBUG(dbgs() << "Loop has:\n"
2464 << "Blocks: " << L->getNumBlocks() << "\n"
2465 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2466
2467 // Only allow another exit other than the latch. This acts as an early exit
2468 // as it mirrors the profitability calculation of the runtime unroller.
2469 if (ExitingBlocks.size() > 2)
2470 return;
2471
2472 // Limit the CFG of the loop body for targets with a branch predictor.
2473 // Allowing 4 blocks permits if-then-else diamonds in the body.
2474 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2475 return;
2476
2477 // Don't unroll vectorized loops, including the remainder loop
2478 if (getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized"))
2479 return;
2480
2481 // Scan the loop: don't unroll loops with calls as this could prevent
2482 // inlining.
2483 InstructionCost Cost = 0;
2484 for (auto *BB : L->getBlocks()) {
2485 for (auto &I : *BB) {
2486 // Don't unroll vectorised loops. MVE does not benefit from it as much as
2487 // scalar code.
2488 if (I.getType()->isVectorTy())
2489 return;
2490
2491 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
2492 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
2493 if (!isLoweredToCall(F))
2494 continue;
2495 }
2496 return;
2497 }
2498
2499 SmallVector<const Value*, 4> Operands(I.operand_values());
2500 Cost += getInstructionCost(U: &I, Operands,
2501 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
2502 }
2503 }
2504
2505 // On v6m cores, there are very few registers available. We can easily end up
2506 // spilling and reloading more registers in an unrolled loop. Look at the
2507 // number of LCSSA phis as a rough measure of how many registers will need to
2508 // be live out of the loop, reducing the default unroll count if more than 1
2509 // value is needed. In the long run, all of this should be learnt by a
2510 // machine.
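// e.g. with 2 live-out values the default count of 4 is halved to 2; with 3
// or more live-outs, runtime unrolling is skipped entirely.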
2511 unsigned UnrollCount = 4;
2512 if (ST->isThumb1Only()) {
2513 unsigned ExitingValues = 0;
2514 SmallVector<BasicBlock *, 4> ExitBlocks;
2515 L->getExitBlocks(ExitBlocks);
2516 for (auto *Exit : ExitBlocks) {
2517 // Count the number of LCSSA phis. Exclude values coming from GEPs as
2518 // only the last is expected to be needed for address operands.
2519 unsigned LiveOuts = count_if(Range: Exit->phis(), P: [](auto &PH) {
2520 return PH.getNumOperands() != 1 ||
2521 !isa<GetElementPtrInst>(PH.getOperand(0));
2522 });
2523 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2524 }
2525 if (ExitingValues)
2526 UnrollCount /= ExitingValues;
2527 if (UnrollCount <= 1)
2528 return;
2529 }
2530
2531 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2532 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2533
2534 UP.Partial = true;
2535 UP.Runtime = true;
2536 UP.UnrollRemainder = true;
2537 UP.DefaultUnrollRuntimeCount = UnrollCount;
2538 UP.UnrollAndJam = true;
2539 UP.UnrollAndJamInnerLoopThreshold = 60;
2540
2541 // Force-unrolling small loops can be very useful because of the
2542 // branch-taken cost of the backedge.
2543 if (Cost < 12)
2544 UP.Force = true;
2545}
2546
2547void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2548 TTI::PeelingPreferences &PP) {
2549 BaseT::getPeelingPreferences(L, SE, PP);
2550}
2551
2552bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2553 TTI::ReductionFlags Flags) const {
2554 if (!ST->hasMVEIntegerOps())
2555 return false;
2556
2557 unsigned ScalarBits = Ty->getScalarSizeInBits();
2558 switch (Opcode) {
2559 case Instruction::Add:
2560 return ScalarBits <= 64;
2561 default:
2562 return false;
2563 }
2564}
2565
2566bool ARMTTIImpl::preferPredicatedReductionSelect(
2567 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2568 if (!ST->hasMVEIntegerOps())
2569 return false;
2570 return true;
2571}
2572
2573InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2574 int64_t BaseOffset,
2575 bool HasBaseReg, int64_t Scale,
2576 unsigned AddrSpace) const {
2577 TargetLoweringBase::AddrMode AM;
2578 AM.BaseGV = BaseGV;
2579 AM.BaseOffs = BaseOffset;
2580 AM.HasBaseReg = HasBaseReg;
2581 AM.Scale = Scale;
2582 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) {
2583 if (ST->hasFPAO())
2584 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2585 return 0;
2586 }
2587 return -1;
2588}
2589
2590bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2591 if (Thumb) {
2592 // B.W is available in any Thumb2-supporting target, and also in every
2593 // version of Armv8-M, even Baseline which does not include the rest of
2594 // Thumb2.
2595 return ST->isThumb2() || ST->hasV8MBaselineOps();
2596 } else {
2597 // B is available in all versions of the Arm ISA, so the only question is
2598 // whether that ISA is available at all.
2599 return ST->hasARMOps();
2600 }
2601}
2602
