//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);
namespace {
class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where
  // InitialBits is one of (disabled|all|simple). EnableBits represents
  // additional flags we're enabling, and DisableBits for those flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
  TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
  TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
  TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;

  // This value needs to be initialised to true in case the user does not
  // explicitly set the -sve-tail-folding option.
  bool NeedsDefault = true;

  void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }

  void setNeedsDefault(bool V) { NeedsDefault = V; }

  void setEnableBit(TailFoldingOpts Bit) {
    EnableBits |= Bit;
    DisableBits &= ~Bit;
  }

  void setDisableBit(TailFoldingOpts Bit) {
    EnableBits &= ~Bit;
    DisableBits |= Bit;
  }

  TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
    TailFoldingOpts Bits = TailFoldingOpts::Disabled;

    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;
    Bits |= EnableBits;
    Bits &= ~DisableBits;

    return Bits;
  }

  void reportError(std::string Opt) {
    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              "  (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";
    report_fatal_error("Unrecognised tail-folding option");
  }

public:

  void operator=(const std::string &Val) {
    // If the user explicitly sets -sve-tail-folding= then treat as an error.
    if (Val.empty()) {
      reportError("");
      return;
    }

    // Since the user is explicitly setting the option we don't automatically
    // need the default unless they require it.
    setNeedsDefault(false);

    SmallVector<StringRef, 4> TailFoldTypes;
    StringRef(Val).split(TailFoldTypes, '+', -1, false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);
    else {
      StartIdx = 0;
      setInitialBits(TailFoldingOpts::Disabled);
    }

    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
      else
        reportError(Val);
    }
  }

  bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
    return (getBits(DefaultBits) & Required) == Required;
  }
};
} // namespace

TailFoldingOption TailFoldingOptionLoc;

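// For example, "-sve-tail-folding=all" enables every mode,
// "-sve-tail-folding=default+noreverse" keeps the target's defaults but never
// uses reversed predicates, and "-sve-tail-folding=simple+reductions" allows
// simple loops plus loops containing reductions.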
cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

static bool isSMEABIRoutineCall(const CallInst &CI) {
  const auto *F = CI.getCalledFunction();
  return F && StringSwitch<bool>(F->getName())
                  .Case("__arm_sme_state", true)
                  .Case("__arm_tpidr2_save", true)
                  .Case("__arm_tpidr2_restore", true)
                  .Case("__arm_za_disable", true)
                  .Default(false);
}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I))))
        return true;
    }
  }
  return false;
}

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  // When inlining, we should consider the body of the function, not the
  // interface.
  if (CalleeAttrs.hasStreamingBody()) {
    CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
    CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
  }

  if (CalleeAttrs.isNewZA())
    return false;

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs)) {
    if (hasPossibleIncompatibleOps(Callee))
      return false;
  }

  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the callers
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}

bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // We need to ensure that argument promotion does not attempt to promote
  // pointers to fixed-length vector types larger than 128 bits like
  // <8 x float> (and pointers to aggregate types which have such fixed-length
  // vector type members) into the values of the pointees. Such vector types
  // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
  // backend cannot lower such value arguments. The 128-bit fixed-length SVE
  // types can be safely treated as 128-bit NEON types and they cannot be
  // distinguished in IR.
  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
        auto FVTy = dyn_cast<FixedVectorType>(Ty);
        return FVTy &&
               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
      }))
    return false;

  return true;
}

unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1) F:
  //     call from F -> G (the call here is Call)
  //
  // For (1), Call.getCaller() == F, so it will always return a high cost if
  // a streaming-mode change is required (thus promoting the need to inline the
  // function)
  //
  // (2) F:
  //     call from F -> G (the call here is not Call)
  //     G:
  //     call from G -> H (the call here is Call)
  //
  // For (2), if after inlining the body of G into F the call to H requires a
  // streaming-mode change, and the call to G from F would also require a
  // streaming-mode change, then there is benefit to do the streaming-mode
  // change only once and avoid inlining of G into F.
  SMEAttrs FAttrs(*F);
  SMEAttrs CalleeAttrs(Call);
  if (FAttrs.requiresSMChange(CalleeAttrs)) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
          ST->isNeonAvailable());
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
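/// For example, a 64-bit value whose four 16-bit chunks are all distinct and
/// non-zero (such as 0x123456789abcdef0) typically expands to one MOVZ plus
/// three MOVKs, giving a cost of 4, whereas any value encodable as a logical
/// immediate costs 0 here.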
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}

InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
  return TTI::PSK_Software;
}

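// A scalable vector type is "unpacked" when its known-minimum size is smaller
// than a full 128-bit SVE register, e.g. <vscale x 2 x i32>, which only
// guarantees 64 bits.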
static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,    MVT::v16i8,  MVT::v4i16,
                                        MVT::v8i16,   MVT::v2i32,  MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                        MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
      return LT.first;
    break;
  }
  case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {
    // If both the vector and subvector types are legal types and the index
    // is 0, then this should be a no-op or simple operation; return a
    // relatively low cost.

    // If arguments aren't actually supplied, then we cannot determine the
    // value of the index. We also want to skip predicate types.
    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
      break;

    LLVMContext &C = RetTy->getContext();
    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
    EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
                             : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
    // Skip this if either the vector or subvector types are unpacked
    // SVE types; they may get lowered to stack stores and loads.
    if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
      break;

    TargetLoweringBase::LegalizeKind SubVecLK =
        getTLI()->getTypeConversion(C, SubVecVT);
    TargetLoweringBase::LegalizeKind VecLK =
        getTLI()->getTypeConversion(C, VecVT);
    const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
    const ConstantInt *CIdx = cast<ConstantInt>(Idx);
    if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
        VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
      return TTI::TCC_Free;
    break;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 will be
      // converted to, plus 1 so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {
      // 32-bit or 64-bit ctpop without NEON is 12 instructions.
      return getTypeLegalizationCost(RetTy).first * 12;
    }
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64,   4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8,  1},
        {ISD::CTPOP, MVT::i32,   5},
    };
    auto LT = getTypeLegalizationCost(RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
      return LT.first * Entry->Cost + ExtraCost;
    }
    break;
  }
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    static const CostTblEntry WithOverflowCostTbl[] = {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
        {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
        {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
        {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
    };
    EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
    if (MTy.isSimple())
      if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
                                              MTy.getSimpleVT()))
        return Entry->Cost;
    break;
  }
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    if (ICA.getArgTypes().empty())
      break;
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
    EVT MTy = TLI->getValueType(DL, RetTy);
    // Check for the legal types, which are where the size of the input and the
    // output are the same, or we are using cvt f64->i32 or f32->i64.
    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64) &&
        (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
         (LT.second == MVT::f64 && MTy == MVT::i32) ||
         (LT.second == MVT::f32 && MTy == MVT::i64)))
      return LT.first;
    // Similarly for fp16 sizes
    if (ST->hasFullFP16() &&
        ((LT.second == MVT::f16 && MTy == MVT::i32) ||
         ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
          (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
      return LT.first;

    // Otherwise we use a legal convert followed by a min+max
    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
        LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
      Type *LegalTy =
          Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
      if (LT.second.isVector())
        LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      InstructionCost Cost = 1;
      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost;
    }
    break;
  }
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
    if (ICA.getArgs().empty())
      break;

    // TODO: Add handling for fshl where third argument is not a constant.
    const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
    if (!OpInfoZ.isConstant())
      break;

    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    if (OpInfoZ.isUniform()) {
      // FIXME: The costs could be lower if the codegen is better.
      static const CostTblEntry FshlTbl[] = {
          {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
          {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
          {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
          {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};
      // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
      // to avoid having to duplicate the costs.
      const auto *Entry =
          CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
      if (Entry)
        return LegalisationCost.first * Entry->Cost;
    }

    auto TyL = getTypeLegalizationCost(RetTy);
    if (!RetTy->isIntegerTy())
      break;

    // Estimate cost manually, as types like i8 and i16 will get promoted to
    // i32 and CostTableLookup will ignore the extra conversion cost.
    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
                     // extr instruction.
    else if (HigherCost)
      ExtraCost = 1;
    else
      break;
    return TyL.first + ExtraCost;
  }
  case Intrinsic::get_active_lane_mask: {
    auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
    if (RetTy) {
      EVT RetVT = getTLI()->getValueType(DL, RetTy);
      EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
      if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
          !getTLI()->isTypeLegal(RetVT)) {
        // We don't have enough context at this point to determine if the mask
        // is going to be kept live after the block, which will force the vXi1
        // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
        // For now, we just assume the vectorizer created this intrinsic and
        // the result will be the input for a PHI. In this case the cost will
        // be extremely high for fixed-width vectors.
        // NOTE: getScalarizationOverhead returns a cost that's far too
        // pessimistic for the actual generated codegen. In reality there are
        // two instructions generated per lane.
        return RetTy->getNumElements() * 2;
      }
    }
    break;
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// Remove redundant reinterpret (convert.to/from.svbool) casts in the
/// presence of control flow.
static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();

  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
  if (!PN->hasOneUse())
    return std::nullopt;

  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
    if (!Reinterpret ||
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())
      return std::nullopt;
  }

  // Create the new Phi
  IC.Builder.SetInsertPoint(PN);
  PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
  Worklist.push_back(PN);

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
    Worklist.push_back(Reinterpret);
  }

  // Cleanup Phi Node and reinterprets
  return IC.replaceInstUsesWith(II, NPN);
}

// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
// => (binop (pred) (from_svbool _) (from_svbool _))
//
// The above transformation eliminates a `to_svbool` in the predicate
// operand of bitwise operation `binop` by narrowing the vector width of
// the operation. For example, it would convert a `<vscale x 16 x i1>
// and` into a `<vscale x 4 x i1> and`. This is profitable because
// to_svbool must zero the new lanes during widening, whereas
// from_svbool is free.
static std::optional<Instruction *>
tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
  auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
  if (!BinOp)
    return std::nullopt;

  auto IntrinsicID = BinOp->getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::aarch64_sve_and_z:
  case Intrinsic::aarch64_sve_bic_z:
  case Intrinsic::aarch64_sve_eor_z:
  case Intrinsic::aarch64_sve_nand_z:
  case Intrinsic::aarch64_sve_nor_z:
  case Intrinsic::aarch64_sve_orn_z:
  case Intrinsic::aarch64_sve_orr_z:
    break;
  default:
    return std::nullopt;
  }

  auto BinOpPred = BinOp->getOperand(0);
  auto BinOpOp1 = BinOp->getOperand(1);
  auto BinOpOp2 = BinOp->getOperand(2);

  auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
  if (!PredIntr ||
      PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
    return std::nullopt;

  auto PredOp = PredIntr->getOperand(0);
  auto PredOpTy = cast<VectorType>(PredOp->getType());
  if (PredOpTy != II.getType())
    return std::nullopt;

  SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
  auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  if (BinOpOp1 == BinOpOp2)
    NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  else
    NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
        Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));

  auto NarrowedBinOp =
      IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
  return IC.replaceInstUsesWith(II, NarrowedBinOp);
}

static std::optional<Instruction *>
instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
  // If the reinterpret instruction operand is a PHI Node
  if (isa<PHINode>(II.getArgOperand(0)))
    return processPhiNode(IC, II);

  if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
    return BinOpCombine;

  // Ignore converts to/from svcount_t.
  if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
      isa<TargetExtType>(II.getType()))
    return std::nullopt;

  SmallVector<Instruction *, 32> CandidatesForRemoval;
  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;

  const auto *IVTy = cast<VectorType>(II.getType());

  // Walk the chain of conversions.
  while (Cursor) {
    // If the type of the cursor has fewer lanes than the final result, zeroing
    // must take place, which breaks the equivalence chain.
    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
    if (CursorVTy->getElementCount().getKnownMinValue() <
        IVTy->getElementCount().getKnownMinValue())
      break;

    // If the cursor has the same type as I, it is a viable replacement.
    if (Cursor->getType() == IVTy)
      EarliestReplacement = Cursor;

    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);

    // If this is not an SVE conversion intrinsic, this is the end of the chain.
    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_to_svbool ||
                              IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_from_svbool))
      break;

    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
    Cursor = IntrinsicCursor->getOperand(0);
  }

  // If no viable replacement in the conversion chain was found, there is
  // nothing to do.
  if (!EarliestReplacement)
    return std::nullopt;

  return IC.replaceInstUsesWith(II, EarliestReplacement);
}

static bool isAllActivePredicate(Value *Pred) {
  // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
  Value *UncastedPred;
  if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
                      m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
                          m_Value(UncastedPred)))))
    // If the predicate has the same or fewer lanes than the uncasted
    // predicate then we know the casting has no effect.
    if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
        cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
      Pred = UncastedPred;

  return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                         m_ConstantInt<AArch64SVEPredPattern::all>()));
}

static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  // svsel(ptrue, x, y) => x
  auto *OpPredicate = II.getOperand(0);
  if (isAllActivePredicate(OpPredicate))
    return IC.replaceInstUsesWith(II, II.getOperand(1));

  auto Select =
      IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
  return IC.replaceInstUsesWith(II, Select);
}

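// sve.dup with a ptrue(vl1) predicate only writes lane 0 of the passthru, so
// it can be rewritten as a plain insertelement, e.g.
//   sve.dup(%passthru, ptrue(vl1), %x)  -->  insertelement %passthru, %x, 0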
static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!Pg)
    return std::nullopt;

  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::vl1)
    return std::nullopt;

  // The intrinsic is inserting into lane zero so use an insert instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Insert = InsertElementInst::Create(
      II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
  Insert->insertBefore(&II);
  Insert->takeName(&II);

  return IC.replaceInstUsesWith(II, Insert);
}

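// sve.dup.x broadcasts a scalar to every lane, which is just a regular IR
// splat, e.g. <vscale x 4 x i32> sve.dup.x(i32 %x) becomes a splat of %x.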
static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  // Replace DupX with a regular IR splat.
  auto *RetTy = cast<ScalableVectorType>(II.getType());
  Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
                                              II.getArgOperand(0));
  Splat->takeName(&II);
  return IC.replaceInstUsesWith(II, Splat);
}

static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();

  // Check that the predicate is all active
  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::all)
    return std::nullopt;

  // Check that we have a compare of zero..
  auto *SplatValue =
      dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
  if (!SplatValue || !SplatValue->isZero())
    return std::nullopt;

  // ..against a dupq
  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!DupQLane ||
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return std::nullopt;

  // Where the dupq is a lane 0 replicate of a vector insert
  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
    return std::nullopt;

  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
    return std::nullopt;

  // Where the vector insert is a fixed constant vector insert into undef at
  // index zero
  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return std::nullopt;

  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return std::nullopt;

  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
  if (!ConstVec)
    return std::nullopt;

  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return std::nullopt;

  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  // Expand intrinsic operands to a 16-bit byte level predicate
  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
    if (!Arg)
      return std::nullopt;
    if (!Arg->isZero())
      PredicateBits |= 1 << (I * (16 / NumElts));
  }

  // If all bits are zero bail early with an empty predicate
  if (PredicateBits == 0) {
    auto *PFalse = Constant::getNullValue(II.getType());
    PFalse->takeName(&II);
    return IC.replaceInstUsesWith(II, PFalse);
  }

  // Calculate largest predicate type used (where byte predicate is largest)
  unsigned Mask = 8;
  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)
      Mask |= (I % 8);

  unsigned PredSize = Mask & -Mask;
  auto *PredType = ScalableVectorType::get(
      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));

  // Ensure all relevant bits are set
  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return std::nullopt;

  auto *PTruePat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                           {PredType}, {PTruePat});
  auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
      IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                                 {II.getType()}, {ConvertToSVBool});

  ConvertFromSVBool->takeName(&II);
  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
}

static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(1);
  auto IntrinsicID = II.getIntrinsicID();
  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;

  // lastX(splat(X)) --> X
  if (auto *SplatVal = getSplatValue(Vec))
    return IC.replaceInstUsesWith(II, SplatVal);

  // If x and/or y is a splat value then:
  // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
  Value *LHS, *RHS;
  if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
    if (isSplatValue(LHS) || isSplatValue(RHS)) {
      auto *OldBinOp = cast<BinaryOperator>(Vec);
      auto OpC = OldBinOp->getOpcode();
      auto *NewLHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
      auto *NewRHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
      auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
      return IC.replaceInstUsesWith(II, NewBinOp);
    }
  }

  auto *C = dyn_cast<Constant>(Pg);
  if (IsAfter && C && C->isNullValue()) {
    // The intrinsic is extracting lane 0 so use an extract instead.
    auto *IdxTy = Type::getInt64Ty(II.getContext());
    auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
    Extract->insertBefore(&II);
    Extract->takeName(&II);
    return IC.replaceInstUsesWith(II, Extract);
  }

  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
  if (!IntrPG)
    return std::nullopt;

  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();

  // Can the intrinsic's predicate be converted to a known constant index?
  unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
  if (!MinNumElts)
    return std::nullopt;

  unsigned Idx = MinNumElts - 1;
  // Increment the index if extracting the element after the last active
  // predicate element.
  if (IsAfter)
    ++Idx;

  // Ignore extracts whose index is larger than the known minimum vector
  // length. NOTE: This is an artificial constraint where we prefer to
  // maintain what the user asked for until an alternative is proven faster.
  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return std::nullopt;

  // The intrinsic is extracting a fixed lane so use an extract instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
  Extract->insertBefore(&II);
  Extract->takeName(&II);
  return IC.replaceInstUsesWith(II, Extract);
}

static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
                                                           IntrinsicInst &II) {
  // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
  // integer variant across a variety of micro-architectures. Replace scalar
  // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
  // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
  // depending on the micro-architecture, but has been observed as generally
  // being faster, particularly when the CLAST[AB] op is a loop-carried
  // dependency.
  Value *Pg = II.getArgOperand(0);
  Value *Fallback = II.getArgOperand(1);
  Value *Vec = II.getArgOperand(2);
  Type *Ty = II.getType();

  if (!Ty->isIntegerTy())
    return std::nullopt;

  Type *FPTy;
  switch (cast<IntegerType>(Ty)->getBitWidth()) {
  default:
    return std::nullopt;
  case 16:
    FPTy = IC.Builder.getHalfTy();
    break;
  case 32:
    FPTy = IC.Builder.getFloatTy();
    break;
  case 64:
    FPTy = IC.Builder.getDoubleTy();
    break;
  }

  Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
  auto *FPVTy = VectorType::get(
      FPTy, cast<VectorType>(Vec->getType())->getElementCount());
  Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
  auto *FPII = IC.Builder.CreateIntrinsic(
      II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
  Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
  return IC.replaceInstUsesWith(II, FPIItoInt);
}

static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
                                                     IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();
  // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
  // can work with RDFFR_PP for ptest elimination.
  auto *AllPat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                           {II.getType()}, {AllPat});
  auto *RDFFR =
      IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
  RDFFR->takeName(&II);
  return IC.replaceInstUsesWith(II, RDFFR);
}

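// Fold SVE element-count intrinsics to a constant (or a multiple of vscale)
// when the pattern makes the result known, e.g. sve.cntw(SV_ALL) becomes
// 4 * vscale and sve.cntb(SV_VL16) becomes the constant 16.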
static std::optional<Instruction *>
instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();

  if (Pattern == AArch64SVEPredPattern::all) {
    Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
    auto *VScale = IC.Builder.CreateVScale(StepVal);
    VScale->takeName(&II);
    return IC.replaceInstUsesWith(II, VScale);
  }

  unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);

  return MinNumElts && NumElts >= MinNumElts
             ? std::optional<Instruction *>(IC.replaceInstUsesWith(
                   II, ConstantInt::get(II.getType(), MinNumElts)))
             : std::nullopt;
}

static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  Value *PgVal = II.getArgOperand(0);
  Value *OpVal = II.getArgOperand(1);

  // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
  // Later optimizations prefer this form.
  if (PgVal == OpVal &&
      (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
       II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
    Value *Ops[] = {PgVal, OpVal};
    Type *Tys[] = {PgVal->getType()};

    auto *PTest =
        IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
  IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);

  if (!Pg || !Op)
    return std::nullopt;

  Intrinsic::ID OpIID = Op->getIntrinsicID();

  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
      Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
    Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
    Type *Tys[] = {Pg->getArgOperand(0)->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);

    PTest->takeName(&II);
    return IC.replaceInstUsesWith(II, PTest);
  }

  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
  // Later optimizations may rewrite sequence to use the flag-setting variant
  // of instruction X to remove PTEST.
  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {
    Value *Ops[] = {Pg->getArgOperand(0), Pg};
    Type *Tys[] = {Pg->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  return std::nullopt;
}

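// Fold a predicated multiply feeding a predicated add/sub into a single fused
// multiply-accumulate intrinsic when both share the same predicate and the
// multiply has no other users, e.g.
//   sve.fadd(p, a, sve.fmul(p, b, c))  -->  sve.fmla(p, a, b, c)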
template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                  bool MergeIntoAddendOp) {
  Value *P = II.getOperand(0);
  Value *MulOp0, *MulOp1, *AddendOp, *Mul;
  if (MergeIntoAddendOp) {
    AddendOp = II.getOperand(1);
    Mul = II.getOperand(2);
  } else {
    AddendOp = II.getOperand(2);
    Mul = II.getOperand(1);
  }

  if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
                                      m_Value(MulOp1))))
    return std::nullopt;

  if (!Mul->hasOneUse())
    return std::nullopt;

  Instruction *FMFSource = nullptr;
  if (II.getType()->isFPOrFPVectorTy()) {
    llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
    // Stop the combine when the flags on the inputs differ in case dropping
    // flags would lead to us missing out on more beneficial optimizations.
    if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
      return std::nullopt;
    if (!FAddFlags.allowContract())
      return std::nullopt;
    FMFSource = &II;
  }

  CallInst *Res;
  if (MergeIntoAddendOp)
    Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                     {P, AddendOp, MulOp0, MulOp1}, FMFSource);
  else
    Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                     {P, MulOp0, MulOp1, AddendOp}, FMFSource);

  return IC.replaceInstUsesWith(II, Res);
}

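// Lower sve.ld1 to a plain load when the predicate is known to be all active;
// otherwise fall back to llvm.masked.load with a zero passthru.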
1392static std::optional<Instruction *>
1393instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1394 Value *Pred = II.getOperand(i_nocapture: 0);
1395 Value *PtrOp = II.getOperand(i_nocapture: 1);
1396 Type *VecTy = II.getType();
1397
1398 if (isAllActivePredicate(Pred)) {
1399 LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp);
1400 Load->copyMetadata(SrcInst: II);
1401 return IC.replaceInstUsesWith(I&: II, V: Load);
1402 }
1403
1404 CallInst *MaskedLoad =
1405 IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL),
1406 Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy));
1407 MaskedLoad->copyMetadata(SrcInst: II);
1408 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
1409}
1410
1411static std::optional<Instruction *>
1412instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1413 Value *VecOp = II.getOperand(i_nocapture: 0);
1414 Value *Pred = II.getOperand(i_nocapture: 1);
1415 Value *PtrOp = II.getOperand(i_nocapture: 2);
1416
1417 if (isAllActivePredicate(Pred)) {
1418 StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp);
1419 Store->copyMetadata(SrcInst: II);
1420 return IC.eraseInstFromFunction(I&: II);
1421 }
1422
1423 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1424 Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred);
1425 MaskedStore->copyMetadata(SrcInst: II);
1426 return IC.eraseInstFromFunction(I&: II);
1427}
1428
1429static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1430 switch (Intrinsic) {
1431 case Intrinsic::aarch64_sve_fmul_u:
1432 return Instruction::BinaryOps::FMul;
1433 case Intrinsic::aarch64_sve_fadd_u:
1434 return Instruction::BinaryOps::FAdd;
1435 case Intrinsic::aarch64_sve_fsub_u:
1436 return Instruction::BinaryOps::FSub;
1437 default:
1438 return Instruction::BinaryOpsEnd;
1439 }
1440}
1441
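// When one of the _u floating-point intrinsics handled below is governed by
// an all-active ptrue, it can be expressed as a plain IR binary operator
// carrying the call's fast-math flags, e.g. (sketch, mangling omitted):
//   sve.fadd_u(ptrue(all), %a, %b)  -->  fadd %a, %b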
1442static std::optional<Instruction *>
1443instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1444 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1445 if (II.isStrictFP())
1446 return std::nullopt;
1447
1448 auto *OpPredicate = II.getOperand(i_nocapture: 0);
1449 auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID());
1450 if (BinOpCode == Instruction::BinaryOpsEnd ||
1451 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1452 m_ConstantInt<AArch64SVEPredPattern::all>())))
1453 return std::nullopt;
1454 IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
1455 IC.Builder.setFastMathFlags(II.getFastMathFlags());
1456 auto BinOp =
1457 IC.Builder.CreateBinOp(Opc: BinOpCode, LHS: II.getOperand(i_nocapture: 1), RHS: II.getOperand(i_nocapture: 2));
1458 return IC.replaceInstUsesWith(I&: II, V: BinOp);
1459}
1460
1461// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1462// sve.add_u).
1463static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1464 Intrinsic::ID IID) {
1465 auto *OpPredicate = II.getOperand(i_nocapture: 0);
1466 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1467 m_ConstantInt<AArch64SVEPredPattern::all>())))
1468 return std::nullopt;
1469
1470 auto *Mod = II.getModule();
1471 auto *NewDecl = Intrinsic::getDeclaration(M: Mod, id: IID, Tys: {II.getType()});
1472 II.setCalledFunction(NewDecl);
1473
1474 return &II;
1475}
1476
// Simplify operations where the predicate has all lanes inactive, or try to
// replace them with the _u form when all lanes are active.
1479static std::optional<Instruction *>
1480instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1481 Intrinsic::ID IID) {
1482 if (match(V: II.getOperand(i_nocapture: 0), P: m_ZeroInt())) {
    // The intrinsic is of the form (pred, op1, op2, ...). The spec says the
    // merging (_m) forms return op1 when all predicate lanes are inactive.
1485 return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1));
1486 }
1487 return instCombineSVEAllActive(II, IID);
1488}
1489
1490static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1491 IntrinsicInst &II) {
1492 if (auto II_U =
1493 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1494 return II_U;
1495 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1496 Intrinsic::aarch64_sve_mla>(
1497 IC, II, true))
1498 return MLA;
1499 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1500 Intrinsic::aarch64_sve_mad>(
1501 IC, II, false))
1502 return MAD;
1503 return std::nullopt;
1504}
1505
1506static std::optional<Instruction *>
1507instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1508 if (auto II_U =
1509 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1510 return II_U;
1511 if (auto FMLA =
1512 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1513 Intrinsic::aarch64_sve_fmla>(IC, II,
1514 true))
1515 return FMLA;
1516 if (auto FMAD =
1517 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1518 Intrinsic::aarch64_sve_fmad>(IC, II,
1519 false))
1520 return FMAD;
1521 if (auto FMLA =
1522 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1523 Intrinsic::aarch64_sve_fmla>(IC, II,
1524 true))
1525 return FMLA;
1526 return std::nullopt;
1527}
1528
1529static std::optional<Instruction *>
1530instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1531 if (auto FMLA =
1532 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1533 Intrinsic::aarch64_sve_fmla>(IC, II,
1534 true))
1535 return FMLA;
1536 if (auto FMAD =
1537 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1538 Intrinsic::aarch64_sve_fmad>(IC, II,
1539 false))
1540 return FMAD;
1541 if (auto FMLA_U =
1542 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1543 Intrinsic::aarch64_sve_fmla_u>(
1544 IC, II, true))
1545 return FMLA_U;
1546 return instCombineSVEVectorBinOp(IC, II);
1547}
1548
1549static std::optional<Instruction *>
1550instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1551 if (auto II_U =
1552 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1553 return II_U;
1554 if (auto FMLS =
1555 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1556 Intrinsic::aarch64_sve_fmls>(IC, II,
1557 true))
1558 return FMLS;
1559 if (auto FMSB =
1560 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1561 Intrinsic::aarch64_sve_fnmsb>(
1562 IC, II, false))
1563 return FMSB;
1564 if (auto FMLS =
1565 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1566 Intrinsic::aarch64_sve_fmls>(IC, II,
1567 true))
1568 return FMLS;
1569 return std::nullopt;
1570}
1571
1572static std::optional<Instruction *>
1573instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1574 if (auto FMLS =
1575 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1576 Intrinsic::aarch64_sve_fmls>(IC, II,
1577 true))
1578 return FMLS;
1579 if (auto FMSB =
1580 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1581 Intrinsic::aarch64_sve_fnmsb>(
1582 IC, II, false))
1583 return FMSB;
1584 if (auto FMLS_U =
1585 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1586 Intrinsic::aarch64_sve_fmls_u>(
1587 IC, II, true))
1588 return FMLS_U;
1589 return instCombineSVEVectorBinOp(IC, II);
1590}
1591
1592static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1593 IntrinsicInst &II) {
1594 if (auto II_U =
1595 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1596 return II_U;
1597 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1598 Intrinsic::aarch64_sve_mls>(
1599 IC, II, true))
1600 return MLS;
1601 return std::nullopt;
1602}
1603
1604static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1605 IntrinsicInst &II,
1606 Intrinsic::ID IID) {
1607 auto *OpPredicate = II.getOperand(i_nocapture: 0);
1608 auto *OpMultiplicand = II.getOperand(i_nocapture: 1);
1609 auto *OpMultiplier = II.getOperand(i_nocapture: 2);
1610
1611 // Return true if a given instruction is a unit splat value, false otherwise.
1612 auto IsUnitSplat = [](auto *I) {
1613 auto *SplatValue = getSplatValue(I);
1614 if (!SplatValue)
1615 return false;
1616 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1617 };
1618
1619 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1620 // with a unit splat value, false otherwise.
1621 auto IsUnitDup = [](auto *I) {
1622 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1623 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1624 return false;
1625
1626 auto *SplatValue = IntrI->getOperand(2);
1627 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1628 };
1629
1630 if (IsUnitSplat(OpMultiplier)) {
1631 // [f]mul pg %n, (dupx 1) => %n
1632 OpMultiplicand->takeName(V: &II);
1633 return IC.replaceInstUsesWith(I&: II, V: OpMultiplicand);
1634 } else if (IsUnitDup(OpMultiplier)) {
1635 // [f]mul pg %n, (dup pg 1) => %n
1636 auto *DupInst = cast<IntrinsicInst>(Val: OpMultiplier);
1637 auto *DupPg = DupInst->getOperand(i_nocapture: 1);
1638 // TODO: this is naive. The optimization is still valid if DupPg
1639 // 'encompasses' OpPredicate, not only if they're the same predicate.
1640 if (OpPredicate == DupPg) {
1641 OpMultiplicand->takeName(V: &II);
1642 return IC.replaceInstUsesWith(I&: II, V: OpMultiplicand);
1643 }
1644 }
1645
1646 return instCombineSVEVectorBinOp(IC, II);
1647}
1648
1649static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1650 IntrinsicInst &II) {
1651 Value *UnpackArg = II.getArgOperand(i: 0);
1652 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
1653 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1654 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1655
1656 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1657 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1658 if (auto *ScalarArg = getSplatValue(V: UnpackArg)) {
1659 ScalarArg =
1660 IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned);
1661 Value *NewVal =
1662 IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg);
1663 NewVal->takeName(V: &II);
1664 return IC.replaceInstUsesWith(I&: II, V: NewVal);
1665 }
1666
1667 return std::nullopt;
}

static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1670 IntrinsicInst &II) {
1671 auto *OpVal = II.getOperand(i_nocapture: 0);
1672 auto *OpIndices = II.getOperand(i_nocapture: 1);
1673 VectorType *VTy = cast<VectorType>(Val: II.getType());
1674
  // Check whether OpIndices is a constant splat value smaller than the
  // minimum element count of the result.
1677 auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices));
1678 if (!SplatValue ||
1679 SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue()))
1680 return std::nullopt;
1681
  // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
  // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1684 auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue);
1685 auto *VectorSplat =
1686 IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract);
1687
1688 VectorSplat->takeName(V: &II);
1689 return IC.replaceInstUsesWith(I&: II, V: VectorSplat);
1690}
1691
1692static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1693 IntrinsicInst &II) {
1694 Value *A, *B;
1695 Type *RetTy = II.getType();
1696 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1697 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1698
1699 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1700 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1701 if ((match(II.getArgOperand(i: 0),
1702 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(V&: A)))) &&
1703 match(II.getArgOperand(i: 1),
1704 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(V&: B))))) ||
1705 (match(II.getArgOperand(i: 0), m_Intrinsic<ToSVB>(m_Value(V&: A))) &&
1706 match(II.getArgOperand(i: 1), m_Intrinsic<ToSVB>(m_Value(V&: B))))) {
1707 auto *TyA = cast<ScalableVectorType>(Val: A->getType());
1708 if (TyA == B->getType() &&
1709 RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) {
1710 auto *SubVec = IC.Builder.CreateInsertVector(
1711 DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: IC.Builder.getInt64(C: 0));
1712 auto *ConcatVec = IC.Builder.CreateInsertVector(
1713 DstType: RetTy, SrcVec: SubVec, SubVec: B, Idx: IC.Builder.getInt64(C: TyA->getMinNumElements()));
1714 ConcatVec->takeName(V: &II);
1715 return IC.replaceInstUsesWith(I&: II, V: ConcatVec);
1716 }
1717 }
1718
1719 return std::nullopt;
1720}
1721
1722static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1723 IntrinsicInst &II) {
1724 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1725 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1726 Value *A, *B;
1727 if (match(II.getArgOperand(0),
1728 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1729 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1730 m_Specific(A), m_Specific(B))))
1731 return IC.replaceInstUsesWith(
1732 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1733
1734 return std::nullopt;
1735}
1736
1737static std::optional<Instruction *>
1738instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1739 Value *Mask = II.getOperand(i_nocapture: 0);
1740 Value *BasePtr = II.getOperand(i_nocapture: 1);
1741 Value *Index = II.getOperand(i_nocapture: 2);
1742 Type *Ty = II.getType();
1743 Value *PassThru = ConstantAggregateZero::get(Ty);
1744
1745 // Contiguous gather => masked load.
1746 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1747 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1748 Value *IndexBase;
1749 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1750 m_Value(IndexBase), m_SpecificInt(1)))) {
1751 Align Alignment =
1752 BasePtr->getPointerAlignment(DL: II.getModule()->getDataLayout());
1753
1754 Type *VecPtrTy = PointerType::getUnqual(ElementType: Ty);
1755 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
1756 Ptr: BasePtr, IdxList: IndexBase);
1757 Ptr = IC.Builder.CreateBitCast(V: Ptr, DestTy: VecPtrTy);
1758 CallInst *MaskedLoad =
1759 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1760 MaskedLoad->takeName(V: &II);
1761 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
1762 }
1763
1764 return std::nullopt;
1765}
1766
1767static std::optional<Instruction *>
1768instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1769 Value *Val = II.getOperand(i_nocapture: 0);
1770 Value *Mask = II.getOperand(i_nocapture: 1);
1771 Value *BasePtr = II.getOperand(i_nocapture: 2);
1772 Value *Index = II.getOperand(i_nocapture: 3);
1773 Type *Ty = Val->getType();
1774
1775 // Contiguous scatter => masked store.
1776 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1777 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1778 Value *IndexBase;
1779 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1780 m_Value(IndexBase), m_SpecificInt(1)))) {
1781 Align Alignment =
1782 BasePtr->getPointerAlignment(DL: II.getModule()->getDataLayout());
1783
1784 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
1785 Ptr: BasePtr, IdxList: IndexBase);
1786 Type *VecPtrTy = PointerType::getUnqual(ElementType: Ty);
1787 Ptr = IC.Builder.CreateBitCast(V: Ptr, DestTy: VecPtrTy);
1788
1789 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1790
1791 return IC.eraseInstFromFunction(I&: II);
1792 }
1793
1794 return std::nullopt;
1795}
1796
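// Replace a predicated signed divide by a (possibly negated) power-of-two
// splat with an arithmetic shift. Sketch with constants chosen purely for
// exposition:
//   sve.sdiv(%pg, %x, splat(8))   -->  sve.asrd(%pg, %x, 3)
//   sve.sdiv(%pg, %x, splat(-8))  -->  sve.neg of the sve.asrd result above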
1797static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1798 IntrinsicInst &II) {
1799 Type *Int32Ty = IC.Builder.getInt32Ty();
1800 Value *Pred = II.getOperand(i_nocapture: 0);
1801 Value *Vec = II.getOperand(i_nocapture: 1);
1802 Value *DivVec = II.getOperand(i_nocapture: 2);
1803
1804 Value *SplatValue = getSplatValue(V: DivVec);
1805 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue);
1806 if (!SplatConstantInt)
1807 return std::nullopt;
1808 APInt Divisor = SplatConstantInt->getValue();
1809
1810 if (Divisor.isPowerOf2()) {
1811 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
1812 auto ASRD = IC.Builder.CreateIntrinsic(
1813 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1814 return IC.replaceInstUsesWith(I&: II, V: ASRD);
1815 }
1816 if (Divisor.isNegatedPowerOf2()) {
1817 Divisor.negate();
1818 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
1819 auto ASRD = IC.Builder.CreateIntrinsic(
1820 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1821 auto NEG = IC.Builder.CreateIntrinsic(
1822 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
1823 return IC.replaceInstUsesWith(I&: II, V: NEG);
1824 }
1825
1826 return std::nullopt;
1827}
1828
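// Check whether the value sequence in Vec repeats every half, recursively
// shrinking it to the shortest repeating prefix, e.g. (A, B, A, B) becomes
// (A, B). Returns false if the size is not a power of two or the halves do
// not match. Null entries stand for poison lanes and, when AllowPoison is
// set, match any value.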
static bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1830 size_t VecSize = Vec.size();
1831 if (VecSize == 1)
1832 return true;
1833 if (!isPowerOf2_64(Value: VecSize))
1834 return false;
1835 size_t HalfVecSize = VecSize / 2;
1836
1837 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1838 RHS != Vec.end(); LHS++, RHS++) {
1839 if (*LHS != nullptr && *RHS != nullptr) {
1840 if (*LHS == *RHS)
1841 continue;
1842 else
1843 return false;
1844 }
1845 if (!AllowPoison)
1846 return false;
1847 if (*LHS == nullptr && *RHS != nullptr)
1848 *LHS = *RHS;
1849 }
1850
1851 Vec.resize(N: HalfVecSize);
1852 SimplifyValuePattern(Vec, AllowPoison);
1853 return true;
1854}
1855
1856// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1857// to dupqlane(f64(C)) where C is A concatenated with B
1858static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1859 IntrinsicInst &II) {
1860 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1861 if (!match(II.getOperand(0),
1862 m_Intrinsic<Intrinsic::vector_insert>(
1863 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1864 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1865 return std::nullopt;
1866 auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType());
1867
1868 // Insert the scalars into a container ordered by InsertElement index
1869 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1870 while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) {
1871 auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: 2));
1872 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: 1);
1873 CurrentInsertElt = InsertElt->getOperand(i_nocapture: 0);
1874 }
1875
1876 bool AllowPoison =
1877 isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default);
1878 if (!SimplifyValuePattern(Vec&: Elts, AllowPoison))
1879 return std::nullopt;
1880
1881 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1882 Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType());
1883 for (size_t I = 0; I < Elts.size(); I++) {
1884 if (Elts[I] == nullptr)
1885 continue;
1886 InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts[I],
1887 Idx: IC.Builder.getInt64(C: I));
1888 }
1889 if (InsertEltChain == nullptr)
1890 return std::nullopt;
1891
  // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one
  // i64 value or (f16 a, f16 b) as one i32 value. This requires the
  // InsertSubvector to be bitcast to a type wide enough to hold the sequence,
  // splatted, and then narrowed back to the original type.
1896 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1897 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1898 IIScalableTy->getMinNumElements() /
1899 PatternWidth;
1900
1901 IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth);
1902 auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount);
1903 auto *WideShuffleMaskTy =
1904 ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount);
1905
1906 auto ZeroIdx = ConstantInt::get(Ty: IC.Builder.getInt64Ty(), V: APInt(64, 0));
1907 auto InsertSubvector = IC.Builder.CreateInsertVector(
1908 DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain, Idx: ZeroIdx);
1909 auto WideBitcast =
1910 IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy);
1911 auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy);
1912 auto WideShuffle = IC.Builder.CreateShuffleVector(
1913 V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask);
1914 auto NarrowBitcast =
1915 IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType());
1916
1917 return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast);
1918}
1919
1920static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1921 IntrinsicInst &II) {
1922 Value *A = II.getArgOperand(i: 0);
1923 Value *B = II.getArgOperand(i: 1);
1924 if (A == B)
1925 return IC.replaceInstUsesWith(I&: II, V: A);
1926
1927 return std::nullopt;
1928}
1929
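// Sketch of the SRSHL -> LSL combine below (names for exposition only):
//   %a = sve.abs(%m, %pg1, %x)
//   %r = sve.srshl(%pg, %a, %amt)  -->  %r = sve.lsl(%pg, %a, %amt)
// subject to the predicate, merge-value and shift-amount checks documented
// in the function body.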
1930static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1931 IntrinsicInst &II) {
1932 Value *Pred = II.getOperand(i_nocapture: 0);
1933 Value *Vec = II.getOperand(i_nocapture: 1);
1934 Value *Shift = II.getOperand(i_nocapture: 2);
1935
1936 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1937 Value *AbsPred, *MergedValue;
1938 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1939 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1940 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1941 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
    return std::nullopt;
1944
1945 // Transform is valid if any of the following are true:
1946 // * The ABS merge value is an undef or non-negative
1947 // * The ABS predicate is all active
1948 // * The ABS predicate and the SRSHL predicates are the same
1949 if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) &&
1950 AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred))
1951 return std::nullopt;
1952
1953 // Only valid when the shift amount is non-negative, otherwise the rounding
1954 // behaviour of SRSHL cannot be ignored.
1955 if (!match(V: Shift, P: m_NonNegative()))
1956 return std::nullopt;
1957
1958 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
1959 {II.getType()}, {Pred, Vec, Shift});
1960
1961 return IC.replaceInstUsesWith(I&: II, V: LSL);
1962}
1963
1964std::optional<Instruction *>
1965AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1966 IntrinsicInst &II) const {
1967 Intrinsic::ID IID = II.getIntrinsicID();
1968 switch (IID) {
1969 default:
1970 break;
1971 case Intrinsic::aarch64_neon_fmaxnm:
1972 case Intrinsic::aarch64_neon_fminnm:
1973 return instCombineMaxMinNM(IC, II);
1974 case Intrinsic::aarch64_sve_convert_from_svbool:
1975 return instCombineConvertFromSVBool(IC, II);
1976 case Intrinsic::aarch64_sve_dup:
1977 return instCombineSVEDup(IC, II);
1978 case Intrinsic::aarch64_sve_dup_x:
1979 return instCombineSVEDupX(IC, II);
1980 case Intrinsic::aarch64_sve_cmpne:
1981 case Intrinsic::aarch64_sve_cmpne_wide:
1982 return instCombineSVECmpNE(IC, II);
1983 case Intrinsic::aarch64_sve_rdffr:
1984 return instCombineRDFFR(IC, II);
1985 case Intrinsic::aarch64_sve_lasta:
1986 case Intrinsic::aarch64_sve_lastb:
1987 return instCombineSVELast(IC, II);
1988 case Intrinsic::aarch64_sve_clasta_n:
1989 case Intrinsic::aarch64_sve_clastb_n:
1990 return instCombineSVECondLast(IC, II);
1991 case Intrinsic::aarch64_sve_cntd:
1992 return instCombineSVECntElts(IC, II, NumElts: 2);
1993 case Intrinsic::aarch64_sve_cntw:
1994 return instCombineSVECntElts(IC, II, NumElts: 4);
1995 case Intrinsic::aarch64_sve_cnth:
1996 return instCombineSVECntElts(IC, II, NumElts: 8);
1997 case Intrinsic::aarch64_sve_cntb:
1998 return instCombineSVECntElts(IC, II, NumElts: 16);
1999 case Intrinsic::aarch64_sve_ptest_any:
2000 case Intrinsic::aarch64_sve_ptest_first:
2001 case Intrinsic::aarch64_sve_ptest_last:
2002 return instCombineSVEPTest(IC, II);
2003 case Intrinsic::aarch64_sve_fabd:
2004 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2005 case Intrinsic::aarch64_sve_fadd:
2006 return instCombineSVEVectorFAdd(IC, II);
2007 case Intrinsic::aarch64_sve_fadd_u:
2008 return instCombineSVEVectorFAddU(IC, II);
2009 case Intrinsic::aarch64_sve_fdiv:
2010 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
2011 case Intrinsic::aarch64_sve_fmax:
2012 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
2013 case Intrinsic::aarch64_sve_fmaxnm:
2014 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
2015 case Intrinsic::aarch64_sve_fmin:
2016 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
2017 case Intrinsic::aarch64_sve_fminnm:
2018 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
2019 case Intrinsic::aarch64_sve_fmla:
2020 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2021 case Intrinsic::aarch64_sve_fmls:
2022 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2023 case Intrinsic::aarch64_sve_fmul:
2024 if (auto II_U =
2025 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2026 return II_U;
2027 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2028 case Intrinsic::aarch64_sve_fmul_u:
2029 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2030 case Intrinsic::aarch64_sve_fmulx:
2031 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2032 case Intrinsic::aarch64_sve_fnmla:
2033 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2034 case Intrinsic::aarch64_sve_fnmls:
2035 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2036 case Intrinsic::aarch64_sve_fsub:
2037 return instCombineSVEVectorFSub(IC, II);
2038 case Intrinsic::aarch64_sve_fsub_u:
2039 return instCombineSVEVectorFSubU(IC, II);
2040 case Intrinsic::aarch64_sve_add:
2041 return instCombineSVEVectorAdd(IC, II);
2042 case Intrinsic::aarch64_sve_add_u:
2043 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2044 Intrinsic::aarch64_sve_mla_u>(
2045 IC, II, true);
2046 case Intrinsic::aarch64_sve_mla:
2047 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2048 case Intrinsic::aarch64_sve_mls:
2049 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2050 case Intrinsic::aarch64_sve_mul:
2051 if (auto II_U =
2052 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2053 return II_U;
2054 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2055 case Intrinsic::aarch64_sve_mul_u:
2056 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2057 case Intrinsic::aarch64_sve_sabd:
2058 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2059 case Intrinsic::aarch64_sve_smax:
2060 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2061 case Intrinsic::aarch64_sve_smin:
2062 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2063 case Intrinsic::aarch64_sve_smulh:
2064 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2065 case Intrinsic::aarch64_sve_sub:
2066 return instCombineSVEVectorSub(IC, II);
2067 case Intrinsic::aarch64_sve_sub_u:
2068 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2069 Intrinsic::aarch64_sve_mls_u>(
2070 IC, II, true);
2071 case Intrinsic::aarch64_sve_uabd:
2072 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2073 case Intrinsic::aarch64_sve_umax:
2074 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2075 case Intrinsic::aarch64_sve_umin:
2076 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2077 case Intrinsic::aarch64_sve_umulh:
2078 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2079 case Intrinsic::aarch64_sve_asr:
2080 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2081 case Intrinsic::aarch64_sve_lsl:
2082 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2083 case Intrinsic::aarch64_sve_lsr:
2084 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2085 case Intrinsic::aarch64_sve_and:
2086 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2087 case Intrinsic::aarch64_sve_bic:
2088 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2089 case Intrinsic::aarch64_sve_eor:
2090 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2091 case Intrinsic::aarch64_sve_orr:
2092 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2093 case Intrinsic::aarch64_sve_sqsub:
2094 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2095 case Intrinsic::aarch64_sve_uqsub:
2096 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2097 case Intrinsic::aarch64_sve_tbl:
2098 return instCombineSVETBL(IC, II);
2099 case Intrinsic::aarch64_sve_uunpkhi:
2100 case Intrinsic::aarch64_sve_uunpklo:
2101 case Intrinsic::aarch64_sve_sunpkhi:
2102 case Intrinsic::aarch64_sve_sunpklo:
2103 return instCombineSVEUnpack(IC, II);
2104 case Intrinsic::aarch64_sve_uzp1:
2105 return instCombineSVEUzp1(IC, II);
2106 case Intrinsic::aarch64_sve_zip1:
2107 case Intrinsic::aarch64_sve_zip2:
2108 return instCombineSVEZip(IC, II);
2109 case Intrinsic::aarch64_sve_ld1_gather_index:
2110 return instCombineLD1GatherIndex(IC, II);
2111 case Intrinsic::aarch64_sve_st1_scatter_index:
2112 return instCombineST1ScatterIndex(IC, II);
2113 case Intrinsic::aarch64_sve_ld1:
2114 return instCombineSVELD1(IC, II, DL);
2115 case Intrinsic::aarch64_sve_st1:
2116 return instCombineSVEST1(IC, II, DL);
2117 case Intrinsic::aarch64_sve_sdiv:
2118 return instCombineSVESDIV(IC, II);
2119 case Intrinsic::aarch64_sve_sel:
2120 return instCombineSVESel(IC, II);
2121 case Intrinsic::aarch64_sve_srshl:
2122 return instCombineSVESrshl(IC, II);
2123 case Intrinsic::aarch64_sve_dupq_lane:
2124 return instCombineSVEDupqLane(IC, II);
2125 }
2126
2127 return std::nullopt;
2128}
2129
2130std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2131 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2132 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2133 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2134 SimplifyAndSetOp) const {
2135 switch (II.getIntrinsicID()) {
2136 default:
2137 break;
2138 case Intrinsic::aarch64_neon_fcvtxn:
2139 case Intrinsic::aarch64_neon_rshrn:
2140 case Intrinsic::aarch64_neon_sqrshrn:
2141 case Intrinsic::aarch64_neon_sqrshrun:
2142 case Intrinsic::aarch64_neon_sqshrn:
2143 case Intrinsic::aarch64_neon_sqshrun:
2144 case Intrinsic::aarch64_neon_sqxtn:
2145 case Intrinsic::aarch64_neon_sqxtun:
2146 case Intrinsic::aarch64_neon_uqrshrn:
2147 case Intrinsic::aarch64_neon_uqshrn:
2148 case Intrinsic::aarch64_neon_uqxtn:
2149 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2150 break;
2151 }
2152
2153 return std::nullopt;
2154}
2155
2156TypeSize
2157AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2158 switch (K) {
2159 case TargetTransformInfo::RGK_Scalar:
2160 return TypeSize::getFixed(ExactSize: 64);
2161 case TargetTransformInfo::RGK_FixedWidthVector:
2162 if (!ST->isNeonAvailable() && !EnableFixedwidthAutovecInStreamingMode)
2163 return TypeSize::getFixed(ExactSize: 0);
2164
2165 if (ST->hasSVE())
2166 return TypeSize::getFixed(
2167 ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: 128u));
2168
2169 return TypeSize::getFixed(ExactSize: ST->hasNEON() ? 128 : 0);
2170 case TargetTransformInfo::RGK_ScalableVector:
2171 if (!ST->isSVEAvailable() && !EnableScalableAutovecInStreamingMode)
2172 return TypeSize::getScalable(MinimumSize: 0);
2173
2174 return TypeSize::getScalable(MinimumSize: ST->hasSVE() ? 128 : 0);
2175 }
2176 llvm_unreachable("Unsupported register kind");
2177}
2178
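// Returns true when the add/sub/mul looks like it can be lowered to a NEON
// widening instruction, which makes the extension of its operand(s)
// effectively free. For example (sketch):
//   add(zext <8 x i8> %a to <8 x i16>, zext <8 x i8> %b to <8 x i16>)
// maps onto UADDL, and mul(sext, sext) onto SMULL. If SrcOverrideTy is given
// it overrides the source type deduced from the extend operands.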
2179bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2180 ArrayRef<const Value *> Args,
2181 Type *SrcOverrideTy) {
  // A helper that returns a vector type whose element type is ArgTy's scalar
  // type and whose element count matches that of DstTy.
2184 auto toVectorTy = [&](Type *ArgTy) {
2185 return VectorType::get(ElementType: ArgTy->getScalarType(),
2186 EC: cast<VectorType>(Val: DstTy)->getElementCount());
2187 };
2188
2189 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2190 // i32, i64]. SVE doesn't generally have the same set of instructions to
2191 // perform an extend with the add/sub/mul. There are SMULLB style
2192 // instructions, but they operate on top/bottom, requiring some sort of lane
2193 // interleaving to be used with zext/sext.
2194 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2195 if (!useNeonVector(Ty: DstTy) || Args.size() != 2 ||
2196 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2197 return false;
2198
2199 // Determine if the operation has a widening variant. We consider both the
2200 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2201 // instructions.
2202 //
2203 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2204 // verify that their extending operands are eliminated during code
2205 // generation.
2206 Type *SrcTy = SrcOverrideTy;
2207 switch (Opcode) {
2208 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2209 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2210 // The second operand needs to be an extend
2211 if (isa<SExtInst>(Val: Args[1]) || isa<ZExtInst>(Val: Args[1])) {
2212 if (!SrcTy)
2213 SrcTy =
2214 toVectorTy(cast<Instruction>(Val: Args[1])->getOperand(i: 0)->getType());
2215 } else
2216 return false;
2217 break;
2218 case Instruction::Mul: { // SMULL(2), UMULL(2)
2219 // Both operands need to be extends of the same type.
2220 if ((isa<SExtInst>(Val: Args[0]) && isa<SExtInst>(Val: Args[1])) ||
2221 (isa<ZExtInst>(Val: Args[0]) && isa<ZExtInst>(Val: Args[1]))) {
2222 if (!SrcTy)
2223 SrcTy =
2224 toVectorTy(cast<Instruction>(Val: Args[0])->getOperand(i: 0)->getType());
2225 } else if (isa<ZExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[1])) {
      // If one of the operands is a Zext and the other has enough zero bits
      // to be treated as unsigned, we can still generate a umull, meaning the
      // zext is free.
2229 KnownBits Known =
2230 computeKnownBits(V: isa<ZExtInst>(Val: Args[0]) ? Args[1] : Args[0], DL);
2231 if (Args[0]->getType()->getScalarSizeInBits() -
2232 Known.Zero.countLeadingOnes() >
2233 DstTy->getScalarSizeInBits() / 2)
2234 return false;
2235 if (!SrcTy)
2236 SrcTy = toVectorTy(Type::getIntNTy(C&: DstTy->getContext(),
2237 N: DstTy->getScalarSizeInBits() / 2));
2238 } else
2239 return false;
2240 break;
2241 }
2242 default:
2243 return false;
2244 }
2245
2246 // Legalize the destination type and ensure it can be used in a widening
2247 // operation.
2248 auto DstTyL = getTypeLegalizationCost(Ty: DstTy);
2249 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2250 return false;
2251
2252 // Legalize the source type and ensure it can be used in a widening
2253 // operation.
2254 assert(SrcTy && "Expected some SrcTy");
2255 auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy);
2256 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2257 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2258 return false;
2259
2260 // Get the total number of vector elements in the legalized types.
2261 InstructionCost NumDstEls =
2262 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2263 InstructionCost NumSrcEls =
2264 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2265
2266 // Return true if the legalized types have the same number of vector elements
2267 // and the destination element type size is twice that of the source type.
2268 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2269}
2270
2271// s/urhadd instructions implement the following pattern, making the
2272// extends free:
2273// %x = add ((zext i8 -> i16), 1)
2274// %y = (zext i8 -> i16)
2275// trunc i16 (lshr (add %x, %y), 1) -> i8
2276//
2277bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2278 Type *Src) {
2279 // The source should be a legal vector type.
2280 if (!Src->isVectorTy() || !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) ||
2281 (Src->isScalableTy() && !ST->hasSVE2()))
2282 return false;
2283
2284 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2285 return false;
2286
2287 // Look for trunc/shl/add before trying to match the pattern.
2288 const Instruction *Add = ExtUser;
2289 auto *AddUser =
2290 dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
2291 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2292 Add = AddUser;
2293
2294 auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
2295 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2296 return false;
2297
2298 auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser());
2299 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2300 Src->getScalarSizeInBits() !=
2301 cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits())
2302 return false;
2303
2304 // Try to match the whole pattern. Ext could be either the first or second
2305 // m_ZExtOrSExt matched.
2306 Instruction *Ex1, *Ex2;
2307 if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1),
2308 R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_SpecificInt(V: 1))))))
2309 return false;
2310
  // Ensure both extends are of the same kind (both sext or both zext).
2312 if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) &&
2313 Ex1->getOpcode() == Ex2->getOpcode())
2314 return true;
2315
2316 return false;
2317}
2318
2319InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2320 Type *Src,
2321 TTI::CastContextHint CCH,
2322 TTI::TargetCostKind CostKind,
2323 const Instruction *I) {
2324 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2325 assert(ISD && "Invalid opcode");
2326 // If the cast is observable, and it is used by a widening instruction (e.g.,
2327 // uaddl, saddw, etc.), it may be free.
2328 if (I && I->hasOneUser()) {
2329 auto *SingleUser = cast<Instruction>(Val: *I->user_begin());
2330 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2331 if (isWideningInstruction(DstTy: Dst, Opcode: SingleUser->getOpcode(), Args: Operands, SrcOverrideTy: Src)) {
      // For adds, the extend feeding the second operand is always free. The
      // extend feeding the first operand is only free if the second operand
      // is an extend of the same kind, so in add(sext, zext) only the zext
      // is free.
2335 if (SingleUser->getOpcode() == Instruction::Add) {
2336 if (I == SingleUser->getOperand(i: 1) ||
2337 (isa<CastInst>(Val: SingleUser->getOperand(i: 1)) &&
2338 cast<CastInst>(Val: SingleUser->getOperand(i: 1))->getOpcode() == Opcode))
2339 return 0;
2340 } else // Others are free so long as isWideningInstruction returned true.
2341 return 0;
2342 }
2343
2344 // The cast will be free for the s/urhadd instructions
2345 if ((isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) &&
2346 isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src))
2347 return 0;
2348 }
2349
2350 // TODO: Allow non-throughput costs that aren't binary.
2351 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2352 if (CostKind != TTI::TCK_RecipThroughput)
2353 return Cost == 0 ? 0 : 1;
2354 return Cost;
2355 };
2356
2357 EVT SrcTy = TLI->getValueType(DL, Ty: Src);
2358 EVT DstTy = TLI->getValueType(DL, Ty: Dst);
2359
2360 if (!SrcTy.isSimple() || !DstTy.isSimple())
2361 return AdjustCost(
2362 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2363
2364 static const TypeConversionCostTblEntry
2365 ConversionTbl[] = {
2366 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2367 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2368 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2369 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2370 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2371 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2372 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2373 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2374 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2375 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2376 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2377 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2378 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2379 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2380 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2381 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2382 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2383 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2384 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2385 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2386
2387 // Truncations on nxvmiN
2388 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
2389 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
2390 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
2391 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
2392 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
2393 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
2394 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
2395 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
2396 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
2397 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
2398 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
2399 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
2400 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
2401 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
2402 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
2403 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
2404
2405 // The number of shll instructions for the extension.
2406 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2407 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2408 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2409 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2410 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2411 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2412 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2413 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2414 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2415 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2416 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2417 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2418 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2419 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2420 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2421 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2422
2423 // LowerVectorINT_TO_FP:
2424 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2425 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2426 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2427 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2428 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2429 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2430
2431 // Complex: to v2f32
2432 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2433 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2434 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2435 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2436 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2437 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2438
2439 // Complex: to v4f32
2440 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
2441 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2442 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
2443 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2444
2445 // Complex: to v8f32
2446 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2447 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2448 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2449 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2450
2451 // Complex: to v16f32
2452 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2453 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2454
2455 // Complex: to v2f64
2456 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2457 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2458 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2459 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2460 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2461 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2462
2463 // Complex: to v4f64
2464 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2465 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2466
2467 // LowerVectorFP_TO_INT
2468 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
2469 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2470 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2471 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
2472 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2473 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2474
2475 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2476 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
2477 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
2478 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
2479 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
2480 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
2481 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
2482
2483 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2484 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2485 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
2486 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
2487 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
2488
2489 // Complex, from nxv2f32.
2490 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2491 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2492 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2493 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2494 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2495 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2496 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2497 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2498
2499 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2500 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
2501 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2502 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
2503 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
2504 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
2505 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
2506
2507 // Complex, from nxv2f64.
2508 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2509 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2510 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2511 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2512 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2513 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2514 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2515 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2516
2517 // Complex, from nxv4f32.
2518 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2519 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2520 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2521 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2522 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2523 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2524 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2525 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2526
2527 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2528 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2529 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2530 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2531 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2532
2533 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2534 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2535 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2536 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2537 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2538 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2539 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2540
2541 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2542 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2543 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2544 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2545 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2546
2547 // Complex, from nxv8f16.
2548 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2549 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2550 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2551 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2552 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2553 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2554 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2555 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2556
2557 // Complex, from nxv4f16.
2558 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2559 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2560 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2561 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2562 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2563 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2564 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2565 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2566
2567 // Complex, from nxv2f16.
2568 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2569 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2570 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2571 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2572 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2573 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2574 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2575 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2576
2577 // Truncate from nxvmf32 to nxvmf16.
2578 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2579 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2580 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2581
2582 // Truncate from nxvmf64 to nxvmf16.
2583 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2584 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2585 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2586
2587 // Truncate from nxvmf64 to nxvmf32.
2588 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2589 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2590 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2591
2592 // Extend from nxvmf16 to nxvmf32.
2593 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2594 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2595 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2596
2597 // Extend from nxvmf16 to nxvmf64.
2598 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2599 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2600 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2601
2602 // Extend from nxvmf32 to nxvmf64.
2603 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2604 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2605 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2606
2607 // Bitcasts from float to integer
2608 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2609 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2610 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2611
2612 // Bitcasts from integer to float
2613 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2614 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2615 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2616
    // Add cost for extending to illegal (too wide) scalable vectors.
2618 // zero/sign extend are implemented by multiple unpack operations,
2619 // where each operation has a cost of 1.
2620 { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2621 { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2622 { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2623 { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2624 { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2625 { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2626
2627 { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2628 { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2629 { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2630 { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2631 { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2632 { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2633 };
2634
  // When fixed-length vectors are lowered using SVE registers, estimate the
  // cost as that of the equivalent operation on scalable types, scaled by the
  // number of SVE registers required to hold the fixed-length type.
2638 EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy;
2639 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
2640 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
2641 ST->useSVEForFixedLengthVectors(VT: WiderTy)) {
2642 std::pair<InstructionCost, MVT> LT =
2643 getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext()));
2644 unsigned NumElements = AArch64::SVEBitsPerBlock /
2645 LT.second.getVectorElementType().getSizeInBits();
2646 return AdjustCost(
2647 LT.first *
2648 getCastInstrCost(
2649 Opcode, Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements),
2650 Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH,
2651 CostKind, I));
2652 }
2653
2654 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2655 DstTy.getSimpleVT(),
2656 SrcTy.getSimpleVT()))
2657 return AdjustCost(Entry->Cost);
2658
2659 static const TypeConversionCostTblEntry FP16Tbl[] = {
2660 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2661 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2662 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2663 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2664 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2665 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2666 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2667 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2668 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2669 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2670 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2671 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2672 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2673 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2674 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2675 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2676 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2677 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2678 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2679 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2680 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2681 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2682 };
2683
2684 if (ST->hasFullFP16())
2685 if (const auto *Entry = ConvertCostTableLookup(
2686 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2687 return AdjustCost(Entry->Cost);
2688
2689 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2690 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2691 TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) ==
2692 TargetLowering::TypePromoteInteger &&
2693 TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) ==
2694 TargetLowering::TypeSplitVector) {
2695 // The standard behaviour in the backend for these cases is to split the
2696 // extend up into two parts:
2697 // 1. Perform an extending load or masked load up to the legal type.
2698 // 2. Extend the loaded data to the final type.
2699 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src);
2700 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Context&: Src->getContext());
2701 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
2702 Opcode, Dst: LegalTy, Src, CCH, CostKind, I);
2703 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
2704 Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I);
2705 return Part1 + Part2;
2706 }
2707
  // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
  // but we also want to include the TTI::CastContextHint::Masked case.
2710 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2711 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2712 TLI->isTypeLegal(VT: DstTy))
2713 CCH = TTI::CastContextHint::Normal;
2714
2715 return AdjustCost(
2716 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2717}
2718
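// Worked example for the extract+extend costing below (sketch): for
//   %e = extractelement <4 x i32> %v, i32 1
//   %s = sext i32 %e to i64
// the sign extension folds into the SMOV performing the lane move, so only
// the extract itself is costed.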
2719InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2720 Type *Dst,
2721 VectorType *VecTy,
2722 unsigned Index) {
2723
2724 // Make sure we were given a valid extend opcode.
2725 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2726 "Invalid opcode");
2727
2728 // We are extending an element we extract from a vector, so the source type
2729 // of the extend is the element type of the vector.
2730 auto *Src = VecTy->getElementType();
2731
2732 // Sign- and zero-extends are for integer types only.
2733 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2734
2735 // Get the cost for the extract. We compute the cost (if any) for the extend
2736 // below.
2737 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2738 InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
2739 CostKind, Index, Op0: nullptr, Op1: nullptr);
2740
2741 // Legalize the types.
2742 auto VecLT = getTypeLegalizationCost(Ty: VecTy);
2743 auto DstVT = TLI->getValueType(DL, Ty: Dst);
2744 auto SrcVT = TLI->getValueType(DL, Ty: Src);
2745
2746 // If the resulting type is still a vector and the destination type is legal,
2747 // we may get the extension for free. If not, get the default cost for the
2748 // extend.
2749 if (!VecLT.second.isVector() || !TLI->isTypeLegal(VT: DstVT))
2750 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
2751 CostKind);
2752
2753 // The destination type should be larger than the element type. If not, get
2754 // the default cost for the extend.
2755 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2756 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
2757 CostKind);
2758
2759 switch (Opcode) {
2760 default:
2761 llvm_unreachable("Opcode should be either SExt or ZExt");
2762
2763 // For sign-extends, we only need a smov, which performs the extension
2764 // automatically.
2765 case Instruction::SExt:
2766 return Cost;
2767
2768 // For zero-extends, the extend is performed automatically by a umov unless
2769 // the destination type is i64 and the element type is i8 or i16.
2770 case Instruction::ZExt:
2771 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2772 return Cost;
2773 }
2774
2775 // If we are unable to perform the extend for free, get the default cost.
2776 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
2777 CostKind);
2778}
2779
2780InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2781 TTI::TargetCostKind CostKind,
2782 const Instruction *I) {
2783 if (CostKind != TTI::TCK_RecipThroughput)
2784 return Opcode == Instruction::PHI ? 0 : 1;
2785 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2786 // Branches are assumed to be predicted.
2787 return 0;
2788}
2789
2790InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2791 Type *Val,
2792 unsigned Index,
2793 bool HasRealUse) {
2794 assert(Val->isVectorTy() && "This must be a vector type");
2795
2796 if (Index != -1U) {
2797 // Legalize the type.
2798 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);
2799
2800 // This type is legalized to a scalar type.
2801 if (!LT.second.isVector())
2802 return 0;
2803
2804 // The type may be split. For fixed-width vectors we can normalize the
2805 // index to the new type.
2806 if (LT.second.isFixedLengthVector()) {
2807 unsigned Width = LT.second.getVectorNumElements();
2808 Index = Index % Width;
2809 }
2810
2811 // The element at index zero is already inside the vector.
2812 // - For a physical (HasRealUse==true) insert-element or extract-element
2813 // instruction that extracts integers, an explicit FPR -> GPR move is
2814 // needed. So it has non-zero cost.
2815 // - For all other cases (a virtual instruction or a floating-point element
2816 // type), consider the instruction free.
2817 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2818 return 0;
2819
2820 // This recognizes an LD1 (load one single-element structure to one lane of
2821 // one register) instruction. I.e., if this is an `insertelement` instruction
2822 // and its second operand is a load, then we will generate an LD1, which is
2823 // an expensive instruction.
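// As a rough illustration of the expected codegen, an IR pattern like
//   %v.1 = insertelement <4 x i32> %v, i32 %ld, i32 1   ; %ld is a load
// is expected to become a single-lane LD1 such as "ld1 { v0.s }[1], [x0]".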
2824 if (I && dyn_cast<LoadInst>(Val: I->getOperand(i: 1)))
2825 return ST->getVectorInsertExtractBaseCost() + 1;
2826
2827 // i1 inserts and extracts will include an extra cset or cmp of the vector
2828 // value. Increase the cost by 1 to account for this.
2829 if (Val->getScalarSizeInBits() == 1)
2830 return ST->getVectorInsertExtractBaseCost() + 1;
2831
2832 // FIXME:
2833 // If the extract-element and insert-element instructions could be
2834 // simplified away (e.g., could be combined into users by looking at use-def
2835 // context), they have no cost. This is not done in the first place for
2836 // compile-time considerations.
2837 }
2838
2839 // All other insert/extracts cost this much.
2840 return ST->getVectorInsertExtractBaseCost();
2841}
2842
2843InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2844 TTI::TargetCostKind CostKind,
2845 unsigned Index, Value *Op0,
2846 Value *Op1) {
2847 bool HasRealUse =
2848 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Val: Op0);
2849 return getVectorInstrCostHelper(I: nullptr, Val, Index, HasRealUse);
2850}
2851
2852InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
2853 Type *Val,
2854 TTI::TargetCostKind CostKind,
2855 unsigned Index) {
2856 return getVectorInstrCostHelper(I: &I, Val, Index, HasRealUse: true /* HasRealUse */);
2857}
2858
2859InstructionCost AArch64TTIImpl::getScalarizationOverhead(
2860 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
2861 TTI::TargetCostKind CostKind) {
2862 if (isa<ScalableVectorType>(Val: Ty))
2863 return InstructionCost::getInvalid();
2864 if (Ty->getElementType()->isFloatingPointTy())
2865 return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract,
2866 CostKind);
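// As an illustration of the formula below, a <4 x i32> vector with all four
// lanes demanded and Insert only is costed at 4 * the subtarget's
// insert/extract base cost.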
2867 return DemandedElts.popcount() * (Insert + Extract) *
2868 ST->getVectorInsertExtractBaseCost();
2869}
2870
2871InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
2872 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2873 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2874 ArrayRef<const Value *> Args,
2875 const Instruction *CxtI) {
2876
2877 // TODO: Handle more cost kinds.
2878 if (CostKind != TTI::TCK_RecipThroughput)
2879 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
2880 Opd2Info: Op2Info, Args, CxtI);
2881
2882 // Legalize the type.
2883 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2884 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2885
2886 switch (ISD) {
2887 default:
2888 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
2889 Opd2Info: Op2Info);
2890 case ISD::SDIV:
2891 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2892 // On AArch64, scalar signed division by a power-of-two constant is
2893 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2894 // The OperandValue properties may not be the same as those of the
2895 // previous operation; conservatively assume OP_None.
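// For example, a uniform 'sdiv i32 %x, 8' is modelled as the sum of the four
// component costs (add, sub, select, ashr) computed below.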
2896 InstructionCost Cost = getArithmeticInstrCost(
2897 Opcode: Instruction::Add, Ty, CostKind,
2898 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
2899 Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind,
2900 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
2901 Cost += getArithmeticInstrCost(
2902 Opcode: Instruction::Select, Ty, CostKind,
2903 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
2904 Cost += getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
2905 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
2906 return Cost;
2907 }
2908 [[fallthrough]];
2909 case ISD::UDIV: {
2910 if (Op2Info.isConstant() && Op2Info.isUniform()) {
2911 auto VT = TLI->getValueType(DL, Ty);
2912 if (TLI->isOperationLegalOrCustom(Op: ISD::MULHU, VT)) {
2913 // Vector signed division by a constant is expanded to the
2914 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
2915 // to MULHU + SUB + SRL + ADD + SRL.
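// The expression returned below approximates either sequence with two
// multiplies, two adds, two shifts and one extra fix-up instruction.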
2916 InstructionCost MulCost = getArithmeticInstrCost(
2917 Opcode: Instruction::Mul, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
2918 InstructionCost AddCost = getArithmeticInstrCost(
2919 Opcode: Instruction::Add, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
2920 InstructionCost ShrCost = getArithmeticInstrCost(
2921 Opcode: Instruction::AShr, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
2922 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2923 }
2924 }
2925
2926 InstructionCost Cost = BaseT::getArithmeticInstrCost(
2927 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
2928 if (Ty->isVectorTy()) {
2929 if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) {
2930 // When SDIV/UDIV operations are lowered using SVE, the cost is
2931 // lower.
2932 if (isa<FixedVectorType>(Val: Ty) && cast<FixedVectorType>(Val: Ty)
2933 ->getPrimitiveSizeInBits()
2934 .getFixedValue() < 128) {
2935 EVT VT = TLI->getValueType(DL, Ty);
2936 static const CostTblEntry DivTbl[]{
2937 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
2938 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
2939 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
2940 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
2941 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
2942 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
2943
2944 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2945 if (nullptr != Entry)
2946 return Entry->Cost;
2947 }
2948 // For 8/16-bit elements, the cost is higher because the type
2949 // requires promotion and possibly splitting:
2950 if (LT.second.getScalarType() == MVT::i8)
2951 Cost *= 8;
2952 else if (LT.second.getScalarType() == MVT::i16)
2953 Cost *= 4;
2954 return Cost;
2955 } else {
2956 // If one of the operands is a uniform constant then the cost for each
2957 // element is the cost of insertion, extraction and division.
2958 // Insertion cost = 2, extraction cost = 2, division = the cost of the
2959 // operation on the scalar type.
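// For instance, a <4 x i32> udiv with one uniform constant operand is costed
// below as 4 * (4 + scalar udiv cost).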
2960 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2961 (Op2Info.isConstant() && Op2Info.isUniform())) {
2962 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
2963 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
2964 Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
2965 return (4 + DivCost) * VTy->getNumElements();
2966 }
2967 }
2968 // On AArch64, without SVE, vector divisions are expanded
2969 // into a scalar division for each pair of operand elements.
2970 Cost += getArithmeticInstrCost(Opcode: Instruction::ExtractElement, Ty,
2971 CostKind, Op1Info, Op2Info);
2972 Cost += getArithmeticInstrCost(Opcode: Instruction::InsertElement, Ty, CostKind,
2973 Op1Info, Op2Info);
2974 }
2975
2976 // TODO: if one of the arguments is scalar, then it's not necessary to
2977 // double the cost of handling the vector elements.
2978 Cost += Cost;
2979 }
2980 return Cost;
2981 }
2982 case ISD::MUL:
2983 // When SVE is available, we can lower the v2i64 operation using
2984 // the SVE mul instruction, which has a lower cost.
2985 if (LT.second == MVT::v2i64 && ST->hasSVE())
2986 return LT.first;
2987
2988 // When SVE is not available, there is no MUL.2d instruction,
2989 // which means mul <2 x i64> is expensive as elements are extracted
2990 // from the vectors and the muls scalarized.
2991 // As getScalarizationOverhead is a bit too pessimistic, we
2992 // estimate the cost for a i64 vector directly here, which is:
2993 // - four 2-cost i64 extracts,
2994 // - two 2-cost i64 inserts, and
2995 // - two 1-cost muls.
2996 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
2997 // LT.first = 2 the cost is 28. If both operands are extensions it will not
2998 // need to scalarize so the cost can be cheaper (smull or umull).
3000 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3001 return LT.first;
3002 return LT.first * 14;
3003 case ISD::ADD:
3004 case ISD::XOR:
3005 case ISD::OR:
3006 case ISD::AND:
3007 case ISD::SRL:
3008 case ISD::SRA:
3009 case ISD::SHL:
3010 // These nodes are marked as 'custom' for combining purposes only.
3011 // We know that they are legal. See LowerAdd in ISelLowering.
3012 return LT.first;
3013
3014 case ISD::FNEG:
3015 case ISD::FADD:
3016 case ISD::FSUB:
3017 // Increase the cost for half and bfloat types if not architecturally
3018 // supported.
3019 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3020 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3021 return 2 * LT.first;
3022 if (!Ty->getScalarType()->isFP128Ty())
3023 return LT.first;
3024 [[fallthrough]];
3025 case ISD::FMUL:
3026 case ISD::FDIV:
3027 // These nodes are marked as 'custom' just to lower them to SVE.
3028 // We know said lowering will incur no additional cost.
3029 if (!Ty->getScalarType()->isFP128Ty())
3030 return 2 * LT.first;
3031
3032 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
3033 Opd2Info: Op2Info);
3034 case ISD::FREM:
3035 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3036 // those functions are not declared in the module.
3037 if (!Ty->isVectorTy())
3038 return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);
3039 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
3040 Opd2Info: Op2Info);
3041 }
3042}
3043
3044InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
3045 ScalarEvolution *SE,
3046 const SCEV *Ptr) {
3047 // Address computations in vectorized code with non-consecutive addresses will
3048 // likely result in more instructions compared to scalar code where the
3049 // computation can more often be merged into the index mode. The resulting
3050 // extra micro-ops can significantly decrease throughput.
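// For example, a vector address whose stride is not a compile-time constant
// within the merge distance below is costed at NeonNonConstStrideOverhead
// rather than 1.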
3051 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3052 int MaxMergeDistance = 64;
3053
3054 if (Ty->isVectorTy() && SE &&
3055 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1))
3056 return NumVectorInstToHideOverhead;
3057
3058 // In many cases the address computation is not merged into the instruction
3059 // addressing mode.
3060 return 1;
3061}
3062
3063InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3064 Type *CondTy,
3065 CmpInst::Predicate VecPred,
3066 TTI::TargetCostKind CostKind,
3067 const Instruction *I) {
3068 // TODO: Handle other cost kinds.
3069 if (CostKind != TTI::TCK_RecipThroughput)
3070 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3071 I);
3072
3073 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3074 // Some vector selects that are wider than the register width are not
3075 // lowered well.
3076 if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SELECT) {
3077 // We would need this many instructions to hide the scalarization happening.
3078 const int AmortizationCost = 20;
3079
3080 // If VecPred is not set, check if we can get a predicate from the context
3081 // instruction, if its type matches the requested ValTy.
3082 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3083 CmpInst::Predicate CurrentPred;
3084 if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(),
3085 R: m_Value())))
3086 VecPred = CurrentPred;
3087 }
3088 // Check if we have a compare/select chain that can be lowered using
3089 // a (F)CMxx & BFI pair.
3090 if (CmpInst::isIntPredicate(P: VecPred) || VecPred == CmpInst::FCMP_OLE ||
3091 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3092 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3093 VecPred == CmpInst::FCMP_UNE) {
3094 static const auto ValidMinMaxTys = {
3095 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3096 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3097 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3098
3099 auto LT = getTypeLegalizationCost(Ty: ValTy);
3100 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3101 (ST->hasFullFP16() &&
3102 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3103 return LT.first;
3104 }
3105
3106 static const TypeConversionCostTblEntry
3107 VectorSelectTbl[] = {
3108 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3109 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3110 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3111 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3112 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3113 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3114 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3115 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3116 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3117 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3118 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3119 };
3120
3121 EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy);
3122 EVT SelValTy = TLI->getValueType(DL, Ty: ValTy);
3123 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3124 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3125 SelCondTy.getSimpleVT(),
3126 SelValTy.getSimpleVT()))
3127 return Entry->Cost;
3128 }
3129 }
3130
3131 if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SETCC) {
3132 auto LT = getTypeLegalizationCost(Ty: ValTy);
3133 // Cost a v4f16 FCmp without FP16 support by converting to v4f32 and back.
3134 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3135 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3136 }
3137
3138 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3139 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3140 // be profitable.
3141 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3142 ICmpInst::isEquality(P: VecPred) &&
3143 TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) &&
3144 match(V: I->getOperand(i: 1), P: m_Zero()) &&
3145 match(V: I->getOperand(i: 0), P: m_And(L: m_Value(), R: m_Value())))
3146 return 0;
3147
3148 // The base case handles scalable vectors fine for now, since it treats the
3149 // cost as 1 * legalization cost.
3150 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3151}
3152
3153AArch64TTIImpl::TTI::MemCmpExpansionOptions
3154AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3155 TTI::MemCmpExpansionOptions Options;
3156 if (ST->requiresStrictAlign()) {
3157 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3158 // a bunch of instructions when strict align is enabled.
3159 return Options;
3160 }
3161 Options.AllowOverlappingLoads = true;
3162 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3163 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3164 // TODO: Though vector loads usually perform well on AArch64, in some targets
3165 // they may wake up the FP unit, which raises the power consumption. Perhaps
3166 // they could be used with no holds barred (-O3).
3167 Options.LoadSizes = {8, 4, 2, 1};
3168 Options.AllowedTailExpansions = {3, 5, 6};
3169 return Options;
3170}
3171
3172bool AArch64TTIImpl::prefersVectorizedAddressing() const {
3173 return ST->hasSVE();
3174}
3175
3176InstructionCost
3177AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
3178 Align Alignment, unsigned AddressSpace,
3179 TTI::TargetCostKind CostKind) {
3180 if (useNeonVector(Ty: Src))
3181 return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace,
3182 CostKind);
3183 auto LT = getTypeLegalizationCost(Ty: Src);
3184 if (!LT.first.isValid())
3185 return InstructionCost::getInvalid();
3186
3187 // The code-generator is currently not able to handle scalable vectors
3188 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3189 // it. This change will be removed when code-generation for these types is
3190 // sufficiently reliable.
3191 if (cast<VectorType>(Val: Src)->getElementCount() == ElementCount::getScalable(MinVal: 1))
3192 return InstructionCost::getInvalid();
3193
3194 return LT.first;
3195}
3196
3197static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
3198 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
3199}
3200
3201InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
3202 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3203 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3204 if (useNeonVector(Ty: DataTy) || !isLegalMaskedGatherScatter(DataType: DataTy))
3205 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3206 Alignment, CostKind, I);
3207 auto *VT = cast<VectorType>(Val: DataTy);
3208 auto LT = getTypeLegalizationCost(Ty: DataTy);
3209 if (!LT.first.isValid())
3210 return InstructionCost::getInvalid();
3211
3212 if (!LT.second.isVector() ||
3213 !isElementTypeLegalForScalableVector(Ty: VT->getElementType()))
3214 return InstructionCost::getInvalid();
3215
3216 // The code-generator is currently not able to handle scalable vectors
3217 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3218 // it. This change will be removed when code-generation for these types is
3219 // sufficiently reliable.
3220 if (cast<VectorType>(Val: DataTy)->getElementCount() ==
3221 ElementCount::getScalable(MinVal: 1))
3222 return InstructionCost::getInvalid();
3223
3224 ElementCount LegalVF = LT.second.getVectorElementCount();
3225 InstructionCost MemOpCost =
3226 getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: 0, CostKind,
3227 OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
3228 // Add on an overhead cost for using gathers/scatters.
3229 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
3230 // point we may want a per-CPU overhead.
3231 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
3232 return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF);
3233}
3234
3235bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
3236 return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors();
3237}
3238
3239InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3240 MaybeAlign Alignment,
3241 unsigned AddressSpace,
3242 TTI::TargetCostKind CostKind,
3243 TTI::OperandValueInfo OpInfo,
3244 const Instruction *I) {
3245 EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true);
3246 // Type legalization can't handle structs
3247 if (VT == MVT::Other)
3248 return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
3249 CostKind);
3250
3251 auto LT = getTypeLegalizationCost(Ty);
3252 if (!LT.first.isValid())
3253 return InstructionCost::getInvalid();
3254
3255 // The code-generator is currently not able to handle scalable vectors
3256 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3257 // it. This change will be removed when code-generation for these types is
3258 // sufficiently reliable.
3259 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
3260 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
3261 return InstructionCost::getInvalid();
3262
3263 // TODO: consider latency as well for TCK_SizeAndLatency.
3264 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
3265 return LT.first;
3266
3267 if (CostKind != TTI::TCK_RecipThroughput)
3268 return 1;
3269
3270 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3271 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3272 // Unaligned stores are extremely inefficient. We don't split all
3273 // unaligned 128-bit stores because of the negative impact that has been
3274 // shown in practice on inlined block copy code.
3275 // We make such stores expensive so that we will only vectorize if there
3276 // are 6 other instructions getting vectorized.
3277 const int AmortizationCost = 6;
3278
3279 return LT.first * 2 * AmortizationCost;
3280 }
3281
3282 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3283 if (Ty->isPtrOrPtrVectorTy())
3284 return LT.first;
3285
3286 if (useNeonVector(Ty)) {
3287 // Check truncating stores and extending loads.
3288 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3289 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3290 if (VT == MVT::v4i8)
3291 return 2;
3292 // Otherwise we need to scalarize.
3293 return cast<FixedVectorType>(Val: Ty)->getNumElements() * 2;
3294 }
3295 EVT EltVT = VT.getVectorElementType();
3296 unsigned EltSize = EltVT.getScalarSizeInBits();
3297 if (!isPowerOf2_32(Value: EltSize) || EltSize < 8 || EltSize > 64 ||
3298 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3299 *Alignment != Align(1))
3300 return LT.first;
3301 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3302 // widening to v4i8, which produces suboptimal results.
3303 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3304 return LT.first;
3305
3306 // Check non-power-of-2 loads/stores for legal vector element types with
3307 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3308 // operations on smaller power-of-2 ops, including ld1/st1.
3309 LLVMContext &C = Ty->getContext();
3310 InstructionCost Cost(0);
3311 SmallVector<EVT> TypeWorklist;
3312 TypeWorklist.push_back(Elt: VT);
3313 while (!TypeWorklist.empty()) {
3314 EVT CurrVT = TypeWorklist.pop_back_val();
3315 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3316 if (isPowerOf2_32(Value: CurrNumElements)) {
3317 Cost += 1;
3318 continue;
3319 }
3320
3321 unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / 2;
3322 TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2));
3323 TypeWorklist.push_back(
3324 Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2));
3325 }
3326 return Cost;
3327 }
3328
3329 return LT.first;
3330}
3331
3332InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
3333 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3334 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3335 bool UseMaskForCond, bool UseMaskForGaps) {
3336 assert(Factor >= 2 && "Invalid interleave factor");
3337 auto *VecVTy = cast<VectorType>(Val: VecTy);
3338
3339 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3340 return InstructionCost::getInvalid();
3341
3342 // Vectorization for masked interleaved accesses is only enabled for scalable
3343 // VF.
3344 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3345 return InstructionCost::getInvalid();
3346
3347 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3348 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3349 auto *SubVecTy =
3350 VectorType::get(ElementType: VecVTy->getElementType(),
3351 EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor));
3352
3353 // ldN/stN only support legal vector types of size 64 or 128 in bits.
3354 // Accesses having vector types that are a multiple of 128 bits can be
3355 // matched to more than one ldN/stN instruction.
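// As a worked example, assuming the usual NEON legalization, an interleaved
// load of <12 x i32> with Factor == 3 uses SubVecTy == <4 x i32>, a single
// legal 128-bit access, giving a cost of 3 * 1 = 3.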
3356 bool UseScalable;
3357 if (MinElts % Factor == 0 &&
3358 TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
3359 return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
3360 }
3361
3362 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3363 Alignment, AddressSpace, CostKind,
3364 UseMaskForCond, UseMaskForGaps);
3365}
3366
3367InstructionCost
3368AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
3369 InstructionCost Cost = 0;
3370 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3371 for (auto *I : Tys) {
3372 if (!I->isVectorTy())
3373 continue;
3374 if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() ==
3375 128)
3376 Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind) +
3377 getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind);
3378 }
3379 return Cost;
3380}
3381
3382unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
3383 return ST->getMaxInterleaveFactor();
3384}
3385
3386// For Falkor, we want to avoid having too many strided loads in a loop since
3387// that can exhaust the HW prefetcher resources. We adjust the unroller
3388// MaxCount preference below to attempt to ensure unrolling doesn't create too
3389// many strided loads.
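// As a worked example, if a loop is found to contain 3 strided loads, the
// unroller's MaxCount is set to 1 << Log2_32(7 / 3) == 2.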
3390static void
3391getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3392 TargetTransformInfo::UnrollingPreferences &UP) {
3393 enum { MaxStridedLoads = 7 };
3394 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
3395 int StridedLoads = 0;
3396 // FIXME? We could make this more precise by looking at the CFG and
3397 // e.g. not counting loads in each side of an if-then-else diamond.
3398 for (const auto BB : L->blocks()) {
3399 for (auto &I : *BB) {
3400 LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I);
3401 if (!LMemI)
3402 continue;
3403
3404 Value *PtrValue = LMemI->getPointerOperand();
3405 if (L->isLoopInvariant(V: PtrValue))
3406 continue;
3407
3408 const SCEV *LSCEV = SE.getSCEV(V: PtrValue);
3409 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV);
3410 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3411 continue;
3412
3413 // FIXME? We could take pairing of unrolled load copies into account
3414 // by looking at the AddRec, but we would probably have to limit this
3415 // to loops with no stores or other memory optimization barriers.
3416 ++StridedLoads;
3417 // We've seen enough strided loads that seeing more won't make a
3418 // difference.
3419 if (StridedLoads > MaxStridedLoads / 2)
3420 return StridedLoads;
3421 }
3422 }
3423 return StridedLoads;
3424 };
3425
3426 int StridedLoads = countStridedLoads(L, SE);
3427 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3428 << " strided loads\n");
3429 // Pick the largest power of 2 unroll count that won't result in too many
3430 // strided loads.
3431 if (StridedLoads) {
3432 UP.MaxCount = 1 << Log2_32(Value: MaxStridedLoads / StridedLoads);
3433 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3434 << UP.MaxCount << '\n');
3435 }
3436}
3437
3438void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3439 TTI::UnrollingPreferences &UP,
3440 OptimizationRemarkEmitter *ORE) {
3441 // Enable partial unrolling and runtime unrolling.
3442 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3443
3444 UP.UpperBound = true;
3445
3446 // An inner loop is more likely to be hot, and its runtime check can be
3447 // hoisted out by the LICM pass, so the overhead is lower; try a larger
3448 // threshold to unroll more loops.
3449 if (L->getLoopDepth() > 1)
3450 UP.PartialThreshold *= 2;
3451
3452 // Disable partial & runtime unrolling on -Os.
3453 UP.PartialOptSizeThreshold = 0;
3454
3455 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3456 EnableFalkorHWPFUnrollFix)
3457 getFalkorUnrollingPreferences(L, SE, UP);
3458
3459 // Scan the loop: don't unroll loops with calls as this could prevent
3460 // inlining. Don't unroll vector loops either, as they don't benefit much from
3461 // unrolling.
3462 for (auto *BB : L->getBlocks()) {
3463 for (auto &I : *BB) {
3464 // Don't unroll vectorized loops.
3465 if (I.getType()->isVectorTy())
3466 return;
3467
3468 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
3469 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
3470 if (!isLoweredToCall(F))
3471 continue;
3472 }
3473 return;
3474 }
3475 }
3476 }
3477
3478 // Enable runtime unrolling for in-order models.
3479 // If -mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so
3480 // by checking for that case we can ensure that the default behaviour is
3481 // unchanged.
3482 if (ST->getProcFamily() != AArch64Subtarget::Others &&
3483 !ST->getSchedModel().isOutOfOrder()) {
3484 UP.Runtime = true;
3485 UP.Partial = true;
3486 UP.UnrollRemainder = true;
3487 UP.DefaultUnrollRuntimeCount = 4;
3488
3489 UP.UnrollAndJam = true;
3490 UP.UnrollAndJamInnerLoopThreshold = 60;
3491 }
3492}
3493
3494void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
3495 TTI::PeelingPreferences &PP) {
3496 BaseT::getPeelingPreferences(L, SE, PP);
3497}
3498
3499Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
3500 Type *ExpectedType) {
3501 switch (Inst->getIntrinsicID()) {
3502 default:
3503 return nullptr;
3504 case Intrinsic::aarch64_neon_st2:
3505 case Intrinsic::aarch64_neon_st3:
3506 case Intrinsic::aarch64_neon_st4: {
3507 // Create a struct type
3508 StructType *ST = dyn_cast<StructType>(Val: ExpectedType);
3509 if (!ST)
3510 return nullptr;
3511 unsigned NumElts = Inst->arg_size() - 1;
3512 if (ST->getNumElements() != NumElts)
3513 return nullptr;
3514 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3515 if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i))
3516 return nullptr;
3517 }
3518 Value *Res = PoisonValue::get(T: ExpectedType);
3519 IRBuilder<> Builder(Inst);
3520 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3521 Value *L = Inst->getArgOperand(i);
3522 Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i);
3523 }
3524 return Res;
3525 }
3526 case Intrinsic::aarch64_neon_ld2:
3527 case Intrinsic::aarch64_neon_ld3:
3528 case Intrinsic::aarch64_neon_ld4:
3529 if (Inst->getType() == ExpectedType)
3530 return Inst;
3531 return nullptr;
3532 }
3533}
3534
3535bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
3536 MemIntrinsicInfo &Info) {
3537 switch (Inst->getIntrinsicID()) {
3538 default:
3539 break;
3540 case Intrinsic::aarch64_neon_ld2:
3541 case Intrinsic::aarch64_neon_ld3:
3542 case Intrinsic::aarch64_neon_ld4:
3543 Info.ReadMem = true;
3544 Info.WriteMem = false;
3545 Info.PtrVal = Inst->getArgOperand(i: 0);
3546 break;
3547 case Intrinsic::aarch64_neon_st2:
3548 case Intrinsic::aarch64_neon_st3:
3549 case Intrinsic::aarch64_neon_st4:
3550 Info.ReadMem = false;
3551 Info.WriteMem = true;
3552 Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - 1);
3553 break;
3554 }
3555
3556 switch (Inst->getIntrinsicID()) {
3557 default:
3558 return false;
3559 case Intrinsic::aarch64_neon_ld2:
3560 case Intrinsic::aarch64_neon_st2:
3561 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
3562 break;
3563 case Intrinsic::aarch64_neon_ld3:
3564 case Intrinsic::aarch64_neon_st3:
3565 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
3566 break;
3567 case Intrinsic::aarch64_neon_ld4:
3568 case Intrinsic::aarch64_neon_st4:
3569 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
3570 break;
3571 }
3572 return true;
3573}
3574
3575 /// See if \p I should be considered for address type promotion. We check if
3576 /// \p I is a sext with the right type and used in memory accesses. If it is
3577 /// used in a "complex" getelementptr, we allow it to be promoted without
3578 /// finding other sext instructions that sign extended the same initial value.
3579 /// A getelementptr is considered "complex" if it has more than 2 operands.
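/// For example, a 'sext i32 %i to i64' whose only user is a three-operand
/// getelementptr is considered promotable without a common header.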
3580bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
3581 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3582 bool Considerable = false;
3583 AllowPromotionWithoutCommonHeader = false;
3584 if (!isa<SExtInst>(Val: &I))
3585 return false;
3586 Type *ConsideredSExtType =
3587 Type::getInt64Ty(C&: I.getParent()->getParent()->getContext());
3588 if (I.getType() != ConsideredSExtType)
3589 return false;
3590 // See if the sext is the one with the right type and used in at least one
3591 // GetElementPtrInst.
3592 for (const User *U : I.users()) {
3593 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) {
3594 Considerable = true;
3595 // A getelementptr is considered "complex" if it has more than 2
3596 // operands. We will promote a SExt used in such a complex GEP, as we
3597 // expect some of the computation to be merged if it is done on 64 bits.
3598 if (GEPInst->getNumOperands() > 2) {
3599 AllowPromotionWithoutCommonHeader = true;
3600 break;
3601 }
3602 }
3603 }
3604 return Considerable;
3605}
3606
3607bool AArch64TTIImpl::isLegalToVectorizeReduction(
3608 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3609 if (!VF.isScalable())
3610 return true;
3611
3612 Type *Ty = RdxDesc.getRecurrenceType();
3613 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
3614 return false;
3615
3616 switch (RdxDesc.getRecurrenceKind()) {
3617 case RecurKind::Add:
3618 case RecurKind::FAdd:
3619 case RecurKind::And:
3620 case RecurKind::Or:
3621 case RecurKind::Xor:
3622 case RecurKind::SMin:
3623 case RecurKind::SMax:
3624 case RecurKind::UMin:
3625 case RecurKind::UMax:
3626 case RecurKind::FMin:
3627 case RecurKind::FMax:
3628 case RecurKind::FMulAdd:
3629 case RecurKind::IAnyOf:
3630 case RecurKind::FAnyOf:
3631 return true;
3632 default:
3633 return false;
3634 }
3635}
3636
3637InstructionCost
3638AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
3639 FastMathFlags FMF,
3640 TTI::TargetCostKind CostKind) {
3641 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3642
3643 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3644 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3645
3646 InstructionCost LegalizationCost = 0;
3647 if (LT.first > 1) {
3648 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext());
3649 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3650 LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - 1);
3651 }
3652
3653 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3654}
3655
3656InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
3657 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3658 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
3659 InstructionCost LegalizationCost = 0;
3660 if (LT.first > 1) {
3661 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: ValTy->getContext());
3662 LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind);
3663 LegalizationCost *= LT.first - 1;
3664 }
3665
3666 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3667 assert(ISD && "Invalid opcode");
3668 // Add the final reduction cost for the legal horizontal reduction
3669 switch (ISD) {
3670 case ISD::ADD:
3671 case ISD::AND:
3672 case ISD::OR:
3673 case ISD::XOR:
3674 case ISD::FADD:
3675 return LegalizationCost + 2;
3676 default:
3677 return InstructionCost::getInvalid();
3678 }
3679}
3680
3681InstructionCost
3682AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3683 std::optional<FastMathFlags> FMF,
3684 TTI::TargetCostKind CostKind) {
3685 if (TTI::requiresOrderedReduction(FMF)) {
3686 if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) {
3687 InstructionCost BaseCost =
3688 BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
3689 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3690 // end up vectorizing for more computationally intensive loops.
3691 return BaseCost + FixedVTy->getNumElements();
3692 }
3693
3694 if (Opcode != Instruction::FAdd)
3695 return InstructionCost::getInvalid();
3696
3697 auto *VTy = cast<ScalableVectorType>(Val: ValTy);
3698 InstructionCost Cost =
3699 getArithmeticInstrCost(Opcode, Ty: VTy->getScalarType(), CostKind);
3700 Cost *= getMaxNumElements(VF: VTy->getElementCount());
3701 return Cost;
3702 }
3703
3704 if (isa<ScalableVectorType>(Val: ValTy))
3705 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3706
3707 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
3708 MVT MTy = LT.second;
3709 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3710 assert(ISD && "Invalid opcode");
3711
3712 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3713 // instructions as twice a normal vector add, plus 1 for each legalization
3714 // step (LT.first). This is the only arithmetic vector reduction operation for
3715 // which we have an instruction.
3716 // OR, XOR and AND costs should match the codegen from:
3717 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3718 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3719 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
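// For example, a v8i32 add reduction legalizes to two v4i32 parts
// (LT.first == 2), so with the table entry below it is costed as
// (2 - 1) + 2 == 3.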
3720 static const CostTblEntry CostTblNoPairwise[]{
3721 {ISD::ADD, MVT::v8i8, 2},
3722 {ISD::ADD, MVT::v16i8, 2},
3723 {ISD::ADD, MVT::v4i16, 2},
3724 {ISD::ADD, MVT::v8i16, 2},
3725 {ISD::ADD, MVT::v4i32, 2},
3726 {ISD::ADD, MVT::v2i64, 2},
3727 {ISD::OR, MVT::v8i8, 15},
3728 {ISD::OR, MVT::v16i8, 17},
3729 {ISD::OR, MVT::v4i16, 7},
3730 {ISD::OR, MVT::v8i16, 9},
3731 {ISD::OR, MVT::v2i32, 3},
3732 {ISD::OR, MVT::v4i32, 5},
3733 {ISD::OR, MVT::v2i64, 3},
3734 {ISD::XOR, MVT::v8i8, 15},
3735 {ISD::XOR, MVT::v16i8, 17},
3736 {ISD::XOR, MVT::v4i16, 7},
3737 {ISD::XOR, MVT::v8i16, 9},
3738 {ISD::XOR, MVT::v2i32, 3},
3739 {ISD::XOR, MVT::v4i32, 5},
3740 {ISD::XOR, MVT::v2i64, 3},
3741 {ISD::AND, MVT::v8i8, 15},
3742 {ISD::AND, MVT::v16i8, 17},
3743 {ISD::AND, MVT::v4i16, 7},
3744 {ISD::AND, MVT::v8i16, 9},
3745 {ISD::AND, MVT::v2i32, 3},
3746 {ISD::AND, MVT::v4i32, 5},
3747 {ISD::AND, MVT::v2i64, 3},
3748 };
3749 switch (ISD) {
3750 default:
3751 break;
3752 case ISD::ADD:
3753 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3754 return (LT.first - 1) + Entry->Cost;
3755 break;
3756 case ISD::XOR:
3757 case ISD::AND:
3758 case ISD::OR:
3759 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3760 if (!Entry)
3761 break;
3762 auto *ValVTy = cast<FixedVectorType>(Val: ValTy);
3763 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3764 isPowerOf2_32(Value: ValVTy->getNumElements())) {
3765 InstructionCost ExtraCost = 0;
3766 if (LT.first != 1) {
3767 // Type needs to be split, so there is an extra cost of LT.first - 1
3768 // arithmetic ops.
3769 auto *Ty = FixedVectorType::get(ElementType: ValTy->getElementType(),
3770 NumElts: MTy.getVectorNumElements());
3771 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3772 ExtraCost *= LT.first - 1;
3773 }
3774 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
3775 auto Cost = ValVTy->getElementType()->isIntegerTy(Bitwidth: 1) ? 2 : Entry->Cost;
3776 return Cost + ExtraCost;
3777 }
3778 break;
3779 }
3780 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
3781}
3782
3783InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3784 static const CostTblEntry ShuffleTbl[] = {
3785 { TTI::SK_Splice, MVT::nxv16i8, 1 },
3786 { TTI::SK_Splice, MVT::nxv8i16, 1 },
3787 { TTI::SK_Splice, MVT::nxv4i32, 1 },
3788 { TTI::SK_Splice, MVT::nxv2i64, 1 },
3789 { TTI::SK_Splice, MVT::nxv2f16, 1 },
3790 { TTI::SK_Splice, MVT::nxv4f16, 1 },
3791 { TTI::SK_Splice, MVT::nxv8f16, 1 },
3792 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3793 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3794 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3795 { TTI::SK_Splice, MVT::nxv2f32, 1 },
3796 { TTI::SK_Splice, MVT::nxv4f32, 1 },
3797 { TTI::SK_Splice, MVT::nxv2f64, 1 },
3798 };
3799
3800 // The code-generator is currently not able to handle scalable vectors
3801 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3802 // it. This change will be removed when code-generation for these types is
3803 // sufficiently reliable.
3804 if (Tp->getElementCount() == ElementCount::getScalable(MinVal: 1))
3805 return InstructionCost::getInvalid();
3806
3807 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
3808 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Tp->getContext());
3809 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3810 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3811 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3812 : LT.second;
3813 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Context&: Tp->getContext());
3814 InstructionCost LegalizationCost = 0;
3815 if (Index < 0) {
3816 LegalizationCost =
3817 getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: PromotedVTy, CondTy: PromotedVTy,
3818 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind) +
3819 getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: PromotedVTy, CondTy: LegalVTy,
3820 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
3821 }
3822
3823 // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
3824 // The cost is computed on the promoted type.
3825 if (LT.second.getScalarType() == MVT::i1) {
3826 LegalizationCost +=
3827 getCastInstrCost(Opcode: Instruction::ZExt, Dst: PromotedVTy, Src: LegalVTy,
3828 CCH: TTI::CastContextHint::None, CostKind) +
3829 getCastInstrCost(Opcode: Instruction::Trunc, Dst: LegalVTy, Src: PromotedVTy,
3830 CCH: TTI::CastContextHint::None, CostKind);
3831 }
3832 const auto *Entry =
3833 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3834 assert(Entry && "Illegal Type for Splice");
3835 LegalizationCost += Entry->Cost;
3836 return LegalizationCost * LT.first;
3837}
3838
3839InstructionCost AArch64TTIImpl::getShuffleCost(
3840 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
3841 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
3842 ArrayRef<const Value *> Args, const Instruction *CxtI) {
3843 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
3844
3845 // If we have a Mask and the legalized type has fewer elements than the Mask,
3846 // split the Mask into smaller vectors and sum the cost of each sub-shuffle.
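// For instance, assuming NEON legalization, a <16 x i32> shuffle legalizes to
// four v4i32 parts, so its mask is processed as four 4-element sub-masks,
// each costed on its own.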
3847 if (!Mask.empty() && isa<FixedVectorType>(Val: Tp) && LT.second.isVector() &&
3848 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3849 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
3850
3851 // Check for LD3/LD4 instructions, which are represented in llvm IR as
3852 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
3853 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
3854 // cost than just the load.
3855 if (Args.size() >= 1 && isa<LoadInst>(Val: Args[0]) &&
3856 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 3) ||
3857 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 4)))
3858 return std::max<InstructionCost>(a: 1, b: LT.first / 4);
3859
3860 // Check for ST3/ST4 instructions, which are represented in llvm IR as
3861 // store(interleaving-shuffle). The shuffle cost could potentially be free,
3862 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
3863 // cost than just the store.
3864 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) &&
3865 (ShuffleVectorInst::isInterleaveMask(
3866 Mask, Factor: 4, NumInputElts: Tp->getElementCount().getKnownMinValue() * 2) ||
3867 ShuffleVectorInst::isInterleaveMask(
3868 Mask, Factor: 3, NumInputElts: Tp->getElementCount().getKnownMinValue() * 2)))
3869 return LT.first;
3870
3871 unsigned TpNumElts = Mask.size();
3872 unsigned LTNumElts = LT.second.getVectorNumElements();
3873 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3874 VectorType *NTp =
3875 VectorType::get(ElementType: Tp->getScalarType(), EC: LT.second.getVectorElementCount());
3876 InstructionCost Cost;
3877 for (unsigned N = 0; N < NumVecs; N++) {
3878 SmallVector<int> NMask;
3879 // Split the existing mask into chunks of size LTNumElts. Track the source
3880 // sub-vectors to ensure the result has at most 2 inputs.
3881 unsigned Source1, Source2;
3882 unsigned NumSources = 0;
3883 for (unsigned E = 0; E < LTNumElts; E++) {
3884 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3885 : PoisonMaskElem;
3886 if (MaskElt < 0) {
3887 NMask.push_back(Elt: PoisonMaskElem);
3888 continue;
3889 }
3890
3891 // Calculate which source from the input this comes from and whether it
3892 // is new to us.
3893 unsigned Source = MaskElt / LTNumElts;
3894 if (NumSources == 0) {
3895 Source1 = Source;
3896 NumSources = 1;
3897 } else if (NumSources == 1 && Source != Source1) {
3898 Source2 = Source;
3899 NumSources = 2;
3900 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3901 NumSources++;
3902 }
3903
3904 // Add to the new mask. For the NumSources>2 case these are not correct,
3905 // but are only used for the modular lane number.
3906 if (Source == Source1)
3907 NMask.push_back(Elt: MaskElt % LTNumElts);
3908 else if (Source == Source2)
3909 NMask.push_back(Elt: MaskElt % LTNumElts + LTNumElts);
3910 else
3911 NMask.push_back(Elt: MaskElt % LTNumElts);
3912 }
3913 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3914 // getShuffleCost. If not then cost it using the worst case.
3915 if (NumSources <= 2)
3916 Cost += getShuffleCost(Kind: NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3917 : TTI::SK_PermuteTwoSrc,
3918 Tp: NTp, Mask: NMask, CostKind, Index: 0, SubTp: nullptr, Args, CxtI);
3919 else if (any_of(Range: enumerate(First&: NMask), P: [&](const auto &ME) {
3920 return ME.value() % LTNumElts == ME.index();
3921 }))
3922 Cost += LTNumElts - 1;
3923 else
3924 Cost += LTNumElts;
3925 }
3926 return Cost;
3927 }
3928
3929 Kind = improveShuffleKindFromMask(Kind, Mask, Ty: Tp, Index, SubTy&: SubTp);
3930 // Treat extractsubvector as single op permutation.
3931 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
3932 if (IsExtractSubvector && LT.second.isFixedLengthVector())
3933 Kind = TTI::SK_PermuteSingleSrc;
3934
3935 // Check for broadcast loads, which are supported by the LD1R instruction.
3936 // In terms of code-size, the shuffle vector is free when a load + dup get
3937 // folded into a LD1R. That's what we check and return here. For performance
3938 // and reciprocal throughput, a LD1R is not completely free. In this case, we
3939 // return the cost for the broadcast below (i.e. 1 for most/all types), so
3940 // that we model the load + dup sequence slightly higher because LD1R is a
3941 // high latency instruction.
3942 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
3943 bool IsLoad = !Args.empty() && isa<LoadInst>(Val: Args[0]);
3944 if (IsLoad && LT.second.isVector() &&
3945 isLegalBroadcastLoad(ElementTy: Tp->getElementType(),
3946 NumElements: LT.second.getVectorElementCount()))
3947 return 0;
3948 }
3949
3950 // If we have 4 elements for the shuffle and a Mask, get the cost straight
3951 // from the perfect shuffle tables.
3952 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(MinVal: 4) &&
3953 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3954 all_of(Range&: Mask, P: [](int E) { return E < 8; }))
3955 return getPerfectShuffleCost(M: Mask);
3956
3957 // Check for identity masks, which we can treat as free.
3958 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
3959 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
3960 all_of(Range: enumerate(First&: Mask), P: [](const auto &M) {
3961 return M.value() < 0 || M.value() == (int)M.index();
3962 }))
3963 return 0;
3964
3965 // Check for other shuffles that do not map to an SK_ kind but for which we
3966 // have native instructions, for example ZIP and UZP.
3967 unsigned Unused;
3968 if (LT.second.isFixedLengthVector() &&
3969 LT.second.getVectorNumElements() == Mask.size() &&
3970 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
3971 (isZIPMask(M: Mask, VT: LT.second, WhichResultOut&: Unused) ||
3972 isUZPMask(M: Mask, VT: LT.second, WhichResultOut&: Unused) ||
3973 // Check for non-zero lane splats
3974 all_of(Range: drop_begin(RangeOrContainer&: Mask),
3975 P: [&Mask](int M) { return M < 0 || M == Mask[0]; })))
3976 return 1;
3977
3978 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3979 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3980 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3981 static const CostTblEntry ShuffleTbl[] = {
3982 // Broadcast shuffle kinds can be performed with 'dup'.
3983 {TTI::SK_Broadcast, MVT::v8i8, 1},
3984 {TTI::SK_Broadcast, MVT::v16i8, 1},
3985 {TTI::SK_Broadcast, MVT::v4i16, 1},
3986 {TTI::SK_Broadcast, MVT::v8i16, 1},
3987 {TTI::SK_Broadcast, MVT::v2i32, 1},
3988 {TTI::SK_Broadcast, MVT::v4i32, 1},
3989 {TTI::SK_Broadcast, MVT::v2i64, 1},
3990 {TTI::SK_Broadcast, MVT::v4f16, 1},
3991 {TTI::SK_Broadcast, MVT::v8f16, 1},
3992 {TTI::SK_Broadcast, MVT::v2f32, 1},
3993 {TTI::SK_Broadcast, MVT::v4f32, 1},
3994 {TTI::SK_Broadcast, MVT::v2f64, 1},
3995 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3996 // 'zip1/zip2' instructions.
3997 {TTI::SK_Transpose, MVT::v8i8, 1},
3998 {TTI::SK_Transpose, MVT::v16i8, 1},
3999 {TTI::SK_Transpose, MVT::v4i16, 1},
4000 {TTI::SK_Transpose, MVT::v8i16, 1},
4001 {TTI::SK_Transpose, MVT::v2i32, 1},
4002 {TTI::SK_Transpose, MVT::v4i32, 1},
4003 {TTI::SK_Transpose, MVT::v2i64, 1},
4004 {TTI::SK_Transpose, MVT::v4f16, 1},
4005 {TTI::SK_Transpose, MVT::v8f16, 1},
4006 {TTI::SK_Transpose, MVT::v2f32, 1},
4007 {TTI::SK_Transpose, MVT::v4f32, 1},
4008 {TTI::SK_Transpose, MVT::v2f64, 1},
4009 // Select shuffle kinds.
4010 // TODO: handle vXi8/vXi16.
4011 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4012 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
        {TTI::SK_Select, MVT::v2i64, 1}, // mov.
        {TTI::SK_Select, MVT::v2f32, 1}, // mov.
        {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
        {TTI::SK_Select, MVT::v2f64, 1}, // mov.
        // PermuteSingleSrc shuffle kinds.
        {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
        {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
        // Reverse can be lowered with `rev`.
        {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
        {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
        {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
        {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
        // Splice can all be lowered as `ext`.
        {TTI::SK_Splice, MVT::v2i32, 1},
        {TTI::SK_Splice, MVT::v4i32, 1},
        {TTI::SK_Splice, MVT::v2i64, 1},
        {TTI::SK_Splice, MVT::v2f32, 1},
        {TTI::SK_Splice, MVT::v4f32, 1},
        {TTI::SK_Splice, MVT::v2f64, 1},
        {TTI::SK_Splice, MVT::v8f16, 1},
        {TTI::SK_Splice, MVT::v8bf16, 1},
        {TTI::SK_Splice, MVT::v8i16, 1},
        {TTI::SK_Splice, MVT::v16i8, 1},
        {TTI::SK_Splice, MVT::v4bf16, 1},
        {TTI::SK_Splice, MVT::v4f16, 1},
        {TTI::SK_Splice, MVT::v4i16, 1},
        {TTI::SK_Splice, MVT::v8i8, 1},
        // Broadcast shuffle kinds for scalable vectors
        {TTI::SK_Broadcast, MVT::nxv16i8, 1},
        {TTI::SK_Broadcast, MVT::nxv8i16, 1},
        {TTI::SK_Broadcast, MVT::nxv4i32, 1},
        {TTI::SK_Broadcast, MVT::nxv2i64, 1},
        {TTI::SK_Broadcast, MVT::nxv2f16, 1},
        {TTI::SK_Broadcast, MVT::nxv4f16, 1},
        {TTI::SK_Broadcast, MVT::nxv8f16, 1},
        {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv2f32, 1},
        {TTI::SK_Broadcast, MVT::nxv4f32, 1},
        {TTI::SK_Broadcast, MVT::nxv2f64, 1},
        {TTI::SK_Broadcast, MVT::nxv16i1, 1},
        {TTI::SK_Broadcast, MVT::nxv8i1, 1},
        {TTI::SK_Broadcast, MVT::nxv4i1, 1},
        {TTI::SK_Broadcast, MVT::nxv2i1, 1},
        // Handle the cases for vector.reverse with scalable vectors
        {TTI::SK_Reverse, MVT::nxv16i8, 1},
        {TTI::SK_Reverse, MVT::nxv8i16, 1},
        {TTI::SK_Reverse, MVT::nxv4i32, 1},
        {TTI::SK_Reverse, MVT::nxv2i64, 1},
        {TTI::SK_Reverse, MVT::nxv2f16, 1},
        {TTI::SK_Reverse, MVT::nxv4f16, 1},
        {TTI::SK_Reverse, MVT::nxv8f16, 1},
        {TTI::SK_Reverse, MVT::nxv2bf16, 1},
        {TTI::SK_Reverse, MVT::nxv4bf16, 1},
        {TTI::SK_Reverse, MVT::nxv8bf16, 1},
        {TTI::SK_Reverse, MVT::nxv2f32, 1},
        {TTI::SK_Reverse, MVT::nxv4f32, 1},
        {TTI::SK_Reverse, MVT::nxv2f64, 1},
        {TTI::SK_Reverse, MVT::nxv16i1, 1},
        {TTI::SK_Reverse, MVT::nxv8i1, 1},
        {TTI::SK_Reverse, MVT::nxv4i1, 1},
        {TTI::SK_Reverse, MVT::nxv2i1, 1},
    };
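    // For example (illustrative only): an IR reverse shuffle such as
    //   %r = shufflevector <4 x i32> %v, <4 x i32> poison,
    //                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
    // is classified as TTI::SK_Reverse on v4i32 and matches the
    // {SK_Reverse, v4i32, 2} entry above (roughly a REV64 plus an EXT).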
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }

  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
    return getSpliceCost(Tp, Index);

  // Inserting a subvector can often be done with either a D, S or H register
  // move, so long as the inserted vector is "aligned".
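  // For example (illustrative): inserting a v2f32 subvector into a v4f32 at
  // element index 0 or 2 passes the alignment check below and is costed at the
  // subvector's legalization cost (typically 1), while an insert at index 1 is
  // not considered "aligned" here.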
  if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
      LT.second.getSizeInBits() <= 128 && SubTp) {
    std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
    if (SubLT.second.isVector()) {
      int NumElts = LT.second.getVectorNumElements();
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }

  // Restore optimal kind.
  if (IsExtractSubvector)
    Kind = TTI::SK_ExtractSubvector;
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
                               CxtI);
}

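// Returns true if any load or store in \p TheLoop accesses memory through a
// pointer whose stride is negative (decreasing), e.g. a loop that walks an
// array from its last element down to its first.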
static bool containsDecreasingPointers(Loop *TheLoop,
                                       PredicatedScalarEvolution *PSE) {
  const auto &Strides = DenseMap<Value *, const SCEV *>();
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for addresses that are
    // consecutive and decreasing.
    for (Instruction &I : *BB) {
      if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
        Value *Ptr = getLoadStorePointerOperand(&I);
        Type *AccessTy = getLoadStoreType(&I);
        if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
                         /*ShouldCheckWrap=*/false)
                .value_or(0) < 0)
          return true;
      }
    }
  }
  return false;
}

bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
  if (!ST->hasSVE())
    return false;

  // We don't currently support vectorisation with interleaving for SVE - with
  // such loops we're better off not using tail-folding. This gives us a chance
  // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
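  // For example (illustrative): a loop that accesses both A[2*i] and A[2*i+1]
  // typically forms an interleave group, which NEON can lower with ld2/st2, so
  // tail-folding is skipped below for loops containing such groups.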
  if (TFI->IAI->hasGroups())
    return false;

  TailFoldingOpts Required = TailFoldingOpts::Disabled;
  if (TFI->LVL->getReductionVars().size())
    Required |= TailFoldingOpts::Reductions;
  if (TFI->LVL->getFixedOrderRecurrences().size())
    Required |= TailFoldingOpts::Recurrences;

  // We call this to discover whether any load/store pointers in the loop have
  // negative strides. This will require extra work to reverse the loop
  // predicate, which may be expensive.
  if (containsDecreasingPointers(TFI->LVL->getLoop(),
                                 TFI->LVL->getPredicatedScalarEvolution()))
    Required |= TailFoldingOpts::Reverse;
  if (Required == TailFoldingOpts::Disabled)
    Required |= TailFoldingOpts::Simple;

  if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
                                      Required))
    return false;

  // Don't tail-fold for tight loops where we would be better off interleaving
  // with an unpredicated loop.
  unsigned NumInsns = 0;
  for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
    NumInsns += BB->sizeWithoutDebug();
  }

  // We expect 4 of these to be an IV PHI, an IV add, an IV compare and a
  // branch.
  return NumInsns >= SVETailFoldInsnThreshold;
}

InstructionCost
AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                     int64_t BaseOffset, bool HasBaseReg,
                                     int64_t Scale, unsigned AddrSpace) const {
  // Scaling factors are not free at all.
  // Operands                     | Rt Latency
  // -------------------------------------------
  // Rt, [Xn, Xm]                 | 4
  // -------------------------------------------
  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
  // Rt, [Xn, Wm, <extend> #imm]  |
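  // For example (illustrative): an access such as
  //   ldr x0, [x1, x2, lsl #3]
  // uses a scaled register offset (Scale == 8 for an 8-byte access); when the
  // addressing mode is legal, the code below reports a cost of 1 for it, while
  // the unscaled forms [Xn] and [Xn, Xm] are reported as cost 0.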
  TargetLoweringBase::AddrMode AM;
  AM.BaseGV = BaseGV;
  AM.BaseOffs = BaseOffset;
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Scale;
  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // AM.Scale represents reg2 * scale, so charge a cost of 1 whenever it is
    // neither 0 nor 1, i.e. when a genuinely scaled register offset is used.
    return AM.Scale != 0 && AM.Scale != 1;
  return -1;
}

bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
  // For binary operators (e.g. or) we need to be more careful than for
  // selects; here we only transform them if they are already at a natural
  // break point in the code - the end of a block with an unconditional
  // terminator.
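  // For example (illustrative IR), the `or` below is immediately followed by
  // an unconditional branch, so it is treated like a select here:
  //   %c = or i1 %a, %b
  //   br label %exit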
  if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
      isa<BranchInst>(I->getNextNode()) &&
      cast<BranchInst>(I->getNextNode())->isUnconditional())
    return true;
  return BaseT::shouldTreatInstructionLikeSelect(I);
}