1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
14#include "AArch64CallingConvention.h"
15#include "AArch64ExpandImm.h"
16#include "AArch64MachineFunctionInfo.h"
17#include "AArch64PerfectShuffle.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
20#include "MCTargetDesc/AArch64AddressingModes.h"
21#include "Utils/AArch64BaseInfo.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
27#include "llvm/ADT/SmallVector.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
31#include "llvm/Analysis/LoopInfo.h"
32#include "llvm/Analysis/MemoryLocation.h"
33#include "llvm/Analysis/ObjCARCUtil.h"
34#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35#include "llvm/Analysis/TargetTransformInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/Analysis/VectorUtils.h"
38#include "llvm/CodeGen/Analysis.h"
39#include "llvm/CodeGen/CallingConvLower.h"
40#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
41#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
42#include "llvm/CodeGen/GlobalISel/Utils.h"
43#include "llvm/CodeGen/ISDOpcodes.h"
44#include "llvm/CodeGen/MachineBasicBlock.h"
45#include "llvm/CodeGen/MachineFrameInfo.h"
46#include "llvm/CodeGen/MachineFunction.h"
47#include "llvm/CodeGen/MachineInstr.h"
48#include "llvm/CodeGen/MachineInstrBuilder.h"
49#include "llvm/CodeGen/MachineMemOperand.h"
50#include "llvm/CodeGen/MachineRegisterInfo.h"
51#include "llvm/CodeGen/RuntimeLibcalls.h"
52#include "llvm/CodeGen/SelectionDAG.h"
53#include "llvm/CodeGen/SelectionDAGNodes.h"
54#include "llvm/CodeGen/TargetCallingConv.h"
55#include "llvm/CodeGen/TargetInstrInfo.h"
56#include "llvm/CodeGen/TargetOpcodes.h"
57#include "llvm/CodeGen/ValueTypes.h"
58#include "llvm/CodeGenTypes/MachineValueType.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
63#include "llvm/IR/DerivedTypes.h"
64#include "llvm/IR/Function.h"
65#include "llvm/IR/GetElementPtrTypeIterator.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
69#include "llvm/IR/Instructions.h"
70#include "llvm/IR/IntrinsicInst.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
74#include "llvm/IR/PatternMatch.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
78#include "llvm/MC/MCRegisterInfo.h"
79#include "llvm/Support/AtomicOrdering.h"
80#include "llvm/Support/Casting.h"
81#include "llvm/Support/CodeGen.h"
82#include "llvm/Support/CommandLine.h"
83#include "llvm/Support/Debug.h"
84#include "llvm/Support/ErrorHandling.h"
85#include "llvm/Support/InstructionCost.h"
86#include "llvm/Support/KnownBits.h"
87#include "llvm/Support/MathExtras.h"
88#include "llvm/Support/raw_ostream.h"
89#include "llvm/Target/TargetMachine.h"
90#include "llvm/Target/TargetOptions.h"
91#include "llvm/TargetParser/Triple.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
117cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(Val: false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(Val: true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in future when both implementations will be based off MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(Val: true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(Val: true));
141
142// All of the XOR, OR and CMP use ALU ports, and data dependency will become the
143// bottleneck after this transform on high end CPU. So this max leaf node
144// limitation is guard cmp+ccmp will be profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(Val: 16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
157
158ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; }
159
160ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; }
161
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
187static inline EVT getPackedSVEVectorVT(ElementCount EC) {
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
202static inline EVT getPromotedVTForPredicate(EVT VT) {
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
225 assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
228 VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
237 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
238 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
239 case AArch64ISD::REVH_MERGE_PASSTHRU:
240 case AArch64ISD::REVW_MERGE_PASSTHRU:
241 case AArch64ISD::REVD_MERGE_PASSTHRU:
242 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
243 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
244 case AArch64ISD::DUP_MERGE_PASSTHRU:
245 case AArch64ISD::ABS_MERGE_PASSTHRU:
246 case AArch64ISD::NEG_MERGE_PASSTHRU:
247 case AArch64ISD::FNEG_MERGE_PASSTHRU:
248 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
249 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
250 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
251 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
252 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
253 case AArch64ISD::FRINT_MERGE_PASSTHRU:
254 case AArch64ISD::FROUND_MERGE_PASSTHRU:
255 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
256 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
257 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
258 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
259 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
260 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
261 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
262 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
263 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
264 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
265 case AArch64ISD::FABS_MERGE_PASSTHRU:
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
271static bool isZeroingInactiveLanes(SDValue Op) {
272 switch (Op.getOpcode()) {
273 default:
274 return false;
275 // We guarantee i1 splat_vectors to zero the other lanes
276 case ISD::SPLAT_VECTOR:
277 case AArch64ISD::PTRUE:
278 case AArch64ISD::SETCC_MERGE_ZERO:
279 return true;
280 case ISD::INTRINSIC_WO_CHAIN:
281 switch (Op.getConstantOperandVal(i: 0)) {
282 default:
283 return false;
284 case Intrinsic::aarch64_sve_ptrue:
285 case Intrinsic::aarch64_sve_pnext:
286 case Intrinsic::aarch64_sve_cmpeq:
287 case Intrinsic::aarch64_sve_cmpne:
288 case Intrinsic::aarch64_sve_cmpge:
289 case Intrinsic::aarch64_sve_cmpgt:
290 case Intrinsic::aarch64_sve_cmphs:
291 case Intrinsic::aarch64_sve_cmphi:
292 case Intrinsic::aarch64_sve_cmpeq_wide:
293 case Intrinsic::aarch64_sve_cmpne_wide:
294 case Intrinsic::aarch64_sve_cmpge_wide:
295 case Intrinsic::aarch64_sve_cmpgt_wide:
296 case Intrinsic::aarch64_sve_cmplt_wide:
297 case Intrinsic::aarch64_sve_cmple_wide:
298 case Intrinsic::aarch64_sve_cmphs_wide:
299 case Intrinsic::aarch64_sve_cmphi_wide:
300 case Intrinsic::aarch64_sve_cmplo_wide:
301 case Intrinsic::aarch64_sve_cmpls_wide:
302 case Intrinsic::aarch64_sve_fcmpeq:
303 case Intrinsic::aarch64_sve_fcmpne:
304 case Intrinsic::aarch64_sve_fcmpge:
305 case Intrinsic::aarch64_sve_fcmpgt:
306 case Intrinsic::aarch64_sve_fcmpuo:
307 case Intrinsic::aarch64_sve_facgt:
308 case Intrinsic::aarch64_sve_facge:
309 case Intrinsic::aarch64_sve_whilege:
310 case Intrinsic::aarch64_sve_whilegt:
311 case Intrinsic::aarch64_sve_whilehi:
312 case Intrinsic::aarch64_sve_whilehs:
313 case Intrinsic::aarch64_sve_whilele:
314 case Intrinsic::aarch64_sve_whilelo:
315 case Intrinsic::aarch64_sve_whilels:
316 case Intrinsic::aarch64_sve_whilelt:
317 case Intrinsic::aarch64_sve_match:
318 case Intrinsic::aarch64_sve_nmatch:
319 case Intrinsic::aarch64_sve_whilege_x2:
320 case Intrinsic::aarch64_sve_whilegt_x2:
321 case Intrinsic::aarch64_sve_whilehi_x2:
322 case Intrinsic::aarch64_sve_whilehs_x2:
323 case Intrinsic::aarch64_sve_whilele_x2:
324 case Intrinsic::aarch64_sve_whilelo_x2:
325 case Intrinsic::aarch64_sve_whilels_x2:
326 case Intrinsic::aarch64_sve_whilelt_x2:
327 return true;
328 }
329 }
330}
331
332AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
333 const AArch64Subtarget &STI)
334 : TargetLowering(TM), Subtarget(&STI) {
335 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
336 // we have to make something up. Arbitrarily, choose ZeroOrOne.
337 setBooleanContents(ZeroOrOneBooleanContent);
338 // When comparing vectors the result sets the different elements in the
339 // vector to all-one or all-zero.
340 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
341
342 // Set up the register classes.
343 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
344 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
345
346 if (Subtarget->hasLS64()) {
347 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
348 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
349 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
350 }
351
352 if (Subtarget->hasFPARMv8()) {
353 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
354 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
355 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
356 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
357 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
358 }
359
360 if (Subtarget->hasNEON()) {
361 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
362 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
363 // Someone set us up the NEON.
364 addDRTypeForNEON(MVT::v2f32);
365 addDRTypeForNEON(MVT::v8i8);
366 addDRTypeForNEON(MVT::v4i16);
367 addDRTypeForNEON(MVT::v2i32);
368 addDRTypeForNEON(MVT::v1i64);
369 addDRTypeForNEON(MVT::v1f64);
370 addDRTypeForNEON(MVT::v4f16);
371 addDRTypeForNEON(MVT::v4bf16);
372
373 addQRTypeForNEON(MVT::v4f32);
374 addQRTypeForNEON(MVT::v2f64);
375 addQRTypeForNEON(MVT::v16i8);
376 addQRTypeForNEON(MVT::v8i16);
377 addQRTypeForNEON(MVT::v4i32);
378 addQRTypeForNEON(MVT::v2i64);
379 addQRTypeForNEON(MVT::v8f16);
380 addQRTypeForNEON(MVT::v8bf16);
381 }
382
383 if (Subtarget->hasSVEorSME()) {
384 // Add legal sve predicate types
385 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
386 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
387 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
388 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
389 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
390
391 // Add legal sve data types
392 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
393 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
394 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
395 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
396
397 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
400 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
401 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
403
404 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
407
408 if (Subtarget->useSVEForFixedLengthVectors()) {
409 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
410 if (useSVEForFixedLengthVectorVT(VT))
411 addRegisterClass(VT, &AArch64::ZPRRegClass);
412
413 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
414 if (useSVEForFixedLengthVectorVT(VT))
415 addRegisterClass(VT, &AArch64::ZPRRegClass);
416 }
417 }
418
419 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
420 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
421 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
422 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
423
424 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
426 }
427
428 // Compute derived properties from the register classes
429 computeRegisterProperties(Subtarget->getRegisterInfo());
430
431 // Provide all sorts of operation actions
432 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
433 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
434 setOperationAction(ISD::SETCC, MVT::i32, Custom);
435 setOperationAction(ISD::SETCC, MVT::i64, Custom);
436 setOperationAction(ISD::SETCC, MVT::bf16, Custom);
437 setOperationAction(ISD::SETCC, MVT::f16, Custom);
438 setOperationAction(ISD::SETCC, MVT::f32, Custom);
439 setOperationAction(ISD::SETCC, MVT::f64, Custom);
440 setOperationAction(ISD::STRICT_FSETCC, MVT::bf16, Custom);
441 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
442 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
443 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
444 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
445 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
446 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
447 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
448 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
449 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
450 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
451 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
452 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
453 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
454 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
455 setOperationAction(ISD::SELECT, MVT::i32, Custom);
456 setOperationAction(ISD::SELECT, MVT::i64, Custom);
457 setOperationAction(ISD::SELECT, MVT::f16, Custom);
458 setOperationAction(ISD::SELECT, MVT::bf16, Custom);
459 setOperationAction(ISD::SELECT, MVT::f32, Custom);
460 setOperationAction(ISD::SELECT, MVT::f64, Custom);
461 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
462 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
463 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
464 setOperationAction(ISD::SELECT_CC, MVT::bf16, Custom);
465 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
466 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
467 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
468 setOperationAction(ISD::JumpTable, MVT::i64, Custom);
469 setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);
470
471 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
472 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
473 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
474
475 setOperationAction(ISD::FREM, MVT::f32, Expand);
476 setOperationAction(ISD::FREM, MVT::f64, Expand);
477 setOperationAction(ISD::FREM, MVT::f80, Expand);
478
479 setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
480
481 // Custom lowering hooks are needed for XOR
482 // to fold it into CSINC/CSINV.
483 setOperationAction(ISD::XOR, MVT::i32, Custom);
484 setOperationAction(ISD::XOR, MVT::i64, Custom);
485
486 // Virtually no operation on f128 is legal, but LLVM can't expand them when
487 // there's a valid register class, so we need custom operations in most cases.
488 setOperationAction(ISD::FABS, MVT::f128, Expand);
489 setOperationAction(ISD::FADD, MVT::f128, LibCall);
490 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
491 setOperationAction(ISD::FCOS, MVT::f128, Expand);
492 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
493 setOperationAction(ISD::FMA, MVT::f128, Expand);
494 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
495 setOperationAction(ISD::FNEG, MVT::f128, Expand);
496 setOperationAction(ISD::FPOW, MVT::f128, Expand);
497 setOperationAction(ISD::FREM, MVT::f128, Expand);
498 setOperationAction(ISD::FRINT, MVT::f128, Expand);
499 setOperationAction(ISD::FSIN, MVT::f128, Expand);
500 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
501 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
502 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
503 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
504 setOperationAction(ISD::SETCC, MVT::f128, Custom);
505 setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
506 setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
507 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
508 setOperationAction(ISD::SELECT, MVT::f128, Custom);
509 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
510 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
511 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
512 // aren't handled.
513
514 // Lowering for many of the conversions is actually specified by the non-f128
515 // type. The LowerXXX function will be trivial when f128 isn't involved.
516 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
517 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
518 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
519 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
520 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
521 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
522 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
523 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
524 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
525 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
526 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
527 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
528 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
529 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
530 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
531 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
532 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
533 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
534 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
535 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
536 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
537 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
538 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
539 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
540 if (Subtarget->hasFPARMv8()) {
541 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
542 setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom);
543 }
544 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
545 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
546 if (Subtarget->hasFPARMv8()) {
547 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
548 setOperationAction(ISD::STRICT_FP_ROUND, MVT::bf16, Custom);
549 }
550 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
551 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
552
553 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
554 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
555 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
556 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
557
558 // Variable arguments.
559 setOperationAction(ISD::VASTART, MVT::Other, Custom);
560 setOperationAction(ISD::VAARG, MVT::Other, Custom);
561 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
562 setOperationAction(ISD::VAEND, MVT::Other, Expand);
563
564 // Variable-sized objects.
565 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
566 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
567
568 // Lowering Funnel Shifts to EXTR
569 setOperationAction(ISD::FSHR, MVT::i32, Custom);
570 setOperationAction(ISD::FSHR, MVT::i64, Custom);
571 setOperationAction(ISD::FSHL, MVT::i32, Custom);
572 setOperationAction(ISD::FSHL, MVT::i64, Custom);
573
574 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
575
576 // Constant pool entries
577 setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
578
579 // BlockAddress
580 setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
581
582 // AArch64 lacks both left-rotate and popcount instructions.
583 setOperationAction(ISD::ROTL, MVT::i32, Expand);
584 setOperationAction(ISD::ROTL, MVT::i64, Expand);
585 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
586 setOperationAction(ISD::ROTL, VT, Expand);
587 setOperationAction(ISD::ROTR, VT, Expand);
588 }
589
590 // AArch64 doesn't have i32 MULH{S|U}.
591 setOperationAction(ISD::MULHU, MVT::i32, Expand);
592 setOperationAction(ISD::MULHS, MVT::i32, Expand);
593
594 // AArch64 doesn't have {U|S}MUL_LOHI.
595 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
596 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
597 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
598 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
599
600 if (Subtarget->hasCSSC()) {
601 setOperationAction(ISD::CTPOP, MVT::i32, Legal);
602 setOperationAction(ISD::CTPOP, MVT::i64, Legal);
603 setOperationAction(ISD::CTPOP, MVT::i128, Expand);
604
605 setOperationAction(ISD::PARITY, MVT::i128, Expand);
606
607 setOperationAction(ISD::CTTZ, MVT::i32, Legal);
608 setOperationAction(ISD::CTTZ, MVT::i64, Legal);
609 setOperationAction(ISD::CTTZ, MVT::i128, Expand);
610
611 setOperationAction(ISD::ABS, MVT::i32, Legal);
612 setOperationAction(ISD::ABS, MVT::i64, Legal);
613
614 setOperationAction(ISD::SMAX, MVT::i32, Legal);
615 setOperationAction(ISD::SMAX, MVT::i64, Legal);
616 setOperationAction(ISD::UMAX, MVT::i32, Legal);
617 setOperationAction(ISD::UMAX, MVT::i64, Legal);
618
619 setOperationAction(ISD::SMIN, MVT::i32, Legal);
620 setOperationAction(ISD::SMIN, MVT::i64, Legal);
621 setOperationAction(ISD::UMIN, MVT::i32, Legal);
622 setOperationAction(ISD::UMIN, MVT::i64, Legal);
623 } else {
624 setOperationAction(ISD::CTPOP, MVT::i32, Custom);
625 setOperationAction(ISD::CTPOP, MVT::i64, Custom);
626 setOperationAction(ISD::CTPOP, MVT::i128, Custom);
627
628 setOperationAction(ISD::PARITY, MVT::i64, Custom);
629 setOperationAction(ISD::PARITY, MVT::i128, Custom);
630
631 setOperationAction(ISD::ABS, MVT::i32, Custom);
632 setOperationAction(ISD::ABS, MVT::i64, Custom);
633 }
634
635 setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
636 setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
637 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
638 setOperationAction(ISD::SDIVREM, VT, Expand);
639 setOperationAction(ISD::UDIVREM, VT, Expand);
640 }
641 setOperationAction(ISD::SREM, MVT::i32, Expand);
642 setOperationAction(ISD::SREM, MVT::i64, Expand);
643 setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
644 setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
645 setOperationAction(ISD::UREM, MVT::i32, Expand);
646 setOperationAction(ISD::UREM, MVT::i64, Expand);
647
648 // Custom lower Add/Sub/Mul with overflow.
649 setOperationAction(ISD::SADDO, MVT::i32, Custom);
650 setOperationAction(ISD::SADDO, MVT::i64, Custom);
651 setOperationAction(ISD::UADDO, MVT::i32, Custom);
652 setOperationAction(ISD::UADDO, MVT::i64, Custom);
653 setOperationAction(ISD::SSUBO, MVT::i32, Custom);
654 setOperationAction(ISD::SSUBO, MVT::i64, Custom);
655 setOperationAction(ISD::USUBO, MVT::i32, Custom);
656 setOperationAction(ISD::USUBO, MVT::i64, Custom);
657 setOperationAction(ISD::SMULO, MVT::i32, Custom);
658 setOperationAction(ISD::SMULO, MVT::i64, Custom);
659 setOperationAction(ISD::UMULO, MVT::i32, Custom);
660 setOperationAction(ISD::UMULO, MVT::i64, Custom);
661
662 setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom);
663 setOperationAction(ISD::UADDO_CARRY, MVT::i64, Custom);
664 setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom);
665 setOperationAction(ISD::USUBO_CARRY, MVT::i64, Custom);
666 setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
667 setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
668 setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
669 setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);
670
671 setOperationAction(ISD::FSIN, MVT::f32, Expand);
672 setOperationAction(ISD::FSIN, MVT::f64, Expand);
673 setOperationAction(ISD::FCOS, MVT::f32, Expand);
674 setOperationAction(ISD::FCOS, MVT::f64, Expand);
675 setOperationAction(ISD::FPOW, MVT::f32, Expand);
676 setOperationAction(ISD::FPOW, MVT::f64, Expand);
677 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
678 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
679 if (Subtarget->hasFullFP16()) {
680 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
681 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Custom);
682 } else {
683 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
684 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
685 }
686
687 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
688 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
689 ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
690 ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
691 ISD::STRICT_FREM,
692 ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
693 ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2,
694 ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
695 setOperationAction(Op, MVT::f16, Promote);
696 setOperationAction(Op, MVT::v4f16, Expand);
697 setOperationAction(Op, MVT::v8f16, Expand);
698 setOperationAction(Op, MVT::bf16, Promote);
699 setOperationAction(Op, MVT::v4bf16, Expand);
700 setOperationAction(Op, MVT::v8bf16, Expand);
701 }
702
703 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
704 for (auto Op : {
705 ISD::SETCC,
706 ISD::SELECT_CC,
707 ISD::BR_CC,
708 ISD::FADD,
709 ISD::FSUB,
710 ISD::FMUL,
711 ISD::FDIV,
712 ISD::FMA,
713 ISD::FCEIL,
714 ISD::FSQRT,
715 ISD::FFLOOR,
716 ISD::FNEARBYINT,
717 ISD::FRINT,
718 ISD::FROUND,
719 ISD::FROUNDEVEN,
720 ISD::FTRUNC,
721 ISD::FMINNUM,
722 ISD::FMAXNUM,
723 ISD::FMINIMUM,
724 ISD::FMAXIMUM,
725 ISD::STRICT_FADD,
726 ISD::STRICT_FSUB,
727 ISD::STRICT_FMUL,
728 ISD::STRICT_FDIV,
729 ISD::STRICT_FMA,
730 ISD::STRICT_FCEIL,
731 ISD::STRICT_FFLOOR,
732 ISD::STRICT_FSQRT,
733 ISD::STRICT_FRINT,
734 ISD::STRICT_FNEARBYINT,
735 ISD::STRICT_FROUND,
736 ISD::STRICT_FTRUNC,
737 ISD::STRICT_FROUNDEVEN,
738 ISD::STRICT_FMINNUM,
739 ISD::STRICT_FMAXNUM,
740 ISD::STRICT_FMINIMUM,
741 ISD::STRICT_FMAXIMUM,
742 })
743 setOperationAction(Op, VT: ScalarVT, Action: Promote);
744
745 for (auto Op : {ISD::FNEG, ISD::FABS})
746 setOperationAction(Op, VT: ScalarVT, Action: Legal);
747
748 // Round-to-integer need custom lowering for fp16, as Promote doesn't work
749 // because the result type is integer.
750 for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
751 ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
752 ISD::STRICT_LLRINT})
753 setOperationAction(Op, VT: ScalarVT, Action: Custom);
754
755 // promote v4f16 to v4f32 when that is known to be safe.
756 auto V4Narrow = MVT::getVectorVT(VT: ScalarVT, NumElements: 4);
757 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
758 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
759 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
760 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
761 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
762 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
763 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
764 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
765 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
766 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
767 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
768
769 setOperationAction(Op: ISD::FABS, VT: V4Narrow, Action: Legal);
770 setOperationAction(Op: ISD::FNEG, VT: V4Narrow, Action: Legal);
771 setOperationAction(Op: ISD::FMA, VT: V4Narrow, Action: Expand);
772 setOperationAction(Op: ISD::SETCC, VT: V4Narrow, Action: Custom);
773 setOperationAction(Op: ISD::BR_CC, VT: V4Narrow, Action: Expand);
774 setOperationAction(Op: ISD::SELECT, VT: V4Narrow, Action: Expand);
775 setOperationAction(Op: ISD::SELECT_CC, VT: V4Narrow, Action: Expand);
776 setOperationAction(Op: ISD::FCOPYSIGN, VT: V4Narrow, Action: Custom);
777 setOperationAction(Op: ISD::FSQRT, VT: V4Narrow, Action: Expand);
778
779 auto V8Narrow = MVT::getVectorVT(VT: ScalarVT, NumElements: 8);
780 setOperationAction(Op: ISD::FABS, VT: V8Narrow, Action: Legal);
781 setOperationAction(Op: ISD::FADD, VT: V8Narrow, Action: Legal);
782 setOperationAction(Op: ISD::FCEIL, VT: V8Narrow, Action: Legal);
783 setOperationAction(Op: ISD::FCOPYSIGN, VT: V8Narrow, Action: Custom);
784 setOperationAction(Op: ISD::FDIV, VT: V8Narrow, Action: Legal);
785 setOperationAction(Op: ISD::FFLOOR, VT: V8Narrow, Action: Legal);
786 setOperationAction(Op: ISD::FMA, VT: V8Narrow, Action: Expand);
787 setOperationAction(Op: ISD::FMUL, VT: V8Narrow, Action: Legal);
788 setOperationAction(Op: ISD::FNEARBYINT, VT: V8Narrow, Action: Legal);
789 setOperationAction(Op: ISD::FNEG, VT: V8Narrow, Action: Legal);
790 setOperationAction(Op: ISD::FROUND, VT: V8Narrow, Action: Legal);
791 setOperationAction(Op: ISD::FROUNDEVEN, VT: V8Narrow, Action: Legal);
792 setOperationAction(Op: ISD::FRINT, VT: V8Narrow, Action: Legal);
793 setOperationAction(Op: ISD::FSQRT, VT: V8Narrow, Action: Expand);
794 setOperationAction(Op: ISD::FSUB, VT: V8Narrow, Action: Legal);
795 setOperationAction(Op: ISD::FTRUNC, VT: V8Narrow, Action: Legal);
796 setOperationAction(Op: ISD::SETCC, VT: V8Narrow, Action: Expand);
797 setOperationAction(Op: ISD::BR_CC, VT: V8Narrow, Action: Expand);
798 setOperationAction(Op: ISD::SELECT, VT: V8Narrow, Action: Expand);
799 setOperationAction(Op: ISD::SELECT_CC, VT: V8Narrow, Action: Expand);
800 setOperationAction(Op: ISD::FP_EXTEND, VT: V8Narrow, Action: Expand);
801 };
802
803 if (!Subtarget->hasFullFP16()) {
804 LegalizeNarrowFP(MVT::f16);
805 }
806 LegalizeNarrowFP(MVT::bf16);
807 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Custom);
808 setOperationAction(ISD::FP_ROUND, MVT::v4bf16, Custom);
809
810 // AArch64 has implementations of a lot of rounding-like FP operations.
811 for (auto Op :
812 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
813 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
814 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
815 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
816 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
817 ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT,
818 ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
819 ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
820 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
821 ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) {
822 for (MVT Ty : {MVT::f32, MVT::f64})
823 setOperationAction(Op, Ty, Legal);
824 if (Subtarget->hasFullFP16())
825 setOperationAction(Op, MVT::f16, Legal);
826 }
827
828 // Basic strict FP operations are legal
829 for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
830 ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
831 for (MVT Ty : {MVT::f32, MVT::f64})
832 setOperationAction(Op, Ty, Legal);
833 if (Subtarget->hasFullFP16())
834 setOperationAction(Op, MVT::f16, Legal);
835 }
836
837 // Strict conversion to a larger type is legal
838 for (auto VT : {MVT::f32, MVT::f64})
839 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
840
841 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
842
843 setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
844 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
845
846 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
847 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
848 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
849 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
850 } else {
851 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
852 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
853 }
854 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
855 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
856
857 // Generate outline atomics library calls only if LSE was not specified for
858 // subtarget
859 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
860 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
861 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
862 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
863 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
864 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
865 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
866 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
867 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
868 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
869 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
870 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
871 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
872 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
873 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
874 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
875 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
876 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
877 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
878 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
879 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
880 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
881 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
882 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
883 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
884 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
885#define LCALLNAMES(A, B, N) \
886 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
887 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
888 setLibcallName(A##N##_REL, #B #N "_rel"); \
889 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
890#define LCALLNAME4(A, B) \
891 LCALLNAMES(A, B, 1) \
892 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
893#define LCALLNAME5(A, B) \
894 LCALLNAMES(A, B, 1) \
895 LCALLNAMES(A, B, 2) \
896 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
897 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
898 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
899 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
900 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
901 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
902 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
903#undef LCALLNAMES
904#undef LCALLNAME4
905#undef LCALLNAME5
906 }
907
908 if (Subtarget->hasLSE128()) {
909 // Custom lowering because i128 is not legal. Must be replaced by 2x64
910 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
911 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
912 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
913 setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
914 }
915
916 // 128-bit loads and stores can be done without expanding
917 setOperationAction(ISD::LOAD, MVT::i128, Custom);
918 setOperationAction(ISD::STORE, MVT::i128, Custom);
919
920 // Aligned 128-bit loads and stores are single-copy atomic according to the
921 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
922 if (Subtarget->hasLSE2()) {
923 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
924 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
925 }
926
927 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
928 // custom lowering, as there are no un-paired non-temporal stores and
929 // legalization will break up 256 bit inputs.
930 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
931 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
932 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
933 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
934 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
935 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
936 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
937 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
938
939 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
940 // custom lowering, as there are no un-paired non-temporal loads legalization
941 // will break up 256 bit inputs.
942 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
943 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
944 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
945 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
946 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
947 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
948 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
949 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
950
951 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
952 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
953
954 if (getLibcallName(Call: RTLIB::SINCOS_STRET_F32) != nullptr &&
955 getLibcallName(Call: RTLIB::SINCOS_STRET_F64) != nullptr) {
956 // Issue __sincos_stret if available.
957 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
958 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
959 } else {
960 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
961 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
962 }
963
964 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
965 // MSVCRT doesn't have powi; fall back to pow
966 setLibcallName(Call: RTLIB::POWI_F32, Name: nullptr);
967 setLibcallName(Call: RTLIB::POWI_F64, Name: nullptr);
968 }
969
970 // Make floating-point constants legal for the large code model, so they don't
971 // become loads from the constant pool.
972 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
973 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
974 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
975 }
976
977 // AArch64 does not have floating-point extending loads, i1 sign-extending
978 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
979 for (MVT VT : MVT::fp_valuetypes()) {
980 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
981 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
982 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
983 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
984 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
985 }
986 for (MVT VT : MVT::integer_valuetypes())
987 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
988
989 for (MVT WideVT : MVT::fp_valuetypes()) {
990 for (MVT NarrowVT : MVT::fp_valuetypes()) {
991 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
992 setTruncStoreAction(WideVT, NarrowVT, Expand);
993 }
994 }
995 }
996
997 if (Subtarget->hasFPARMv8()) {
998 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
999 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
1000 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
1001 }
1002
1003 // Indexed loads and stores are supported.
1004 for (unsigned im = (unsigned)ISD::PRE_INC;
1005 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1006 setIndexedLoadAction(im, MVT::i8, Legal);
1007 setIndexedLoadAction(im, MVT::i16, Legal);
1008 setIndexedLoadAction(im, MVT::i32, Legal);
1009 setIndexedLoadAction(im, MVT::i64, Legal);
1010 setIndexedLoadAction(im, MVT::f64, Legal);
1011 setIndexedLoadAction(im, MVT::f32, Legal);
1012 setIndexedLoadAction(im, MVT::f16, Legal);
1013 setIndexedLoadAction(im, MVT::bf16, Legal);
1014 setIndexedStoreAction(im, MVT::i8, Legal);
1015 setIndexedStoreAction(im, MVT::i16, Legal);
1016 setIndexedStoreAction(im, MVT::i32, Legal);
1017 setIndexedStoreAction(im, MVT::i64, Legal);
1018 setIndexedStoreAction(im, MVT::f64, Legal);
1019 setIndexedStoreAction(im, MVT::f32, Legal);
1020 setIndexedStoreAction(im, MVT::f16, Legal);
1021 setIndexedStoreAction(im, MVT::bf16, Legal);
1022 }
1023
1024 // Trap.
1025 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1026 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1027 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
1028
1029 // We combine OR nodes for bitfield operations.
1030 setTargetDAGCombine(ISD::OR);
1031 // Try to create BICs for vector ANDs.
1032 setTargetDAGCombine(ISD::AND);
1033
1034 // Vector add and sub nodes may conceal a high-half opportunity.
1035 // Also, try to fold ADD into CSINC/CSINV..
1036 setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
1037 ISD::UINT_TO_FP});
1038
1039 setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1040 ISD::FP_TO_UINT_SAT, ISD::FADD, ISD::FDIV});
1041
1042 // Try and combine setcc with csel
1043 setTargetDAGCombine(ISD::SETCC);
1044
1045 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1046
1047 setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
1048 ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
1049 ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
1050 ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
1051 setTargetDAGCombine(ISD::TRUNCATE);
1052 setTargetDAGCombine(ISD::LOAD);
1053
1054 setTargetDAGCombine(ISD::MSTORE);
1055
1056 setTargetDAGCombine(ISD::MUL);
1057
1058 setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});
1059
1060 setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
1061 ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
1062 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1063
1064 setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});
1065
1066 setTargetDAGCombine(ISD::FP_EXTEND);
1067
1068 setTargetDAGCombine(ISD::GlobalAddress);
1069
1070 setTargetDAGCombine(ISD::CTLZ);
1071
1072 setTargetDAGCombine(ISD::VECREDUCE_AND);
1073 setTargetDAGCombine(ISD::VECREDUCE_OR);
1074 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1075
1076 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1077
1078 // In case of strict alignment, avoid an excessive number of byte wide stores.
1079 MaxStoresPerMemsetOptSize = 8;
1080 MaxStoresPerMemset =
1081 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1082
1083 MaxGluedStoresPerMemcpy = 4;
1084 MaxStoresPerMemcpyOptSize = 4;
1085 MaxStoresPerMemcpy =
1086 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1087
1088 MaxStoresPerMemmoveOptSize = 4;
1089 MaxStoresPerMemmove = 4;
1090
1091 MaxLoadsPerMemcmpOptSize = 4;
1092 MaxLoadsPerMemcmp =
1093 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1094
1095 setStackPointerRegisterToSaveRestore(AArch64::SP);
1096
1097 setSchedulingPreference(Sched::Hybrid);
1098
1099 EnableExtLdPromotion = true;
1100
1101 // Set required alignment.
1102 setMinFunctionAlignment(Align(4));
1103 // Set preferred alignments.
1104
1105 // Don't align loops on Windows. The SEH unwind info generation needs to
1106 // know the exact length of functions before the alignments have been
1107 // expanded.
1108 if (!Subtarget->isTargetWindows())
1109 setPrefLoopAlignment(STI.getPrefLoopAlignment());
1110 setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
1111 setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
1112
1113 // Only change the limit for entries in a jump table if specified by
1114 // the sub target, but not at the command line.
1115 unsigned MaxJT = STI.getMaximumJumpTableSize();
1116 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1117 setMaximumJumpTableSize(MaxJT);
1118
1119 setHasExtractBitsInsn(true);
1120
1121 setMaxDivRemBitWidthSupported(128);
1122
1123 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1124
1125 if (Subtarget->hasNEON()) {
1126 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1127 // silliness like this:
1128 for (auto Op :
1129 {ISD::SELECT, ISD::SELECT_CC,
1130 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1131 ISD::FMUL, ISD::FDIV, ISD::FMA,
1132 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1133 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1134 ISD::FSIN, ISD::FCOS, ISD::FPOW,
1135 ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
1136 ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
1137 ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
1138 ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
1139 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
1140 ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
1141 ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
1142 ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
1143 ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
1144 ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
1145 ISD::STRICT_FMAXIMUM})
1146 setOperationAction(Op, MVT::v1f64, Expand);
1147
1148 for (auto Op :
1149 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
1150 ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
1151 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
1152 ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
1153 setOperationAction(Op, MVT::v1i64, Expand);
1154
1155 // AArch64 doesn't have a direct vector ->f32 conversion instructions for
1156 // elements smaller than i32, so promote the input to i32 first.
1157 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1158 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1159
1160 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1161 // Or, direct i32 -> f16 vector conversion. Set it so custom, so the
1162 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1163 for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1164 ISD::STRICT_UINT_TO_FP})
1165 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1166 setOperationAction(Op, VT, Custom);
1167
1168 if (Subtarget->hasFullFP16()) {
1169 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
1170 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
1171
1172 setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
1173 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
1174 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
1175 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1176 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
1177 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
1178 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
1179 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1180 } else {
1181 // when AArch64 doesn't have fullfp16 support, promote the input
1182 // to i32 first.
1183 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1184 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1185 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1186 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1187 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1188 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1189 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1190 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1191 }
1192
1193 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1194 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1195 setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
1196 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
1197 setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
1198 setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
1199 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1200 setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
1201 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1202 setOperationAction(ISD::UMAX, VT, Custom);
1203 setOperationAction(ISD::SMAX, VT, Custom);
1204 setOperationAction(ISD::UMIN, VT, Custom);
1205 setOperationAction(ISD::SMIN, VT, Custom);
1206 }
1207
1208 // Custom handling for some quad-vector types to detect MULL.
1209 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1210 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1211 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1212 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1213 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1214 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1215
1216 // Saturates
1217 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1218 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1219 setOperationAction(ISD::SADDSAT, VT, Legal);
1220 setOperationAction(ISD::UADDSAT, VT, Legal);
1221 setOperationAction(ISD::SSUBSAT, VT, Legal);
1222 setOperationAction(ISD::USUBSAT, VT, Legal);
1223 }
1224
1225 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1226 MVT::v4i32}) {
1227 setOperationAction(ISD::AVGFLOORS, VT, Legal);
1228 setOperationAction(ISD::AVGFLOORU, VT, Legal);
1229 setOperationAction(ISD::AVGCEILS, VT, Legal);
1230 setOperationAction(ISD::AVGCEILU, VT, Legal);
1231 setOperationAction(ISD::ABDS, VT, Legal);
1232 setOperationAction(ISD::ABDU, VT, Legal);
1233 }
1234
1235 // Vector reductions
1236 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1237 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1238 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1239 setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
1240 setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
1241 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal);
1242 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal);
1243
1244 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1245 }
1246 }
1247 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1248 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1249 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1250 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1251 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1252 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1253 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1254 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1255 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1256 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1257 }
1258 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1259 setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
1260 setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
1261 setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
1262
1263 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
1264 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1265 // Likewise, narrowing and extending vector loads/stores aren't handled
1266 // directly.
1267 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1268 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
1269
1270 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1271 setOperationAction(ISD::MULHS, VT, Legal);
1272 setOperationAction(ISD::MULHU, VT, Legal);
1273 } else {
1274 setOperationAction(ISD::MULHS, VT, Expand);
1275 setOperationAction(ISD::MULHU, VT, Expand);
1276 }
1277 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1278 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1279
1280 setOperationAction(ISD::BSWAP, VT, Expand);
1281 setOperationAction(ISD::CTTZ, VT, Expand);
1282
1283 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1284 setTruncStoreAction(VT, InnerVT, Expand);
1285 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1286 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1287 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1288 }
1289 }
1290
1291 // AArch64 has implementations of a lot of rounding-like FP operations.
1292 for (auto Op :
1293 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1294 ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
1295 ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
1296 ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
1297 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1298 setOperationAction(Op, Ty, Legal);
1299 if (Subtarget->hasFullFP16())
1300 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1301 setOperationAction(Op, Ty, Legal);
1302 }
1303
1304 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1305
1306 setOperationAction(ISD::BITCAST, MVT::i2, Custom);
1307 setOperationAction(ISD::BITCAST, MVT::i4, Custom);
1308 setOperationAction(ISD::BITCAST, MVT::i8, Custom);
1309 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1310
1311 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
1312 setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
1313 setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
1314
1315 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1316 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1317 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1318 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1320 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1321
1322 // ADDP custom lowering
1323 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1324 setOperationAction(ISD::ADD, VT, Custom);
1325 // FADDP custom lowering
1326 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1327 setOperationAction(ISD::FADD, VT, Custom);
1328 }
1329
1330 if (Subtarget->hasSME()) {
1331 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1332 }
1333
1334 // FIXME: Move lowering for more nodes here if those are common between
1335 // SVE and SME.
1336 if (Subtarget->hasSVEorSME()) {
1337 for (auto VT :
1338 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1339 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1340 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1341 setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1342 setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1343 }
1344 }
1345
1346 if (Subtarget->hasSVEorSME()) {
1347 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1348 setOperationAction(ISD::BITREVERSE, VT, Custom);
1349 setOperationAction(ISD::BSWAP, VT, Custom);
1350 setOperationAction(ISD::CTLZ, VT, Custom);
1351 setOperationAction(ISD::CTPOP, VT, Custom);
1352 setOperationAction(ISD::CTTZ, VT, Custom);
1353 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1354 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1355 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1356 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1357 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1358 setOperationAction(ISD::MGATHER, VT, Custom);
1359 setOperationAction(ISD::MSCATTER, VT, Custom);
1360 setOperationAction(ISD::MLOAD, VT, Custom);
1361 setOperationAction(ISD::MUL, VT, Custom);
1362 setOperationAction(ISD::MULHS, VT, Custom);
1363 setOperationAction(ISD::MULHU, VT, Custom);
1364 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1365 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1366 setOperationAction(ISD::SELECT, VT, Custom);
1367 setOperationAction(ISD::SETCC, VT, Custom);
1368 setOperationAction(ISD::SDIV, VT, Custom);
1369 setOperationAction(ISD::UDIV, VT, Custom);
1370 setOperationAction(ISD::SMIN, VT, Custom);
1371 setOperationAction(ISD::UMIN, VT, Custom);
1372 setOperationAction(ISD::SMAX, VT, Custom);
1373 setOperationAction(ISD::UMAX, VT, Custom);
1374 setOperationAction(ISD::SHL, VT, Custom);
1375 setOperationAction(ISD::SRL, VT, Custom);
1376 setOperationAction(ISD::SRA, VT, Custom);
1377 setOperationAction(ISD::ABS, VT, Custom);
1378 setOperationAction(ISD::ABDS, VT, Custom);
1379 setOperationAction(ISD::ABDU, VT, Custom);
1380 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1381 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1382 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1383 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1384 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1385 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1386 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1387 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1388 setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1389 setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1390
1391 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1392 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1393 setOperationAction(ISD::SELECT_CC, VT, Expand);
1394 setOperationAction(ISD::ROTL, VT, Expand);
1395 setOperationAction(ISD::ROTR, VT, Expand);
1396
1397 setOperationAction(ISD::SADDSAT, VT, Legal);
1398 setOperationAction(ISD::UADDSAT, VT, Legal);
1399 setOperationAction(ISD::SSUBSAT, VT, Legal);
1400 setOperationAction(ISD::USUBSAT, VT, Legal);
1401 setOperationAction(ISD::UREM, VT, Expand);
1402 setOperationAction(ISD::SREM, VT, Expand);
1403 setOperationAction(ISD::SDIVREM, VT, Expand);
1404 setOperationAction(ISD::UDIVREM, VT, Expand);
1405
1406 setOperationAction(ISD::AVGFLOORS, VT, Custom);
1407 setOperationAction(ISD::AVGFLOORU, VT, Custom);
1408 setOperationAction(ISD::AVGCEILS, VT, Custom);
1409 setOperationAction(ISD::AVGCEILU, VT, Custom);
1410
1411 if (!Subtarget->isLittleEndian())
1412 setOperationAction(ISD::BITCAST, VT, Expand);
1413
1414 if (Subtarget->hasSVE2orSME())
1415 // For SLI/SRI.
1416 setOperationAction(ISD::OR, VT, Custom);
1417 }
1418
1419 // Illegal unpacked integer vector types.
1420 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1421 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1422 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1423 }
1424
1425 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1426 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1427 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1428 setOperationAction(ISD::BITCAST, VT, Custom);
1429
1430 for (auto VT :
1431 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1432 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1433 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
1434
1435 for (auto VT :
1436 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1437 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1438 setOperationAction(ISD::SELECT, VT, Custom);
1439 setOperationAction(ISD::SETCC, VT, Custom);
1440 setOperationAction(ISD::TRUNCATE, VT, Custom);
1441 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1442 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1443 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1444
1445 setOperationAction(ISD::SELECT_CC, VT, Expand);
1446 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1447 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1448
1449 // There are no legal MVT::nxv16f## based types.
1450 if (VT != MVT::nxv16i1) {
1451 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1452 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1453 }
1454 }
1455
1456 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1457 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1458 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1459 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1460 setOperationAction(ISD::MLOAD, VT, Custom);
1461 setOperationAction(ISD::MSTORE, VT, Custom);
1462 setOperationAction(ISD::MGATHER, VT, Custom);
1463 setOperationAction(ISD::MSCATTER, VT, Custom);
1464 }
1465
    // Firstly, exclude all scalable vector extending loads/truncating stores,
    // covering both integer and floating-point scalable vector types.
1468 for (MVT VT : MVT::scalable_vector_valuetypes()) {
1469 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1470 setTruncStoreAction(VT, InnerVT, Expand);
1471 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1472 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1473 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1474 }
1475 }
1476
1477 // Then, selectively enable those which we directly support.
1478 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1479 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1480 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1481 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1482 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1483 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1484 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1485 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1486 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1487 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1488 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1489 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1490 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1491 }
1492
    // SVE supports truncating stores of 64-bit and 128-bit vectors.
1494 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1495 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1496 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1497 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1498 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1499
1500 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1501 MVT::nxv4f32, MVT::nxv2f64}) {
1502 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1503 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1504 setOperationAction(ISD::MGATHER, VT, Custom);
1505 setOperationAction(ISD::MSCATTER, VT, Custom);
1506 setOperationAction(ISD::MLOAD, VT, Custom);
1507 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1508 setOperationAction(ISD::SELECT, VT, Custom);
1509 setOperationAction(ISD::SETCC, VT, Custom);
1510 setOperationAction(ISD::FADD, VT, Custom);
1511 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1512 setOperationAction(ISD::FDIV, VT, Custom);
1513 setOperationAction(ISD::FMA, VT, Custom);
1514 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1515 setOperationAction(ISD::FMAXNUM, VT, Custom);
1516 setOperationAction(ISD::FMINIMUM, VT, Custom);
1517 setOperationAction(ISD::FMINNUM, VT, Custom);
1518 setOperationAction(ISD::FMUL, VT, Custom);
1519 setOperationAction(ISD::FNEG, VT, Custom);
1520 setOperationAction(ISD::FSUB, VT, Custom);
1521 setOperationAction(ISD::FCEIL, VT, Custom);
1522 setOperationAction(ISD::FFLOOR, VT, Custom);
1523 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1524 setOperationAction(ISD::FRINT, VT, Custom);
1525 setOperationAction(ISD::FROUND, VT, Custom);
1526 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1527 setOperationAction(ISD::FTRUNC, VT, Custom);
1528 setOperationAction(ISD::FSQRT, VT, Custom);
1529 setOperationAction(ISD::FABS, VT, Custom);
1530 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1531 setOperationAction(ISD::FP_ROUND, VT, Custom);
1532 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1533 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1534 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1535 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1536 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1537 if (Subtarget->isSVEAvailable())
1538 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1539 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1540 setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1541 setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1542
1543 setOperationAction(ISD::SELECT_CC, VT, Expand);
1544 setOperationAction(ISD::FREM, VT, Expand);
1545 setOperationAction(ISD::FPOW, VT, Expand);
1546 setOperationAction(ISD::FPOWI, VT, Expand);
1547 setOperationAction(ISD::FCOS, VT, Expand);
1548 setOperationAction(ISD::FSIN, VT, Expand);
1549 setOperationAction(ISD::FSINCOS, VT, Expand);
1550 setOperationAction(ISD::FEXP, VT, Expand);
1551 setOperationAction(ISD::FEXP2, VT, Expand);
1552 setOperationAction(ISD::FEXP10, VT, Expand);
1553 setOperationAction(ISD::FLOG, VT, Expand);
1554 setOperationAction(ISD::FLOG2, VT, Expand);
1555 setOperationAction(ISD::FLOG10, VT, Expand);
1556
1557 setCondCodeAction(ISD::SETO, VT, Expand);
1558 setCondCodeAction(ISD::SETOLT, VT, Expand);
1559 setCondCodeAction(ISD::SETLT, VT, Expand);
1560 setCondCodeAction(ISD::SETOLE, VT, Expand);
1561 setCondCodeAction(ISD::SETLE, VT, Expand);
1562 setCondCodeAction(ISD::SETULT, VT, Expand);
1563 setCondCodeAction(ISD::SETULE, VT, Expand);
1564 setCondCodeAction(ISD::SETUGE, VT, Expand);
1565 setCondCodeAction(ISD::SETUGT, VT, Expand);
1566 setCondCodeAction(ISD::SETUEQ, VT, Expand);
1567 setCondCodeAction(ISD::SETONE, VT, Expand);
1568
1569 if (!Subtarget->isLittleEndian())
1570 setOperationAction(ISD::BITCAST, VT, Expand);
1571 }
1572
1573 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1574 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1575 setOperationAction(ISD::MGATHER, VT, Custom);
1576 setOperationAction(ISD::MSCATTER, VT, Custom);
1577 setOperationAction(ISD::MLOAD, VT, Custom);
1578 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1579 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1580
1581 if (!Subtarget->isLittleEndian())
1582 setOperationAction(ISD::BITCAST, VT, Expand);
1583 }
1584
1585 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
1586 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
1587
1588 // NEON doesn't support integer divides, but SVE does
1589 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1590 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1591 setOperationAction(ISD::SDIV, VT, Custom);
1592 setOperationAction(ISD::UDIV, VT, Custom);
1593 }
1594
1595 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1596 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1597 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1598
1599 if (Subtarget->isSVEAvailable()) {
1600 // NEON doesn't support across-vector reductions, but SVE does.
1601 for (auto VT :
1602 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1603 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1604 }
1605
1606 // NOTE: Currently this has to happen after computeRegisterProperties rather
1607 // than the preferred option of combining it with the addRegisterClass call.
1608 if (Subtarget->useSVEForFixedLengthVectors()) {
1609 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
1610 if (useSVEForFixedLengthVectorVT(
1611 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1612 addTypeForFixedLengthSVE(VT);
1613 }
1614 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
1615 if (useSVEForFixedLengthVectorVT(
1616 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1617 addTypeForFixedLengthSVE(VT);
1618 }
1619
      // 64-bit results can come from an input that is wider than NEON supports.
1621 for (auto VT : {MVT::v8i8, MVT::v4i16})
1622 setOperationAction(ISD::TRUNCATE, VT, Custom);
1623 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1624
      // 128-bit results imply an input wider than NEON supports.
1626 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1627 setOperationAction(ISD::TRUNCATE, VT, Custom);
1628 for (auto VT : {MVT::v8f16, MVT::v4f32})
1629 setOperationAction(ISD::FP_ROUND, VT, Custom);
1630
1631 // These operations are not supported on NEON but SVE can do them.
1632 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1633 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1634 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1635 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1636 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1637 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1638 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1639 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1640 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1641 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1642 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1643 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1644 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1645 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1646 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1647 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1648 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1649 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1650 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1651 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1652
1653 // Int operations with no NEON support.
1654 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1655 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1656 setOperationAction(ISD::BITREVERSE, VT, Custom);
1657 setOperationAction(ISD::CTTZ, VT, Custom);
1658 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1659 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1660 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1661 setOperationAction(ISD::MULHS, VT, Custom);
1662 setOperationAction(ISD::MULHU, VT, Custom);
1663 }
1664
1666 // Use SVE for vectors with more than 2 elements.
1667 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1668 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1669 }
1670
1671 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1672 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1673 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1674 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1675
1676 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1677 }
1678
1679 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1680 // Only required for llvm.aarch64.mops.memset.tag
1681 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
1682 }
1683
1684 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1685
1686 if (Subtarget->hasSVE()) {
1687 setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
1688 setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
1689 setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
1690 setOperationAction(ISD::FLDEXP, MVT::bf16, Custom);
1691 }
1692
1693 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1694
1695 IsStrictFPEnabled = true;
1696 setMaxAtomicSizeInBitsSupported(128);
1697
1698 if (Subtarget->isWindowsArm64EC()) {
1699 // FIXME: are there intrinsics we need to exclude from this?
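    // The loop below simply prefixes every known libcall name with '#'; as an
    // illustrative (not exhaustive) example, a libcall named "memcpy" would be
    // renamed to "#memcpy". The actual set of names comes from RTLIB.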
1700 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1701 auto code = static_cast<RTLIB::Libcall>(i);
      auto libcallName = getLibcallName(code);
      if ((libcallName != nullptr) && (libcallName[0] != '#')) {
        setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1705 }
1706 }
1707 }
1708}
1709
1710void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1711 assert(VT.isVector() && "VT should be a vector type");
1712
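  // Illustrative example of the promotion below: a v2f32 load/store is handled
  // as a v2i32 load/store, since changeVectorElementTypeToInteger() maps each
  // FP element type to the integer type of the same width.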
1713 if (VT.isFloatingPoint()) {
1714 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
    setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
    setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1717 }
1718
1719 // Mark vector float intrinsics as expand.
1720 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FEXP10, VT, Expand);
1730 }
1731
1732 // But we do support custom-lowering for FCOPYSIGN.
1733 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1734 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1735 VT == MVT::v8f16) &&
1736 Subtarget->hasFullFP16()))
    setOperationAction(ISD::FCOPYSIGN, VT, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  setOperationAction(ISD::SRA, VT, Custom);
  setOperationAction(ISD::SRL, VT, Custom);
  setOperationAction(ISD::SHL, VT, Custom);
  setOperationAction(ISD::OR, VT, Custom);
  setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);

  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
1755 for (MVT InnerVT : MVT::all_valuetypes())
1756 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1757
  // The CNT instruction supports only B (byte) element sizes; wider element
  // types are custom lowered using CNT followed by UADDLP to widen the result.
1759 if (VT != MVT::v8i8 && VT != MVT::v16i8)
    setOperationAction(ISD::CTPOP, VT, Custom);

  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);
1767
1768 for (unsigned Opcode :
1769 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1770 ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
    setOperationAction(Opcode, VT, Custom);
1772
1773 if (!VT.isFloatingPoint())
    setOperationAction(ISD::ABS, VT, Legal);
1775
1776 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1777 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1778 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
1780
1781 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1782 // NEON types.
1783 if (VT.isFloatingPoint() &&
1784 VT.getVectorElementType() != MVT::bf16 &&
1785 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1786 for (unsigned Opcode :
1787 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
1788 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
1789 ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
1790 ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
1791 ISD::STRICT_FSQRT})
      setOperationAction(Opcode, VT, Legal);
1793
1794 // Strict fp extend and trunc are legal
1795 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
    setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
1799
  // FIXME: We could potentially make use of the vector comparison instructions
  // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
  // complications:
1803 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1804 // so we would need to expand when the condition code doesn't match the
1805 // kind of comparison.
1806 // * Some kinds of comparison require more than one FCMXY instruction so
1807 // would need to be expanded instead.
1808 // * The lowering of the non-strict versions involves target-specific ISD
1809 // nodes so we would likely need to add strict versions of all of them and
1810 // handle them appropriately.
  setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
  setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
1813
1814 if (Subtarget->isLittleEndian()) {
1815 for (unsigned im = (unsigned)ISD::PRE_INC;
1816 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
1819 }
1820 }
1821
1822 if (Subtarget->hasD128()) {
1823 setOperationAction(ISD::READ_REGISTER, MVT::i128, Custom);
1824 setOperationAction(ISD::WRITE_REGISTER, MVT::i128, Custom);
1825 }
1826}
1827
1828bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1829 EVT OpVT) const {
1830 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
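  // (illustrative example: a get.active.lane.mask producing nxv4i1 from two
  // i64 operands passes the checks below and is therefore not expanded)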
1831 if (!Subtarget->hasSVE())
1832 return true;
1833
1834 // We can only support legal predicate result types. We can use the SVE
1835 // whilelo instruction for generating fixed-width predicates too.
1836 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1837 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1838 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1839 return true;
1840
1841 // The whilelo instruction only works with i32 or i64 scalar inputs.
1842 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1843 return true;
1844
1845 return false;
1846}
1847
1848bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
1849 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1850}
1851
1852void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1853 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1854
1855 // By default everything must be expanded.
1856 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
    setOperationAction(Op, VT, Expand);
1858
1859 if (VT.isFloatingPoint()) {
    setCondCodeAction(ISD::SETO, VT, Expand);
    setCondCodeAction(ISD::SETOLT, VT, Expand);
    setCondCodeAction(ISD::SETOLE, VT, Expand);
    setCondCodeAction(ISD::SETULT, VT, Expand);
    setCondCodeAction(ISD::SETULE, VT, Expand);
    setCondCodeAction(ISD::SETUGE, VT, Expand);
    setCondCodeAction(ISD::SETUGT, VT, Expand);
    setCondCodeAction(ISD::SETUEQ, VT, Expand);
    setCondCodeAction(ISD::SETONE, VT, Expand);
1869 }
1870
1871 TargetLoweringBase::LegalizeAction Default =
1872 VT == MVT::v1f64 ? Expand : Custom;
1873
1874 // Mark integer truncating stores/extending loads as having custom lowering
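  // (illustrative example: for VT == v4i32 the loop below walks InnerVT through
  // v4i8 and v4i16, marking the corresponding truncating stores and extending
  // loads with the action chosen above)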
1875 if (VT.isInteger()) {
1876 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1877 while (InnerVT != VT) {
      setTruncStoreAction(VT, InnerVT, Default);
      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
      InnerVT = InnerVT.changeVectorElementType(
          MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1884 }
1885 }
1886
1887 // Mark floating-point truncating stores/extending loads as having custom
1888 // lowering
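  // (illustrative example: for VT == v2f64 the loop below marks the v2f16 and
  // v2f32 cases before InnerVT reaches v2f64 and the loop terminates)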
1889 if (VT.isFloatingPoint()) {
1890 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1891 while (InnerVT != VT) {
      setTruncStoreAction(VT, InnerVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
      InnerVT = InnerVT.changeVectorElementType(
          MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
1896 }
1897 }
1898
1899 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
1900 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
1901
1902 // Lower fixed length vector operations to scalable equivalents.
  setOperationAction(ISD::ABS, VT, Default);
  setOperationAction(ISD::ADD, VT, Default);
  setOperationAction(ISD::AND, VT, Default);
  setOperationAction(ISD::ANY_EXTEND, VT, Default);
  setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
  setOperationAction(ISD::BITREVERSE, VT, Default);
  setOperationAction(ISD::BSWAP, VT, Default);
  setOperationAction(ISD::BUILD_VECTOR, VT, Default);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Default);
  setOperationAction(ISD::CTLZ, VT, Default);
  setOperationAction(ISD::CTPOP, VT, Default);
  setOperationAction(ISD::CTTZ, VT, Default);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Default);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Default);
  setOperationAction(ISD::FABS, VT, Default);
  setOperationAction(ISD::FADD, VT, Default);
  setOperationAction(ISD::FCEIL, VT, Default);
  setOperationAction(ISD::FCOPYSIGN, VT, Default);
  setOperationAction(ISD::FDIV, VT, Default);
  setOperationAction(ISD::FFLOOR, VT, Default);
  setOperationAction(ISD::FMA, VT, Default);
  setOperationAction(ISD::FMAXIMUM, VT, Default);
  setOperationAction(ISD::FMAXNUM, VT, Default);
  setOperationAction(ISD::FMINIMUM, VT, Default);
  setOperationAction(ISD::FMINNUM, VT, Default);
  setOperationAction(ISD::FMUL, VT, Default);
  setOperationAction(ISD::FNEARBYINT, VT, Default);
  setOperationAction(ISD::FNEG, VT, Default);
  setOperationAction(ISD::FP_EXTEND, VT, Default);
  setOperationAction(ISD::FP_ROUND, VT, Default);
  setOperationAction(ISD::FP_TO_SINT, VT, Default);
  setOperationAction(ISD::FP_TO_UINT, VT, Default);
  setOperationAction(ISD::FRINT, VT, Default);
  setOperationAction(ISD::FROUND, VT, Default);
  setOperationAction(ISD::FROUNDEVEN, VT, Default);
  setOperationAction(ISD::FSQRT, VT, Default);
  setOperationAction(ISD::FSUB, VT, Default);
  setOperationAction(ISD::FTRUNC, VT, Default);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Default);
  setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
  setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
  setOperationAction(ISD::MLOAD, VT, Default);
  setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
  setOperationAction(ISD::MSTORE, VT, Default);
  setOperationAction(ISD::MUL, VT, Default);
  setOperationAction(ISD::MULHS, VT, Default);
  setOperationAction(ISD::MULHU, VT, Default);
  setOperationAction(ISD::OR, VT, Default);
  setOperationAction(ISD::SCALAR_TO_VECTOR, VT, PreferNEON ? Legal : Expand);
  setOperationAction(ISD::SDIV, VT, Default);
  setOperationAction(ISD::SELECT, VT, Default);
  setOperationAction(ISD::SETCC, VT, Default);
  setOperationAction(ISD::SHL, VT, Default);
  setOperationAction(ISD::SIGN_EXTEND, VT, Default);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Default);
  setOperationAction(ISD::SINT_TO_FP, VT, Default);
  setOperationAction(ISD::SMAX, VT, Default);
  setOperationAction(ISD::SMIN, VT, Default);
  setOperationAction(ISD::SPLAT_VECTOR, VT, Default);
  setOperationAction(ISD::SRA, VT, Default);
  setOperationAction(ISD::SRL, VT, Default);
  setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
  setOperationAction(ISD::SUB, VT, Default);
  setOperationAction(ISD::TRUNCATE, VT, Default);
  setOperationAction(ISD::UDIV, VT, Default);
  setOperationAction(ISD::UINT_TO_FP, VT, Default);
  setOperationAction(ISD::UMAX, VT, Default);
  setOperationAction(ISD::UMIN, VT, Default);
  setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
  setOperationAction(ISD::VECREDUCE_AND, VT, Default);
  setOperationAction(ISD::VECREDUCE_FADD, VT, Default);
  setOperationAction(ISD::VECREDUCE_FMAX, VT, Default);
  setOperationAction(ISD::VECREDUCE_FMIN, VT, Default);
  setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Default);
  setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Default);
  setOperationAction(ISD::VECREDUCE_OR, VT, Default);
  setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, PreferSVE ? Default : Expand);
  setOperationAction(ISD::VECREDUCE_SMAX, VT, Default);
  setOperationAction(ISD::VECREDUCE_SMIN, VT, Default);
  setOperationAction(ISD::VECREDUCE_UMAX, VT, Default);
  setOperationAction(ISD::VECREDUCE_UMIN, VT, Default);
  setOperationAction(ISD::VECREDUCE_XOR, VT, Default);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Default);
  setOperationAction(ISD::VECTOR_SPLICE, VT, Default);
  setOperationAction(ISD::VSELECT, VT, Default);
  setOperationAction(ISD::XOR, VT, Default);
  setOperationAction(ISD::ZERO_EXTEND, VT, Default);
1990}
1991
1992void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1993 addRegisterClass(VT, &AArch64::FPR64RegClass);
1994 addTypeForNEON(VT);
1995}
1996
1997void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1998 addRegisterClass(VT, &AArch64::FPR128RegClass);
1999 addTypeForNEON(VT);
2000}
2001
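// A sketch of the mapping implemented below (examples are illustrative, not
// exhaustive): scalar types give an i32 result, a scalable vector such as
// nxv4f32 gives nxv4i1, and a fixed-length vector such as v4f32 gives v4i32
// via changeVectorElementTypeToInteger().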
2002EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
2003 LLVMContext &C, EVT VT) const {
2004 if (!VT.isVector())
2005 return MVT::i32;
2006 if (VT.isScalableVector())
2007 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2008 return VT.changeVectorElementTypeToInteger();
2009}
2010
2011// isIntImmediate - This method tests to see if the node is a constant
// operand. If so, Imm will receive the value.
2013static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
  if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2015 Imm = C->getZExtValue();
2016 return true;
2017 }
2018 return false;
2019}
2020
2021// isOpcWithIntImmediate - This method tests to see if the node is a specific
// opcode and that it has an immediate integer right operand.
// If so, Imm will receive the value.
2024static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2025 uint64_t &Imm) {
2026 return N->getOpcode() == Opc &&
         isIntImmediate(N->getOperand(1).getNode(), Imm);
2028}
2029
2030static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2031 const APInt &Demanded,
2032 TargetLowering::TargetLoweringOpt &TLO,
2033 unsigned NewOpc) {
2034 uint64_t OldImm = Imm, NewImm, Enc;
2035 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2036
2037 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2038 // bimm64.
2039 if (Imm == 0 || Imm == Mask ||
      AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
2041 return false;
2042
2043 unsigned EltSize = Size;
2044 uint64_t DemandedBits = Demanded.getZExtValue();
2045
2046 // Clear bits that are not demanded.
2047 Imm &= DemandedBits;
2048
2049 while (true) {
2050 // The goal here is to set the non-demanded bits in a way that minimizes
    // the number of transitions between 0 and 1. In order to achieve this goal,
2052 // we set the non-demanded bits to the value of the preceding demanded bits.
2053 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2054 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2055 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2056 // The final result is 0b11000011.
2057 uint64_t NonDemandedBits = ~DemandedBits;
2058 uint64_t InvertedImm = ~Imm & DemandedBits;
2059 uint64_t RotatedImm =
2060 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2061 NonDemandedBits;
2062 uint64_t Sum = RotatedImm + NonDemandedBits;
2063 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2064 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2065 NewImm = (Imm | Ones) & Mask;
2066
2067 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2068 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2069 // we halve the element size and continue the search.
    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2071 break;
2072
2073 // We cannot shrink the element size any further if it is 2-bits.
2074 if (EltSize == 2)
2075 return false;
2076
2077 EltSize /= 2;
2078 Mask >>= EltSize;
2079 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2080
2081 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2082 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2083 return false;
2084
2085 // Merge the upper and lower halves of Imm and DemandedBits.
2086 Imm |= Hi;
2087 DemandedBits |= DemandedBitsHi;
2088 }
2089
2090 ++NumOptimizedImms;
2091
2092 // Replicate the element across the register width.
2093 while (EltSize < Size) {
2094 NewImm |= NewImm << EltSize;
2095 EltSize *= 2;
2096 }
2097
2098 (void)OldImm;
2099 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2100 "demanded bits should never be altered");
2101 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2102
2103 // Create the new constant immediate node.
2104 EVT VT = Op.getValueType();
2105 SDLoc DL(Op);
2106 SDValue New;
2107
2108 // If the new constant immediate is all-zeros or all-ones, let the target
2109 // independent DAG combine optimize this node.
2110 if (NewImm == 0 || NewImm == OrigMask) {
    New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
                          TLO.DAG.getConstant(NewImm, DL, VT));
2113 // Otherwise, create a machine node so that target independent DAG combine
2114 // doesn't undo this optimization.
2115 } else {
    Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
    SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
    New = SDValue(
        TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2120 }
2121
  return TLO.CombineTo(Op, New);
2123}
2124
2125bool AArch64TargetLowering::targetShrinkDemandedConstant(
2126 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2127 TargetLoweringOpt &TLO) const {
2128 // Delay this optimization to as late as possible.
2129 if (!TLO.LegalOps)
2130 return false;
2131
2132 if (!EnableOptimizeLogicalImm)
2133 return false;
2134
2135 EVT VT = Op.getValueType();
2136 if (VT.isVector())
2137 return false;
2138
2139 unsigned Size = VT.getSizeInBits();
2140 assert((Size == 32 || Size == 64) &&
2141 "i32 or i64 is expected after legalization.");
2142
2143 // Exit early if we demand all bits.
2144 if (DemandedBits.popcount() == Size)
2145 return false;
2146
2147 unsigned NewOpc;
2148 switch (Op.getOpcode()) {
2149 default:
2150 return false;
2151 case ISD::AND:
2152 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2153 break;
2154 case ISD::OR:
2155 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2156 break;
2157 case ISD::XOR:
2158 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2159 break;
2160 }
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2162 if (!C)
2163 return false;
2164 uint64_t Imm = C->getZExtValue();
  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2166}
2167
2168/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in Known.
2170void AArch64TargetLowering::computeKnownBitsForTargetNode(
2171 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2172 const SelectionDAG &DAG, unsigned Depth) const {
2173 switch (Op.getOpcode()) {
2174 default:
2175 break;
2176 case AArch64ISD::DUP: {
2177 SDValue SrcOp = Op.getOperand(i: 0);
    Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2179 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2180 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2181 "Expected DUP implicit truncation");
      Known = Known.trunc(Op.getScalarValueSizeInBits());
2183 }
2184 break;
2185 }
2186 case AArch64ISD::CSEL: {
2187 KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = Known.intersectWith(Known2);
2191 break;
2192 }
2193 case AArch64ISD::BICi: {
2194 // Compute the bit cleared value.
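    // As an illustrative example, a BICi with immediate 0xff and shift 8
    // produces Mask == ~0xff00, i.e. bits [15:8] of the first operand become
    // known zero in the result.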
2195 uint64_t Mask =
        ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2199 break;
2200 }
2201 case AArch64ISD::VLSHR: {
2202 KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::lshr(Known, Known2);
2206 break;
2207 }
2208 case AArch64ISD::VASHR: {
2209 KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::ashr(Known, Known2);
2213 break;
2214 }
2215 case AArch64ISD::VSHL: {
2216 KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::shl(Known, Known2);
2220 break;
2221 }
2222 case AArch64ISD::MOVI: {
    Known = KnownBits::makeConstant(
        APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2225 break;
2226 }
2227 case AArch64ISD::LOADgot:
2228 case AArch64ISD::ADDlow: {
2229 if (!Subtarget->isTargetILP32())
2230 break;
2231 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
    Known.Zero = APInt::getHighBitsSet(64, 32);
2233 break;
2234 }
2235 case AArch64ISD::ASSERT_ZEXT_BOOL: {
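    // The 0xFE mask below encodes that a zero-extended boolean can only have
    // bit 0 set, so bits [7:1] are known zero in addition to whatever is
    // already known about the operand.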
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2237 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2238 break;
2239 }
2240 case ISD::INTRINSIC_W_CHAIN: {
2241 Intrinsic::ID IntID =
        static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2243 switch (IntID) {
2244 default: return;
2245 case Intrinsic::aarch64_ldaxr:
2246 case Intrinsic::aarch64_ldxr: {
2247 unsigned BitWidth = Known.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2251 return;
2252 }
2253 }
2254 break;
2255 }
2256 case ISD::INTRINSIC_WO_CHAIN:
2257 case ISD::INTRINSIC_VOID: {
    unsigned IntNo = Op.getConstantOperandVal(0);
2259 switch (IntNo) {
2260 default:
2261 break;
2262 case Intrinsic::aarch64_neon_uaddlv: {
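      // Rough bound used below: summing 8 (v8i8) or 16 (v16i8) byte-sized
      // values needs at most 8 + 3 = 11 or 8 + 4 = 12 bits, so the bits above
      // that bound in the result are known zero.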
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2264 unsigned BitWidth = Known.getBitWidth();
2265 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2266 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2267 assert(BitWidth >= Bound && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound);
2269 Known.Zero |= Mask;
2270 }
2271 break;
2272 }
2273 case Intrinsic::aarch64_neon_umaxv:
2274 case Intrinsic::aarch64_neon_uminv: {
2275 // Figure out the datatype of the vector operand. The UMINV instruction
2276 // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
2278 // this as those are legal types and will be handled by isel directly.
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2280 unsigned BitWidth = Known.getBitWidth();
2281 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2282 assert(BitWidth >= 8 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
2284 Known.Zero |= Mask;
2285 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2286 assert(BitWidth >= 16 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
2288 Known.Zero |= Mask;
2289 }
2290 break;
2291 } break;
2292 }
2293 }
2294 }
2295}
2296
2297unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
2298 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2299 unsigned Depth) const {
2300 EVT VT = Op.getValueType();
2301 unsigned VTBits = VT.getScalarSizeInBits();
2302 unsigned Opcode = Op.getOpcode();
2303 switch (Opcode) {
2304 case AArch64ISD::CMEQ:
2305 case AArch64ISD::CMGE:
2306 case AArch64ISD::CMGT:
2307 case AArch64ISD::CMHI:
2308 case AArch64ISD::CMHS:
2309 case AArch64ISD::FCMEQ:
2310 case AArch64ISD::FCMGE:
2311 case AArch64ISD::FCMGT:
2312 case AArch64ISD::CMEQz:
2313 case AArch64ISD::CMGEz:
2314 case AArch64ISD::CMGTz:
2315 case AArch64ISD::CMLEz:
2316 case AArch64ISD::CMLTz:
2317 case AArch64ISD::FCMEQz:
2318 case AArch64ISD::FCMGEz:
2319 case AArch64ISD::FCMGTz:
2320 case AArch64ISD::FCMLEz:
2321 case AArch64ISD::FCMLTz:
2322 // Compares return either 0 or all-ones
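    // (every result bit is a copy of the comparison outcome, so all VTBits
    // bits count as sign bits)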
2323 return VTBits;
2324 }
2325
2326 return 1;
2327}
2328
2329MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2330 EVT) const {
2331 return MVT::i64;
2332}
2333
2334bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2335 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2336 unsigned *Fast) const {
2337 if (Subtarget->requiresStrictAlign())
2338 return false;
2339
2340 if (Fast) {
2341 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2342 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2343 // See comments in performSTORECombine() for more details about
2344 // these conditions.
2345
2346 // Code that uses clang vector extensions can mark that it
2347 // wants unaligned accesses to be treated as fast by
2348 // underspecifying alignment to be 1 or 2.
2349 Alignment <= 2 ||
2350
2351 // Disregard v2i64. Memcpy lowering produces those and splitting
2352 // them regresses performance on micro-benchmarks and olden/bh.
2353 VT == MVT::v2i64;
2354 }
2355 return true;
2356}
2357
2358// Same as above but handling LLTs instead.
2359bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2360 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2361 unsigned *Fast) const {
2362 if (Subtarget->requiresStrictAlign())
2363 return false;
2364
2365 if (Fast) {
2366 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2367 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2368 Ty.getSizeInBytes() != 16 ||
2369 // See comments in performSTORECombine() for more details about
2370 // these conditions.
2371
2372 // Code that uses clang vector extensions can mark that it
2373 // wants unaligned accesses to be treated as fast by
2374 // underspecifying alignment to be 1 or 2.
2375 Alignment <= 2 ||
2376
2377 // Disregard v2i64. Memcpy lowering produces those and splitting
2378 // them regresses performance on micro-benchmarks and olden/bh.
2379 Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
2380 }
2381 return true;
2382}
2383
2384FastISel *
2385AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2386 const TargetLibraryInfo *libInfo) const {
2387 return AArch64::createFastISel(funcInfo, libInfo);
2388}
2389
2390const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2391#define MAKE_CASE(V) \
2392 case V: \
2393 return #V;
2394 switch ((AArch64ISD::NodeType)Opcode) {
2395 case AArch64ISD::FIRST_NUMBER:
2396 break;
2397 MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
2398 MAKE_CASE(AArch64ISD::SMSTART)
2399 MAKE_CASE(AArch64ISD::SMSTOP)
2400 MAKE_CASE(AArch64ISD::RESTORE_ZA)
2401 MAKE_CASE(AArch64ISD::RESTORE_ZT)
2402 MAKE_CASE(AArch64ISD::SAVE_ZT)
2403 MAKE_CASE(AArch64ISD::CALL)
2404 MAKE_CASE(AArch64ISD::ADRP)
2405 MAKE_CASE(AArch64ISD::ADR)
2406 MAKE_CASE(AArch64ISD::ADDlow)
2407 MAKE_CASE(AArch64ISD::LOADgot)
2408 MAKE_CASE(AArch64ISD::RET_GLUE)
2409 MAKE_CASE(AArch64ISD::BRCOND)
2410 MAKE_CASE(AArch64ISD::CSEL)
2411 MAKE_CASE(AArch64ISD::CSINV)
2412 MAKE_CASE(AArch64ISD::CSNEG)
2413 MAKE_CASE(AArch64ISD::CSINC)
2414 MAKE_CASE(AArch64ISD::THREAD_POINTER)
2415 MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
2416 MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
2417 MAKE_CASE(AArch64ISD::ABDS_PRED)
2418 MAKE_CASE(AArch64ISD::ABDU_PRED)
2419 MAKE_CASE(AArch64ISD::HADDS_PRED)
2420 MAKE_CASE(AArch64ISD::HADDU_PRED)
2421 MAKE_CASE(AArch64ISD::MUL_PRED)
2422 MAKE_CASE(AArch64ISD::MULHS_PRED)
2423 MAKE_CASE(AArch64ISD::MULHU_PRED)
2424 MAKE_CASE(AArch64ISD::RHADDS_PRED)
2425 MAKE_CASE(AArch64ISD::RHADDU_PRED)
2426 MAKE_CASE(AArch64ISD::SDIV_PRED)
2427 MAKE_CASE(AArch64ISD::SHL_PRED)
2428 MAKE_CASE(AArch64ISD::SMAX_PRED)
2429 MAKE_CASE(AArch64ISD::SMIN_PRED)
2430 MAKE_CASE(AArch64ISD::SRA_PRED)
2431 MAKE_CASE(AArch64ISD::SRL_PRED)
2432 MAKE_CASE(AArch64ISD::UDIV_PRED)
2433 MAKE_CASE(AArch64ISD::UMAX_PRED)
2434 MAKE_CASE(AArch64ISD::UMIN_PRED)
2435 MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
2436 MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
2437 MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
2438 MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
2439 MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
2440 MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
2441 MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
2442 MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
2443 MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
2444 MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
2445 MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
2446 MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
2447 MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
2448 MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
2449 MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
2450 MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
2451 MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
2452 MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
2453 MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
2454 MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
2455 MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
2456 MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
2457 MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
2458 MAKE_CASE(AArch64ISD::ADC)
2459 MAKE_CASE(AArch64ISD::SBC)
2460 MAKE_CASE(AArch64ISD::ADDS)
2461 MAKE_CASE(AArch64ISD::SUBS)
2462 MAKE_CASE(AArch64ISD::ADCS)
2463 MAKE_CASE(AArch64ISD::SBCS)
2464 MAKE_CASE(AArch64ISD::ANDS)
2465 MAKE_CASE(AArch64ISD::CCMP)
2466 MAKE_CASE(AArch64ISD::CCMN)
2467 MAKE_CASE(AArch64ISD::FCCMP)
2468 MAKE_CASE(AArch64ISD::FCMP)
2469 MAKE_CASE(AArch64ISD::STRICT_FCMP)
2470 MAKE_CASE(AArch64ISD::STRICT_FCMPE)
2471 MAKE_CASE(AArch64ISD::FCVTXN)
2472 MAKE_CASE(AArch64ISD::SME_ZA_LDR)
2473 MAKE_CASE(AArch64ISD::SME_ZA_STR)
2474 MAKE_CASE(AArch64ISD::DUP)
2475 MAKE_CASE(AArch64ISD::DUPLANE8)
2476 MAKE_CASE(AArch64ISD::DUPLANE16)
2477 MAKE_CASE(AArch64ISD::DUPLANE32)
2478 MAKE_CASE(AArch64ISD::DUPLANE64)
2479 MAKE_CASE(AArch64ISD::DUPLANE128)
2480 MAKE_CASE(AArch64ISD::MOVI)
2481 MAKE_CASE(AArch64ISD::MOVIshift)
2482 MAKE_CASE(AArch64ISD::MOVIedit)
2483 MAKE_CASE(AArch64ISD::MOVImsl)
2484 MAKE_CASE(AArch64ISD::FMOV)
2485 MAKE_CASE(AArch64ISD::MVNIshift)
2486 MAKE_CASE(AArch64ISD::MVNImsl)
2487 MAKE_CASE(AArch64ISD::BICi)
2488 MAKE_CASE(AArch64ISD::ORRi)
2489 MAKE_CASE(AArch64ISD::BSP)
2490 MAKE_CASE(AArch64ISD::ZIP1)
2491 MAKE_CASE(AArch64ISD::ZIP2)
2492 MAKE_CASE(AArch64ISD::UZP1)
2493 MAKE_CASE(AArch64ISD::UZP2)
2494 MAKE_CASE(AArch64ISD::TRN1)
2495 MAKE_CASE(AArch64ISD::TRN2)
2496 MAKE_CASE(AArch64ISD::REV16)
2497 MAKE_CASE(AArch64ISD::REV32)
2498 MAKE_CASE(AArch64ISD::REV64)
2499 MAKE_CASE(AArch64ISD::EXT)
2500 MAKE_CASE(AArch64ISD::SPLICE)
2501 MAKE_CASE(AArch64ISD::VSHL)
2502 MAKE_CASE(AArch64ISD::VLSHR)
2503 MAKE_CASE(AArch64ISD::VASHR)
2504 MAKE_CASE(AArch64ISD::VSLI)
2505 MAKE_CASE(AArch64ISD::VSRI)
2506 MAKE_CASE(AArch64ISD::CMEQ)
2507 MAKE_CASE(AArch64ISD::CMGE)
2508 MAKE_CASE(AArch64ISD::CMGT)
2509 MAKE_CASE(AArch64ISD::CMHI)
2510 MAKE_CASE(AArch64ISD::CMHS)
2511 MAKE_CASE(AArch64ISD::FCMEQ)
2512 MAKE_CASE(AArch64ISD::FCMGE)
2513 MAKE_CASE(AArch64ISD::FCMGT)
2514 MAKE_CASE(AArch64ISD::CMEQz)
2515 MAKE_CASE(AArch64ISD::CMGEz)
2516 MAKE_CASE(AArch64ISD::CMGTz)
2517 MAKE_CASE(AArch64ISD::CMLEz)
2518 MAKE_CASE(AArch64ISD::CMLTz)
2519 MAKE_CASE(AArch64ISD::FCMEQz)
2520 MAKE_CASE(AArch64ISD::FCMGEz)
2521 MAKE_CASE(AArch64ISD::FCMGTz)
2522 MAKE_CASE(AArch64ISD::FCMLEz)
2523 MAKE_CASE(AArch64ISD::FCMLTz)
2524 MAKE_CASE(AArch64ISD::SADDV)
2525 MAKE_CASE(AArch64ISD::UADDV)
2526 MAKE_CASE(AArch64ISD::UADDLV)
2527 MAKE_CASE(AArch64ISD::SADDLV)
2528 MAKE_CASE(AArch64ISD::SDOT)
2529 MAKE_CASE(AArch64ISD::UDOT)
2530 MAKE_CASE(AArch64ISD::SMINV)
2531 MAKE_CASE(AArch64ISD::UMINV)
2532 MAKE_CASE(AArch64ISD::SMAXV)
2533 MAKE_CASE(AArch64ISD::UMAXV)
2534 MAKE_CASE(AArch64ISD::SADDV_PRED)
2535 MAKE_CASE(AArch64ISD::UADDV_PRED)
2536 MAKE_CASE(AArch64ISD::SMAXV_PRED)
2537 MAKE_CASE(AArch64ISD::UMAXV_PRED)
2538 MAKE_CASE(AArch64ISD::SMINV_PRED)
2539 MAKE_CASE(AArch64ISD::UMINV_PRED)
2540 MAKE_CASE(AArch64ISD::ORV_PRED)
2541 MAKE_CASE(AArch64ISD::EORV_PRED)
2542 MAKE_CASE(AArch64ISD::ANDV_PRED)
2543 MAKE_CASE(AArch64ISD::CLASTA_N)
2544 MAKE_CASE(AArch64ISD::CLASTB_N)
2545 MAKE_CASE(AArch64ISD::LASTA)
2546 MAKE_CASE(AArch64ISD::LASTB)
2547 MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
2548 MAKE_CASE(AArch64ISD::LS64_BUILD)
2549 MAKE_CASE(AArch64ISD::LS64_EXTRACT)
2550 MAKE_CASE(AArch64ISD::TBL)
2551 MAKE_CASE(AArch64ISD::FADD_PRED)
2552 MAKE_CASE(AArch64ISD::FADDA_PRED)
2553 MAKE_CASE(AArch64ISD::FADDV_PRED)
2554 MAKE_CASE(AArch64ISD::FDIV_PRED)
2555 MAKE_CASE(AArch64ISD::FMA_PRED)
2556 MAKE_CASE(AArch64ISD::FMAX_PRED)
2557 MAKE_CASE(AArch64ISD::FMAXV_PRED)
2558 MAKE_CASE(AArch64ISD::FMAXNM_PRED)
2559 MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
2560 MAKE_CASE(AArch64ISD::FMIN_PRED)
2561 MAKE_CASE(AArch64ISD::FMINV_PRED)
2562 MAKE_CASE(AArch64ISD::FMINNM_PRED)
2563 MAKE_CASE(AArch64ISD::FMINNMV_PRED)
2564 MAKE_CASE(AArch64ISD::FMUL_PRED)
2565 MAKE_CASE(AArch64ISD::FSUB_PRED)
2566 MAKE_CASE(AArch64ISD::RDSVL)
2567 MAKE_CASE(AArch64ISD::BIC)
2568 MAKE_CASE(AArch64ISD::CBZ)
2569 MAKE_CASE(AArch64ISD::CBNZ)
2570 MAKE_CASE(AArch64ISD::TBZ)
2571 MAKE_CASE(AArch64ISD::TBNZ)
2572 MAKE_CASE(AArch64ISD::TC_RETURN)
2573 MAKE_CASE(AArch64ISD::PREFETCH)
2574 MAKE_CASE(AArch64ISD::SITOF)
2575 MAKE_CASE(AArch64ISD::UITOF)
2576 MAKE_CASE(AArch64ISD::NVCAST)
2577 MAKE_CASE(AArch64ISD::MRS)
2578 MAKE_CASE(AArch64ISD::SQSHL_I)
2579 MAKE_CASE(AArch64ISD::UQSHL_I)
2580 MAKE_CASE(AArch64ISD::SRSHR_I)
2581 MAKE_CASE(AArch64ISD::URSHR_I)
2582 MAKE_CASE(AArch64ISD::SQSHLU_I)
2583 MAKE_CASE(AArch64ISD::WrapperLarge)
2584 MAKE_CASE(AArch64ISD::LD2post)
2585 MAKE_CASE(AArch64ISD::LD3post)
2586 MAKE_CASE(AArch64ISD::LD4post)
2587 MAKE_CASE(AArch64ISD::ST2post)
2588 MAKE_CASE(AArch64ISD::ST3post)
2589 MAKE_CASE(AArch64ISD::ST4post)
2590 MAKE_CASE(AArch64ISD::LD1x2post)
2591 MAKE_CASE(AArch64ISD::LD1x3post)
2592 MAKE_CASE(AArch64ISD::LD1x4post)
2593 MAKE_CASE(AArch64ISD::ST1x2post)
2594 MAKE_CASE(AArch64ISD::ST1x3post)
2595 MAKE_CASE(AArch64ISD::ST1x4post)
2596 MAKE_CASE(AArch64ISD::LD1DUPpost)
2597 MAKE_CASE(AArch64ISD::LD2DUPpost)
2598 MAKE_CASE(AArch64ISD::LD3DUPpost)
2599 MAKE_CASE(AArch64ISD::LD4DUPpost)
2600 MAKE_CASE(AArch64ISD::LD1LANEpost)
2601 MAKE_CASE(AArch64ISD::LD2LANEpost)
2602 MAKE_CASE(AArch64ISD::LD3LANEpost)
2603 MAKE_CASE(AArch64ISD::LD4LANEpost)
2604 MAKE_CASE(AArch64ISD::ST2LANEpost)
2605 MAKE_CASE(AArch64ISD::ST3LANEpost)
2606 MAKE_CASE(AArch64ISD::ST4LANEpost)
2607 MAKE_CASE(AArch64ISD::SMULL)
2608 MAKE_CASE(AArch64ISD::UMULL)
2609 MAKE_CASE(AArch64ISD::PMULL)
2610 MAKE_CASE(AArch64ISD::FRECPE)
2611 MAKE_CASE(AArch64ISD::FRECPS)
2612 MAKE_CASE(AArch64ISD::FRSQRTE)
2613 MAKE_CASE(AArch64ISD::FRSQRTS)
2614 MAKE_CASE(AArch64ISD::STG)
2615 MAKE_CASE(AArch64ISD::STZG)
2616 MAKE_CASE(AArch64ISD::ST2G)
2617 MAKE_CASE(AArch64ISD::STZ2G)
2618 MAKE_CASE(AArch64ISD::SUNPKHI)
2619 MAKE_CASE(AArch64ISD::SUNPKLO)
2620 MAKE_CASE(AArch64ISD::UUNPKHI)
2621 MAKE_CASE(AArch64ISD::UUNPKLO)
2622 MAKE_CASE(AArch64ISD::INSR)
2623 MAKE_CASE(AArch64ISD::PTEST)
2624 MAKE_CASE(AArch64ISD::PTEST_ANY)
2625 MAKE_CASE(AArch64ISD::PTRUE)
2626 MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
2627 MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
2628 MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
2629 MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
2630 MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
2631 MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
2632 MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
2633 MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
2634 MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
2635 MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
2636 MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
2637 MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
2638 MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
2639 MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
2640 MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
2641 MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
2642 MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
2643 MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
2644 MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
2645 MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
2646 MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
2647 MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
2648 MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
2649 MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
2650 MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
2651 MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
2652 MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
2653 MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
2654 MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
2655 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
2656 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
2657 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
2658 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
2659 MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
2660 MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
2661 MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
2662 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
2663 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
2664 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
2665 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
2666 MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
2667 MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
2668 MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
2669 MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
2670 MAKE_CASE(AArch64ISD::SST1Q_PRED)
2671 MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
2672 MAKE_CASE(AArch64ISD::ST1_PRED)
2673 MAKE_CASE(AArch64ISD::SST1_PRED)
2674 MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
2675 MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
2676 MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
2677 MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
2678 MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
2679 MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
2680 MAKE_CASE(AArch64ISD::SSTNT1_PRED)
2681 MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
2682 MAKE_CASE(AArch64ISD::LDP)
2683 MAKE_CASE(AArch64ISD::LDIAPP)
2684 MAKE_CASE(AArch64ISD::LDNP)
2685 MAKE_CASE(AArch64ISD::STP)
2686 MAKE_CASE(AArch64ISD::STILP)
2687 MAKE_CASE(AArch64ISD::STNP)
2688 MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
2689 MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
2690 MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
2691 MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
2692 MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
2693 MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
2694 MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
2695 MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
2696 MAKE_CASE(AArch64ISD::INDEX_VECTOR)
2697 MAKE_CASE(AArch64ISD::ADDP)
2698 MAKE_CASE(AArch64ISD::SADDLP)
2699 MAKE_CASE(AArch64ISD::UADDLP)
2700 MAKE_CASE(AArch64ISD::CALL_RVMARKER)
2701 MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
2702 MAKE_CASE(AArch64ISD::MOPS_MEMSET)
2703 MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
2704 MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
2705 MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
2706 MAKE_CASE(AArch64ISD::CALL_BTI)
2707 MAKE_CASE(AArch64ISD::MRRS)
2708 MAKE_CASE(AArch64ISD::MSRR)
2709 MAKE_CASE(AArch64ISD::RSHRNB_I)
2710 MAKE_CASE(AArch64ISD::CTTZ_ELTS)
2711 MAKE_CASE(AArch64ISD::CALL_ARM64EC_TO_X64)
2712 MAKE_CASE(AArch64ISD::URSHR_I_PRED)
2713 }
2714#undef MAKE_CASE
2715 return nullptr;
2716}
2717
2718MachineBasicBlock *
2719AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2720 MachineBasicBlock *MBB) const {
2721 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2722 // phi node:
2723
2724 // OrigBB:
2725 // [... previous instrs leading to comparison ...]
2726 // b.ne TrueBB
2727 // b EndBB
2728 // TrueBB:
2729 // ; Fallthrough
2730 // EndBB:
2731 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2732
2733 MachineFunction *MF = MBB->getParent();
2734 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2735 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2736 DebugLoc DL = MI.getDebugLoc();
2737 MachineFunction::iterator It = ++MBB->getIterator();
2738
2739 Register DestReg = MI.getOperand(i: 0).getReg();
2740 Register IfTrueReg = MI.getOperand(i: 1).getReg();
2741 Register IfFalseReg = MI.getOperand(i: 2).getReg();
2742 unsigned CondCode = MI.getOperand(i: 3).getImm();
2743 bool NZCVKilled = MI.getOperand(i: 4).isKill();
2744
2745 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
2746 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
2747 MF->insert(MBBI: It, MBB: TrueBB);
2748 MF->insert(MBBI: It, MBB: EndBB);
2749
  // Transfer the rest of the current basic block to EndBB.
2751 EndBB->splice(Where: EndBB->begin(), Other: MBB, From: std::next(x: MachineBasicBlock::iterator(MI)),
2752 To: MBB->end());
2753 EndBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
2754
2755 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2756 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2757 MBB->addSuccessor(Succ: TrueBB);
2758 MBB->addSuccessor(Succ: EndBB);
2759
2760 // TrueBB falls through to the end.
2761 TrueBB->addSuccessor(Succ: EndBB);
2762
2763 if (!NZCVKilled) {
2764 TrueBB->addLiveIn(AArch64::NZCV);
2765 EndBB->addLiveIn(AArch64::NZCV);
2766 }
2767
2768 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2769 .addReg(IfTrueReg)
2770 .addMBB(TrueBB)
2771 .addReg(IfFalseReg)
2772 .addMBB(MBB);
2773
2774 MI.eraseFromParent();
2775 return EndBB;
2776}
2777
2778MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2779 MachineInstr &MI, MachineBasicBlock *BB) const {
2780 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2781 BB->getParent()->getFunction().getPersonalityFn())) &&
2782 "SEH does not use catchret!");
2783 return BB;
2784}
2785
2786MachineBasicBlock *
2787AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
2788 MachineBasicBlock *MBB) const {
2789 MachineFunction &MF = *MBB->getParent();
2790 MachineBasicBlock::iterator MBBI = MI.getIterator();
2791 DebugLoc DL = MBB->findDebugLoc(MBBI);
2792 const AArch64InstrInfo &TII =
2793 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2794 Register TargetReg = MI.getOperand(i: 0).getReg();
2795 MachineBasicBlock::iterator NextInst =
2796 TII.probedStackAlloc(MBBI, TargetReg, FrameSetup: false);
2797
2798 MI.eraseFromParent();
2799 return NextInst->getParent();
2800}
2801
2802MachineBasicBlock *
2803AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2804 MachineInstr &MI,
2805 MachineBasicBlock *BB) const {
2806 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2807 MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc));
2808
2809 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: 0).getImm(), flags: RegState::Define);
2810 MIB.add(MO: MI.getOperand(i: 1)); // slice index register
2811 MIB.add(MO: MI.getOperand(i: 2)); // slice index offset
2812 MIB.add(MO: MI.getOperand(i: 3)); // pg
2813 MIB.add(MO: MI.getOperand(i: 4)); // base
2814 MIB.add(MO: MI.getOperand(i: 5)); // offset
2815
2816 MI.eraseFromParent(); // The pseudo is gone now.
2817 return BB;
2818}
2819
2820MachineBasicBlock *
2821AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2822 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2823 MachineInstrBuilder MIB =
2824 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2825
2826 MIB.addReg(AArch64::ZA, RegState::Define);
2827 MIB.add(MO: MI.getOperand(i: 0)); // Vector select register
2828 MIB.add(MO: MI.getOperand(i: 1)); // Vector select offset
2829 MIB.add(MO: MI.getOperand(i: 2)); // Base
2830 MIB.add(MO: MI.getOperand(i: 1)); // Offset, same as vector select offset
2831
2832 MI.eraseFromParent(); // The pseudo is gone now.
2833 return BB;
2834}
2835
2836MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
2837 MachineBasicBlock *BB,
2838 unsigned Opcode,
2839 bool Op0IsDef) const {
2840 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2841 MachineInstrBuilder MIB;
2842
2843 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode))
2844 .addReg(RegNo: MI.getOperand(i: 0).getReg(), flags: Op0IsDef ? RegState::Define : 0);
2845 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2846 MIB.add(MO: MI.getOperand(i: I));
2847
2848 MI.eraseFromParent(); // The pseudo is gone now.
2849 return BB;
2850}
2851
2852MachineBasicBlock *
2853AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2854 MachineInstr &MI,
2855 MachineBasicBlock *BB, bool HasTile) const {
2856 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2857 MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc));
2858 unsigned StartIdx = 0;
2859
2860 if (HasTile) {
2861 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: 0).getImm(), flags: RegState::Define);
2862 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: 0).getImm());
2863 StartIdx = 1;
2864 } else
2865 MIB.addReg(RegNo: BaseReg, flags: RegState::Define).addReg(RegNo: BaseReg);
2866
2867 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2868 MIB.add(MO: MI.getOperand(i: I));
2869
2870 MI.eraseFromParent(); // The pseudo is gone now.
2871 return BB;
2872}
2873
2874MachineBasicBlock *
2875AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2876 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2877 MachineInstrBuilder MIB =
2878 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2879 MIB.add(MO: MI.getOperand(i: 0)); // Mask
2880
2881 unsigned Mask = MI.getOperand(i: 0).getImm();
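  // Each set bit in the mask selects one 64-bit tile; e.g. a mask of
  // 0b00000101 marks ZAD0 and ZAD2 as implicitly defined by the zero.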
2882 for (unsigned I = 0; I < 8; I++) {
2883 if (Mask & (1 << I))
2884 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2885 }
2886
2887 MI.eraseFromParent(); // The pseudo is gone now.
2888 return BB;
2889}
2890
2891MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2892 MachineInstr &MI, MachineBasicBlock *BB) const {
2893
2894 int SMEOrigInstr = AArch64::getSMEPseudoMap(Opcode: MI.getOpcode());
2895 if (SMEOrigInstr != -1) {
2896 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2897 uint64_t SMEMatrixType =
2898 TII->get(Opcode: MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2899 switch (SMEMatrixType) {
2900 case (AArch64::SMEMatrixArray):
2901 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2902 case (AArch64::SMEMatrixTileB):
2903 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2904 case (AArch64::SMEMatrixTileH):
2905 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2906 case (AArch64::SMEMatrixTileS):
2907 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2908 case (AArch64::SMEMatrixTileD):
2909 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2910 case (AArch64::SMEMatrixTileQ):
2911 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2912 }
2913 }
2914
2915 switch (MI.getOpcode()) {
2916 default:
2917#ifndef NDEBUG
2918 MI.dump();
2919#endif
2920 llvm_unreachable("Unexpected instruction for custom inserter!");
2921
2922 case AArch64::F128CSEL:
2923 return EmitF128CSEL(MI, MBB: BB);
2924 case TargetOpcode::STATEPOINT:
    // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
    // while the BL call instruction it is eventually lowered to has an
    // implicit def of LR. That def is early-clobber, since LR is written at
    // the moment of the call, before any use is read.
    // Add this implicit dead def here as a workaround.
2930 MI.addOperand(*MI.getMF(),
2931 MachineOperand::CreateReg(
2932 AArch64::LR, /*isDef*/ true,
2933 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2934 /*isUndef*/ false, /*isEarlyClobber*/ true));
2935 [[fallthrough]];
2936 case TargetOpcode::STACKMAP:
2937 case TargetOpcode::PATCHPOINT:
2938 return emitPatchPoint(MI, MBB: BB);
2939
2940 case TargetOpcode::PATCHABLE_EVENT_CALL:
2941 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2942 return BB;
2943
2944 case AArch64::CATCHRET:
2945 return EmitLoweredCatchRet(MI, BB);
2946
2947 case AArch64::PROBED_STACKALLOC_DYN:
2948 return EmitDynamicProbedAlloc(MI, MBB: BB);
2949
2950 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2951 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2952 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2953 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2954 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2955 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2956 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2957 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2958 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2959 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2960 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2961 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2962 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2963 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2964 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2965 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2966 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2967 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2968 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2969 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2970 case AArch64::LDR_ZA_PSEUDO:
2971 return EmitFill(MI, BB);
2972 case AArch64::LDR_TX_PSEUDO:
2973 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
2974 case AArch64::STR_TX_PSEUDO:
2975 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
2976 case AArch64::ZERO_M_PSEUDO:
2977 return EmitZero(MI, BB);
2978 case AArch64::ZERO_T_PSEUDO:
2979 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
2980 }
2981}
2982
2983//===----------------------------------------------------------------------===//
2984// AArch64 Lowering private implementation.
2985//===----------------------------------------------------------------------===//
2986
2987//===----------------------------------------------------------------------===//
2988// Lowering Code
2989//===----------------------------------------------------------------------===//
2990
2991// Forward declarations of SVE fixed length lowering helpers
2992static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
2993static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2994static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2995static SDValue convertFixedMaskToScalableVector(SDValue Mask,
2996 SelectionDAG &DAG);
2997static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT);
2998static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
2999 EVT VT);
3000
3001/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3002static bool isZerosVector(const SDNode *N) {
3003 // Look through a bit convert.
3004 while (N->getOpcode() == ISD::BITCAST)
3005 N = N->getOperand(Num: 0).getNode();
3006
3007 if (ISD::isConstantSplatVectorAllZeros(N))
3008 return true;
3009
3010 if (N->getOpcode() != AArch64ISD::DUP)
3011 return false;
3012
3013 auto Opnd0 = N->getOperand(Num: 0);
3014 return isNullConstant(V: Opnd0) || isNullFPConstant(V: Opnd0);
3015}
3016
3017/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3018/// CC
3019static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3020 switch (CC) {
3021 default:
3022 llvm_unreachable("Unknown condition code!");
3023 case ISD::SETNE:
3024 return AArch64CC::NE;
3025 case ISD::SETEQ:
3026 return AArch64CC::EQ;
3027 case ISD::SETGT:
3028 return AArch64CC::GT;
3029 case ISD::SETGE:
3030 return AArch64CC::GE;
3031 case ISD::SETLT:
3032 return AArch64CC::LT;
3033 case ISD::SETLE:
3034 return AArch64CC::LE;
3035 case ISD::SETUGT:
3036 return AArch64CC::HI;
3037 case ISD::SETUGE:
3038 return AArch64CC::HS;
3039 case ISD::SETULT:
3040 return AArch64CC::LO;
3041 case ISD::SETULE:
3042 return AArch64CC::LS;
3043 }
3044}
3045
3046/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3047static void changeFPCCToAArch64CC(ISD::CondCode CC,
3048 AArch64CC::CondCode &CondCode,
3049 AArch64CC::CondCode &CondCode2) {
3050 CondCode2 = AArch64CC::AL;
3051 switch (CC) {
3052 default:
3053 llvm_unreachable("Unknown FP condition!");
3054 case ISD::SETEQ:
3055 case ISD::SETOEQ:
3056 CondCode = AArch64CC::EQ;
3057 break;
3058 case ISD::SETGT:
3059 case ISD::SETOGT:
3060 CondCode = AArch64CC::GT;
3061 break;
3062 case ISD::SETGE:
3063 case ISD::SETOGE:
3064 CondCode = AArch64CC::GE;
3065 break;
3066 case ISD::SETOLT:
3067 CondCode = AArch64CC::MI;
3068 break;
3069 case ISD::SETOLE:
3070 CondCode = AArch64CC::LS;
3071 break;
3072 case ISD::SETONE:
3073 CondCode = AArch64CC::MI;
3074 CondCode2 = AArch64CC::GT;
3075 break;
3076 case ISD::SETO:
3077 CondCode = AArch64CC::VC;
3078 break;
3079 case ISD::SETUO:
3080 CondCode = AArch64CC::VS;
3081 break;
3082 case ISD::SETUEQ:
3083 CondCode = AArch64CC::EQ;
3084 CondCode2 = AArch64CC::VS;
3085 break;
3086 case ISD::SETUGT:
3087 CondCode = AArch64CC::HI;
3088 break;
3089 case ISD::SETUGE:
3090 CondCode = AArch64CC::PL;
3091 break;
3092 case ISD::SETLT:
3093 case ISD::SETULT:
3094 CondCode = AArch64CC::LT;
3095 break;
3096 case ISD::SETLE:
3097 case ISD::SETULE:
3098 CondCode = AArch64CC::LE;
3099 break;
3100 case ISD::SETNE:
3101 case ISD::SETUNE:
3102 CondCode = AArch64CC::NE;
3103 break;
3104 }
3105}
3106
3107/// Convert a DAG fp condition code to an AArch64 CC.
3108/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3109/// should be AND'ed instead of OR'ed.
3110static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3111 AArch64CC::CondCode &CondCode,
3112 AArch64CC::CondCode &CondCode2) {
3113 CondCode2 = AArch64CC::AL;
3114 switch (CC) {
3115 default:
3116 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3117 assert(CondCode2 == AArch64CC::AL);
3118 break;
3119 case ISD::SETONE:
3120 // (a one b)
3121 // == ((a olt b) || (a ogt b))
3122 // == ((a ord b) && (a une b))
3123 CondCode = AArch64CC::VC;
3124 CondCode2 = AArch64CC::NE;
3125 break;
3126 case ISD::SETUEQ:
3127 // (a ueq b)
3128 // == ((a uno b) || (a oeq b))
3129 // == ((a ule b) && (a uge b))
3130 CondCode = AArch64CC::PL;
3131 CondCode2 = AArch64CC::LE;
3132 break;
3133 }
3134}
3135
3136/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3137/// CC usable with the vector instructions. Fewer operations are available
3138/// without a real NZCV register, so we have to use less efficient combinations
3139/// to get the same effect.
3140static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3141 AArch64CC::CondCode &CondCode,
3142 AArch64CC::CondCode &CondCode2,
3143 bool &Invert) {
3144 Invert = false;
3145 switch (CC) {
3146 default:
3147 // Mostly the scalar mappings work fine.
3148 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3149 break;
3150 case ISD::SETUO:
3151 Invert = true;
3152 [[fallthrough]];
3153 case ISD::SETO:
3154 CondCode = AArch64CC::MI;
3155 CondCode2 = AArch64CC::GE;
3156 break;
3157 case ISD::SETUEQ:
3158 case ISD::SETULT:
3159 case ISD::SETULE:
3160 case ISD::SETUGT:
3161 case ISD::SETUGE:
3162 // All of the compare-mask comparisons are ordered, but we can switch
3163 // between the two by a double inversion. E.g. ULE == !OGT.
3164 Invert = true;
3165 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3166 CondCode, CondCode2);
3167 break;
3168 }
3169}
3170
3171static bool isLegalArithImmed(uint64_t C) {
3172 // Matches AArch64DAGToDAGISel::SelectArithImmed().
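  // Legal values are 12-bit immediates, optionally shifted left by 12 bits:
  // e.g. 4095 (0xfff) and 0xfff000 are accepted, while 0x1001 is rejected
  // because it needs more than 12 bits and is not a multiple of 0x1000.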
3173 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3174 LLVM_DEBUG(dbgs() << "Is imm " << C
3175 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3176 return IsLegal;
3177}
3178
// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3181// can be set differently by this operation. It comes down to whether
3182// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3183// everything is fine. If not then the optimization is wrong. Thus general
3184// comparisons are only valid if op2 != 0.
3185//
3186// So, finally, the only LLVM-native comparisons that don't mention C and V
3187// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3188// the absence of information about op2.
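//
// As an illustration of why op2 == 0 is problematic: "cmp x0, #0" computes
// x0 + ~0 + 1 and therefore always sets C, whereas "cmn x0, #0" computes
// x0 + 0 and never does, so any unsigned comparison relying on C would flip.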
3189static bool isCMN(SDValue Op, ISD::CondCode CC) {
3190 return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0)) &&
3191 (CC == ISD::SETEQ || CC == ISD::SETNE);
3192}
3193
3194static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3195 SelectionDAG &DAG, SDValue Chain,
3196 bool IsSignaling) {
3197 EVT VT = LHS.getValueType();
3198 assert(VT != MVT::f128);
3199
3200 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3201
3202 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3203 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3204 {Chain, LHS});
3205 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3206 {LHS.getValue(1), RHS});
3207 Chain = RHS.getValue(R: 1);
3208 VT = MVT::f32;
3209 }
3210 unsigned Opcode =
3211 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3212 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3213}
3214
3215static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3216 const SDLoc &dl, SelectionDAG &DAG) {
3217 EVT VT = LHS.getValueType();
3218 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3219
3220 if (VT.isFloatingPoint()) {
3221 assert(VT != MVT::f128);
3222 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3223 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3224 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3225 VT = MVT::f32;
3226 }
3227 return DAG.getNode(Opcode: AArch64ISD::FCMP, DL: dl, VT, N1: LHS, N2: RHS);
3228 }
3229
3230 // The CMP instruction is just an alias for SUBS, and representing it as
3231 // SUBS means that it's possible to get CSE with subtract operations.
3232 // A later phase can perform the optimization of setting the destination
3233 // register to WZR/XZR if it ends up being unused.
3234 unsigned Opcode = AArch64ISD::SUBS;
3235
3236 if (isCMN(Op: RHS, CC)) {
    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3238 Opcode = AArch64ISD::ADDS;
3239 RHS = RHS.getOperand(i: 1);
3240 } else if (isCMN(Op: LHS, CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3243 Opcode = AArch64ISD::ADDS;
3244 LHS = LHS.getOperand(i: 1);
3245 } else if (isNullConstant(V: RHS) && !isUnsignedIntSetCC(Code: CC)) {
3246 if (LHS.getOpcode() == ISD::AND) {
3247 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3248 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3249 // of the signed comparisons.
3250 const SDValue ANDSNode = DAG.getNode(Opcode: AArch64ISD::ANDS, DL: dl,
3251 VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC),
3252 N1: LHS.getOperand(i: 0),
3253 N2: LHS.getOperand(i: 1));
3254 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3255 DAG.ReplaceAllUsesWith(From: LHS, To: ANDSNode);
3256 return ANDSNode.getValue(R: 1);
3257 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3258 // Use result of ANDS
3259 return LHS.getValue(R: 1);
3260 }
3261 }
3262
3263 return DAG.getNode(Opcode, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC), N1: LHS, N2: RHS)
3264 .getValue(R: 1);
3265}
3266
3267/// \defgroup AArch64CCMP CMP;CCMP matching
3268///
3269/// These functions deal with the formation of CMP;CCMP;... sequences.
3270/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3271/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows us to express arbitrary conjunctions, for
3273/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3274/// expressed as:
3275/// cmp A
3276/// ccmp B, inv(CB), CA
3277/// check for CB flags
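///
/// As a concrete (illustrative) instance, a source-level test such as
/// "a == 0 && b > 5" can be emitted as:
///   cmp  w0, #0
///   ccmp w1, #5, #4, eq   ; if a != 0, force NZCV=0b0100 so "gt" fails
///   b.gt <target>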
3278///
3279/// This naturally lets us implement chains of AND operations with SETCC
3280/// operands. And we can even implement some other situations by transforming
3281/// them:
3282/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3283/// negating the flags used in a CCMP/FCCMP operations.
3284/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3285/// by negating the flags we test for afterwards. i.e.
///     NEG (CMP CCMP CCMP ...) can be implemented.
3287/// - Note that we can only ever negate all previously processed results.
3288/// What we can not implement by flipping the flags to test is a negation
3289/// of two sub-trees (because the negation affects all sub-trees emitted so
3290/// far, so the 2nd sub-tree we emit would also affect the first).
3291/// With those tools we can implement some OR operations:
3292/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3293/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3294/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3295/// elimination rules from earlier to implement the whole thing as a
3296/// CCMP/FCCMP chain.
3297///
3298/// As complete example:
3299/// or (or (setCA (cmp A)) (setCB (cmp B)))
3300/// (and (setCC (cmp C)) (setCD (cmp D)))"
3301/// can be reassociated to:
3302/// or (and (setCC (cmp C)) setCD (cmp D))
///             (or (setCA (cmp A)) (setCB (cmp B)))
3304/// can be transformed to:
3305/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3306/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3307/// which can be implemented as:
3308/// cmp C
3309/// ccmp D, inv(CD), CC
3310/// ccmp A, CA, inv(CD)
3311/// ccmp B, CB, inv(CA)
3312/// check for CB flags
3313///
3314/// A counterexample is "or (and A B) (and C D)" which translates to
/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
/// can only implement one of the inner (not) operations, but not both!
3317/// @{
3318
3319/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3320static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3321 ISD::CondCode CC, SDValue CCOp,
3322 AArch64CC::CondCode Predicate,
3323 AArch64CC::CondCode OutCC,
3324 const SDLoc &DL, SelectionDAG &DAG) {
3325 unsigned Opcode = 0;
3326 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3327
3328 if (LHS.getValueType().isFloatingPoint()) {
3329 assert(LHS.getValueType() != MVT::f128);
3330 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3331 LHS.getValueType() == MVT::bf16) {
3332 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3333 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3334 }
3335 Opcode = AArch64ISD::FCCMP;
3336 } else if (RHS.getOpcode() == ISD::SUB) {
3337 SDValue SubOp0 = RHS.getOperand(i: 0);
3338 if (isNullConstant(V: SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3339 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3340 Opcode = AArch64ISD::CCMN;
3341 RHS = RHS.getOperand(i: 1);
3342 }
3343 }
3344 if (Opcode == 0)
3345 Opcode = AArch64ISD::CCMP;
3346
3347 SDValue Condition = DAG.getConstant(Val: Predicate, DL, VT: MVT_CC);
3348 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
3349 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvOutCC);
3350 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3351 return DAG.getNode(Opcode, DL, VT: MVT_CC, N1: LHS, N2: RHS, N3: NZCVOp, N4: Condition, N5: CCOp);
3352}
3353
3354/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3355/// expressed as a conjunction. See \ref AArch64CCMP.
3356/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3357/// changing the conditions on the SETCC tests.
3358/// (this means we can call emitConjunctionRec() with
3359/// Negate==true on this sub-tree)
3360/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3361/// cannot do the negation naturally. We are required to
3362/// emit the subtree first in this case.
/// \param WillNegate Is true if we are called when the result of this
3364/// subexpression must be negated. This happens when the
3365/// outer expression is an OR. We can use this fact to know
3366/// that we have a double negation (or (or ...) ...) that
3367/// can be implemented for free.
3368static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3369 bool &MustBeFirst, bool WillNegate,
3370 unsigned Depth = 0) {
3371 if (!Val.hasOneUse())
3372 return false;
3373 unsigned Opcode = Val->getOpcode();
3374 if (Opcode == ISD::SETCC) {
3375 if (Val->getOperand(0).getValueType() == MVT::f128)
3376 return false;
3377 CanNegate = true;
3378 MustBeFirst = false;
3379 return true;
3380 }
3381 // Protect against exponential runtime and stack overflow.
3382 if (Depth > 6)
3383 return false;
3384 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3385 bool IsOR = Opcode == ISD::OR;
3386 SDValue O0 = Val->getOperand(Num: 0);
3387 SDValue O1 = Val->getOperand(Num: 1);
3388 bool CanNegateL;
3389 bool MustBeFirstL;
3390 if (!canEmitConjunction(Val: O0, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, Depth: Depth+1))
3391 return false;
3392 bool CanNegateR;
3393 bool MustBeFirstR;
3394 if (!canEmitConjunction(Val: O1, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, Depth: Depth+1))
3395 return false;
3396
3397 if (MustBeFirstL && MustBeFirstR)
3398 return false;
3399
3400 if (IsOR) {
3401 // For an OR expression we need to be able to naturally negate at least
3402 // one side or we cannot do the transformation at all.
3403 if (!CanNegateL && !CanNegateR)
3404 return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
3407 CanNegate = WillNegate && CanNegateL && CanNegateR;
3408 // If we cannot naturally negate the whole sub-tree, then this must be
3409 // emitted first.
3410 MustBeFirst = !CanNegate;
3411 } else {
3412 assert(Opcode == ISD::AND && "Must be OR or AND");
3413 // We cannot naturally negate an AND operation.
3414 CanNegate = false;
3415 MustBeFirst = MustBeFirstL || MustBeFirstR;
3416 }
3417 return true;
3418 }
3419 return false;
3420}
3421
3422/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val into a series of
/// compare and conditional compare operations. @returns an NZCV flags
/// producing node and sets @p OutCC to the flags that should be tested, or
/// returns SDValue() if the transformation was not possible.
/// \p Negate is true if we want this sub-tree to be negated just by changing
3429/// SETCC conditions.
3430static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3431 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3432 AArch64CC::CondCode Predicate) {
3433 // We're at a tree leaf, produce a conditional comparison operation.
3434 unsigned Opcode = Val->getOpcode();
3435 if (Opcode == ISD::SETCC) {
3436 SDValue LHS = Val->getOperand(Num: 0);
3437 SDValue RHS = Val->getOperand(Num: 1);
3438 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Val->getOperand(Num: 2))->get();
3439 bool isInteger = LHS.getValueType().isInteger();
3440 if (Negate)
3441 CC = getSetCCInverse(Operation: CC, Type: LHS.getValueType());
3442 SDLoc DL(Val);
3443 // Determine OutCC and handle FP special case.
3444 if (isInteger) {
3445 OutCC = changeIntCCToAArch64CC(CC);
3446 } else {
3447 assert(LHS.getValueType().isFloatingPoint());
3448 AArch64CC::CondCode ExtraCC;
3449 changeFPCCToANDAArch64CC(CC, CondCode&: OutCC, CondCode2&: ExtraCC);
3450 // Some floating point conditions can't be tested with a single condition
3451 // code. Construct an additional comparison in this case.
3452 if (ExtraCC != AArch64CC::AL) {
3453 SDValue ExtraCmp;
3454 if (!CCOp.getNode())
3455 ExtraCmp = emitComparison(LHS, RHS, CC, dl: DL, DAG);
3456 else
3457 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3458 OutCC: ExtraCC, DL, DAG);
3459 CCOp = ExtraCmp;
3460 Predicate = ExtraCC;
3461 }
3462 }
3463
3464 // Produce a normal comparison if we are first in the chain
3465 if (!CCOp)
3466 return emitComparison(LHS, RHS, CC, dl: DL, DAG);
3467 // Otherwise produce a ccmp.
3468 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3469 DAG);
3470 }
3471 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3472
3473 bool IsOR = Opcode == ISD::OR;
3474
3475 SDValue LHS = Val->getOperand(Num: 0);
3476 bool CanNegateL;
3477 bool MustBeFirstL;
3478 bool ValidL = canEmitConjunction(Val: LHS, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR);
3479 assert(ValidL && "Valid conjunction/disjunction tree");
3480 (void)ValidL;
3481
3482 SDValue RHS = Val->getOperand(Num: 1);
3483 bool CanNegateR;
3484 bool MustBeFirstR;
3485 bool ValidR = canEmitConjunction(Val: RHS, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR);
3486 assert(ValidR && "Valid conjunction/disjunction tree");
3487 (void)ValidR;
3488
3489 // Swap sub-tree that must come first to the right side.
3490 if (MustBeFirstL) {
3491 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3492 std::swap(a&: LHS, b&: RHS);
3493 std::swap(a&: CanNegateL, b&: CanNegateR);
3494 std::swap(a&: MustBeFirstL, b&: MustBeFirstR);
3495 }
3496
3497 bool NegateR;
3498 bool NegateAfterR;
3499 bool NegateL;
3500 bool NegateAfterAll;
3501 if (Opcode == ISD::OR) {
3502 // Swap the sub-tree that we can negate naturally to the left.
3503 if (!CanNegateL) {
3504 assert(CanNegateR && "at least one side must be negatable");
3505 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3506 assert(!Negate);
3507 std::swap(a&: LHS, b&: RHS);
3508 NegateR = false;
3509 NegateAfterR = true;
3510 } else {
3511 // Negate the left sub-tree if possible, otherwise negate the result.
3512 NegateR = CanNegateR;
3513 NegateAfterR = !CanNegateR;
3514 }
3515 NegateL = true;
3516 NegateAfterAll = !Negate;
3517 } else {
3518 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3519 assert(!Negate && "Valid conjunction/disjunction tree");
3520
3521 NegateL = false;
3522 NegateR = false;
3523 NegateAfterR = false;
3524 NegateAfterAll = false;
3525 }
3526
3527 // Emit sub-trees.
3528 AArch64CC::CondCode RHSCC;
3529 SDValue CmpR = emitConjunctionRec(DAG, Val: RHS, OutCC&: RHSCC, Negate: NegateR, CCOp, Predicate);
3530 if (NegateAfterR)
3531 RHSCC = AArch64CC::getInvertedCondCode(Code: RHSCC);
3532 SDValue CmpL = emitConjunctionRec(DAG, Val: LHS, OutCC, Negate: NegateL, CCOp: CmpR, Predicate: RHSCC);
3533 if (NegateAfterAll)
3534 OutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
3535 return CmpL;
3536}
3537
/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3539/// In some cases this is even possible with OR operations in the expression.
3540/// See \ref AArch64CCMP.
3541/// \see emitConjunctionRec().
3542static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3543 AArch64CC::CondCode &OutCC) {
3544 bool DummyCanNegate;
3545 bool DummyMustBeFirst;
3546 if (!canEmitConjunction(Val, CanNegate&: DummyCanNegate, MustBeFirst&: DummyMustBeFirst, WillNegate: false))
3547 return SDValue();
3548
3549 return emitConjunctionRec(DAG, Val, OutCC, Negate: false, CCOp: SDValue(), Predicate: AArch64CC::AL);
3550}
3551
3552/// @}
3553
3554/// Returns how profitable it is to fold a comparison's operand's shift and/or
3555/// extension operations.
3556static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3557 auto isSupportedExtend = [&](SDValue V) {
3558 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3559 return true;
3560
3561 if (V.getOpcode() == ISD::AND)
3562 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1))) {
3563 uint64_t Mask = MaskCst->getZExtValue();
3564 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3565 }
3566
3567 return false;
3568 };
3569
3570 if (!Op.hasOneUse())
3571 return 0;
3572
3573 if (isSupportedExtend(Op))
3574 return 1;
3575
3576 unsigned Opc = Op.getOpcode();
3577 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3578 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
3579 uint64_t Shift = ShiftCst->getZExtValue();
3580 if (isSupportedExtend(Op.getOperand(i: 0)))
3581 return (Shift <= 4) ? 2 : 1;
3582 EVT VT = Op.getValueType();
3583 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3584 return 1;
3585 }
3586
3587 return 0;
3588}
3589
3590static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3591 SDValue &AArch64cc, SelectionDAG &DAG,
3592 const SDLoc &dl) {
3593 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val: RHS.getNode())) {
3594 EVT VT = RHS.getValueType();
3595 uint64_t C = RHSC->getZExtValue();
3596 if (!isLegalArithImmed(C)) {
3597 // Constant does not fit, try adjusting it by one?
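      // For example (illustrative), "x < 0x1001" cannot use 0x1001 directly,
      // but it is equivalent to "x <= 0x1000", and 0x1000 is encodable.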
3598 switch (CC) {
3599 default:
3600 break;
3601 case ISD::SETLT:
3602 case ISD::SETGE:
3603 if ((VT == MVT::i32 && C != 0x80000000 &&
3604 isLegalArithImmed((uint32_t)(C - 1))) ||
3605 (VT == MVT::i64 && C != 0x80000000ULL &&
3606 isLegalArithImmed(C - 1ULL))) {
3607 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3608 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3609 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3610 }
3611 break;
3612 case ISD::SETULT:
3613 case ISD::SETUGE:
3614 if ((VT == MVT::i32 && C != 0 &&
3615 isLegalArithImmed((uint32_t)(C - 1))) ||
3616 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3617 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3618 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3619 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3620 }
3621 break;
3622 case ISD::SETLE:
3623 case ISD::SETGT:
3624 if ((VT == MVT::i32 && C != INT32_MAX &&
3625 isLegalArithImmed((uint32_t)(C + 1))) ||
3626 (VT == MVT::i64 && C != INT64_MAX &&
3627 isLegalArithImmed(C + 1ULL))) {
3628 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3629 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3630 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3631 }
3632 break;
3633 case ISD::SETULE:
3634 case ISD::SETUGT:
3635 if ((VT == MVT::i32 && C != UINT32_MAX &&
3636 isLegalArithImmed((uint32_t)(C + 1))) ||
3637 (VT == MVT::i64 && C != UINT64_MAX &&
3638 isLegalArithImmed(C + 1ULL))) {
3639 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3640 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3641 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3642 }
3643 break;
3644 }
3645 }
3646 }
3647
3648 // Comparisons are canonicalized so that the RHS operand is simpler than the
3649 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3650 // can fold some shift+extend operations on the RHS operand, so swap the
3651 // operands if that can be done.
3652 //
3653 // For example:
3654 // lsl w13, w11, #1
3655 // cmp w13, w12
3656 // can be turned into:
3657 // cmp w12, w11, lsl #1
3658 if (!isa<ConstantSDNode>(Val: RHS) || !isLegalArithImmed(C: RHS->getAsZExtVal())) {
3659 SDValue TheLHS = isCMN(Op: LHS, CC) ? LHS.getOperand(i: 1) : LHS;
3660
3661 if (getCmpOperandFoldingProfit(Op: TheLHS) > getCmpOperandFoldingProfit(Op: RHS)) {
3662 std::swap(a&: LHS, b&: RHS);
3663 CC = ISD::getSetCCSwappedOperands(Operation: CC);
3664 }
3665 }
3666
3667 SDValue Cmp;
3668 AArch64CC::CondCode AArch64CC;
3669 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(Val: RHS)) {
3670 const ConstantSDNode *RHSC = cast<ConstantSDNode>(Val&: RHS);
3671
3672 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3673 // For the i8 operand, the largest immediate is 255, so this can be easily
3674 // encoded in the compare instruction. For the i16 operand, however, the
3675 // largest immediate cannot be encoded in the compare.
3676 // Therefore, use a sign extending load and cmn to avoid materializing the
3677 // -1 constant. For example,
3678 // movz w1, #65535
3679 // ldrh w0, [x0, #0]
3680 // cmp w0, w1
3681 // >
3682 // ldrsh w0, [x0, #0]
3683 // cmn w0, #1
    // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3685 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3686 // ensure both the LHS and RHS are truly zero extended and to make sure the
3687 // transformation is profitable.
3688 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3689 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3690 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3691 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3692 int16_t ValueofRHS = RHS->getAsZExtVal();
3693 if (ValueofRHS < 0 && isLegalArithImmed(C: -ValueofRHS)) {
3694 SDValue SExt =
3695 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3696 DAG.getValueType(MVT::i16));
3697 Cmp = emitComparison(LHS: SExt, RHS: DAG.getConstant(Val: ValueofRHS, DL: dl,
3698 VT: RHS.getValueType()),
3699 CC, dl, DAG);
3700 AArch64CC = changeIntCCToAArch64CC(CC);
3701 }
3702 }
3703
3704 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3705 if ((Cmp = emitConjunction(DAG, Val: LHS, OutCC&: AArch64CC))) {
3706 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3707 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
3708 }
3709 }
3710 }
3711
3712 if (!Cmp) {
3713 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3714 AArch64CC = changeIntCCToAArch64CC(CC);
3715 }
3716 AArch64cc = DAG.getConstant(Val: AArch64CC, DL: dl, VT: MVT_CC);
3717 return Cmp;
3718}
3719
3720static std::pair<SDValue, SDValue>
3721getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3722 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3723 "Unsupported value type");
3724 SDValue Value, Overflow;
3725 SDLoc DL(Op);
3726 SDValue LHS = Op.getOperand(i: 0);
3727 SDValue RHS = Op.getOperand(i: 1);
3728 unsigned Opc = 0;
3729 switch (Op.getOpcode()) {
3730 default:
3731 llvm_unreachable("Unknown overflow instruction!");
3732 case ISD::SADDO:
3733 Opc = AArch64ISD::ADDS;
3734 CC = AArch64CC::VS;
3735 break;
3736 case ISD::UADDO:
3737 Opc = AArch64ISD::ADDS;
3738 CC = AArch64CC::HS;
3739 break;
3740 case ISD::SSUBO:
3741 Opc = AArch64ISD::SUBS;
3742 CC = AArch64CC::VS;
3743 break;
3744 case ISD::USUBO:
3745 Opc = AArch64ISD::SUBS;
3746 CC = AArch64CC::LO;
3747 break;
  // Multiply needs a little bit of extra work.
3749 case ISD::SMULO:
3750 case ISD::UMULO: {
3751 CC = AArch64CC::NE;
3752 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3753 if (Op.getValueType() == MVT::i32) {
3754 // Extend to 64-bits, then perform a 64-bit multiply.
3755 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3756 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3757 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3758 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3759 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3760
3761 // Check that the result fits into a 32-bit integer.
3762 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3763 if (IsSigned) {
3764 // cmp xreg, wreg, sxtw
3765 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3766 Overflow =
3767 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Mul, N2: SExtMul).getValue(R: 1);
3768 } else {
3769 // tst xreg, #0xffffffff00000000
3770 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3771 Overflow =
3772 DAG.getNode(Opcode: AArch64ISD::ANDS, DL, VTList: VTs, N1: Mul, N2: UpperBits).getValue(R: 1);
3773 }
3774 break;
3775 }
3776 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
    // For the 64-bit multiply, check for overflow using the high half of the
    // product.
3778 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3779 if (IsSigned) {
3780 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3781 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3782 DAG.getConstant(63, DL, MVT::i64));
3783 // It is important that LowerBits is last, otherwise the arithmetic
3784 // shift will not be folded into the compare (SUBS).
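      // i.e. this lets the compare be selected as a SUBS with an ASR-shifted
      // register operand (a single "cmp reg, reg, asr #63") instead of a
      // separate shift followed by a plain compare.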
3785 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3786 Overflow = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: UpperBits, N2: LowerBits)
3787 .getValue(R: 1);
3788 } else {
3789 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3790 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3791 Overflow =
3792 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3793 DAG.getConstant(0, DL, MVT::i64),
3794 UpperBits).getValue(1);
3795 }
3796 break;
3797 }
3798 } // switch (...)
3799
3800 if (Opc) {
3801 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3802
3803 // Emit the AArch64 operation with overflow check.
3804 Value = DAG.getNode(Opcode: Opc, DL, VTList: VTs, N1: LHS, N2: RHS);
3805 Overflow = Value.getValue(R: 1);
3806 }
3807 return std::make_pair(x&: Value, y&: Overflow);
3808}
3809
3810SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3811 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
3812 OverrideNEON: !Subtarget->isNeonAvailable()))
3813 return LowerToScalableOp(Op, DAG);
3814
3815 SDValue Sel = Op.getOperand(i: 0);
3816 SDValue Other = Op.getOperand(i: 1);
3817 SDLoc dl(Sel);
3818
3819 // If the operand is an overflow checking operation, invert the condition
3820 // code and kill the Not operation. I.e., transform:
3821 // (xor (overflow_op_bool, 1))
3822 // -->
3823 // (csel 1, 0, invert(cc), overflow_op_bool)
3824 // ... which later gets transformed to just a cset instruction with an
3825 // inverted condition code, rather than a cset + eor sequence.
3826 if (isOneConstant(V: Other) && ISD::isOverflowIntrOpRes(Op: Sel)) {
3827 // Only lower legal XALUO ops.
3828 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Sel->getValueType(ResNo: 0)))
3829 return SDValue();
3830
3831 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3832 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3833 AArch64CC::CondCode CC;
3834 SDValue Value, Overflow;
3835 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op: Sel.getValue(R: 0), DAG);
3836 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3837 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT: Op.getValueType(), N1: TVal, N2: FVal,
3838 N3: CCVal, N4: Overflow);
3839 }
3840 // If neither operand is a SELECT_CC, give up.
3841 if (Sel.getOpcode() != ISD::SELECT_CC)
3842 std::swap(a&: Sel, b&: Other);
3843 if (Sel.getOpcode() != ISD::SELECT_CC)
3844 return Op;
3845
3846 // The folding we want to perform is:
3847 // (xor x, (select_cc a, b, cc, 0, -1) )
3848 // -->
3849 // (csel x, (xor x, -1), cc ...)
3850 //
3851 // The latter will get matched to a CSINV instruction.
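  //
  // For reference (illustrative operands): "csinv w0, w1, w1, <cc>" yields w1
  // when <cc> holds and ~w1 otherwise, which is exactly the pattern above.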
3852
3853 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Sel.getOperand(i: 4))->get();
3854 SDValue LHS = Sel.getOperand(i: 0);
3855 SDValue RHS = Sel.getOperand(i: 1);
3856 SDValue TVal = Sel.getOperand(i: 2);
3857 SDValue FVal = Sel.getOperand(i: 3);
3858
3859 // FIXME: This could be generalized to non-integer comparisons.
3860 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3861 return Op;
3862
3863 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
3864 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
3865
3866 // The values aren't constants, this isn't the pattern we're looking for.
3867 if (!CFVal || !CTVal)
3868 return Op;
3869
3870 // We can commute the SELECT_CC by inverting the condition. This
3871 // might be needed to make this fit into a CSINV pattern.
3872 if (CTVal->isAllOnes() && CFVal->isZero()) {
3873 std::swap(a&: TVal, b&: FVal);
3874 std::swap(a&: CTVal, b&: CFVal);
3875 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
3876 }
3877
3878 // If the constants line up, perform the transform!
3879 if (CTVal->isZero() && CFVal->isAllOnes()) {
3880 SDValue CCVal;
3881 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl);
3882
3883 FVal = Other;
3884 TVal = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: Other.getValueType(), N1: Other,
3885 N2: DAG.getConstant(Val: -1ULL, DL: dl, VT: Other.getValueType()));
3886
3887 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT: Sel.getValueType(), N1: FVal, N2: TVal,
3888 N3: CCVal, N4: Cmp);
3889 }
3890
3891 return Op;
3892}
3893
3894// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3895// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3896// sets 'C' bit to 0.
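// With Invert == false this boils down to a SUBS of (Value - 1), whose carry
// is set exactly when Value is non-zero; with Invert == true it computes
// (0 - Value), whose carry is set only when Value is zero.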
3897static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3898 SDLoc DL(Value);
3899 EVT VT = Value.getValueType();
3900 SDValue Op0 = Invert ? DAG.getConstant(Val: 0, DL, VT) : Value;
3901 SDValue Op1 = Invert ? Value : DAG.getConstant(Val: 1, DL, VT);
3902 SDValue Cmp =
3903 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3904 return Cmp.getValue(R: 1);
3905}
3906
3907// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3908// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3909static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
3910 bool Invert) {
3911 assert(Glue.getResNo() == 1);
3912 SDLoc DL(Glue);
3913 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
3914 SDValue One = DAG.getConstant(Val: 1, DL, VT);
3915 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3916 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3917 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue);
3918}
3919
3920// Value is 1 if 'V' bit of NZCV is 1, else 0
3921static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
3922 assert(Glue.getResNo() == 1);
3923 SDLoc DL(Glue);
3924 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
3925 SDValue One = DAG.getConstant(Val: 1, DL, VT);
3926 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3927 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue);
3928}
3929
3930// This lowering is inefficient, but it will get cleaned up by
3931// `foldOverflowCheck`
3932static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
3933 unsigned Opcode, bool IsSigned) {
3934 EVT VT0 = Op.getValue(R: 0).getValueType();
3935 EVT VT1 = Op.getValue(R: 1).getValueType();
3936
3937 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3938 return SDValue();
3939
3940 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3941 SDValue OpLHS = Op.getOperand(i: 0);
3942 SDValue OpRHS = Op.getOperand(i: 1);
3943 SDValue OpCarryIn = valueToCarryFlag(Value: Op.getOperand(i: 2), DAG, Invert: InvertCarry);
3944
3945 SDLoc DL(Op);
3946 SDVTList VTs = DAG.getVTList(VT1: VT0, VT2: VT1);
3947
3948 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3949 OpRHS, OpCarryIn);
3950
3951 SDValue OutFlag =
3952 IsSigned ? overflowFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG)
3953 : carryFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG, Invert: InvertCarry);
3954
3955 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: VTs, N1: Sum, N2: OutFlag);
3956}
3957
3958static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3959 // Let legalize expand this if it isn't a legal type yet.
3960 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Op.getValueType()))
3961 return SDValue();
3962
3963 SDLoc dl(Op);
3964 AArch64CC::CondCode CC;
3965 // The actual operation that sets the overflow or carry flag.
3966 SDValue Value, Overflow;
3967 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3968
3969 // We use 0 and 1 as false and true values.
3970 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3971 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3972
3973 // We use an inverted condition, because the conditional select is inverted
3974 // too. This will allow it to be selected to a single instruction:
3975 // CSINC Wd, WZR, WZR, invert(cond).
3976 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3977 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3978 CCVal, Overflow);
3979
3980 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3981 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: VTs, N1: Value, N2: Overflow);
3982}
3983
3984// Prefetch operands are:
3985// 1: Address to prefetch
3986// 2: bool isWrite
3987// 3: int locality (0 = no locality ... 3 = extreme locality)
3988// 4: bool isDataCache
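// For example (illustrative), a read prefetch of data with locality 3 encodes
// as PLDL1KEEP (PrfOp 0b00000), while locality 0 selects the streaming hint
// PLDL1STRM (PrfOp 0b00001).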
3989static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3990 SDLoc DL(Op);
3991 unsigned IsWrite = Op.getConstantOperandVal(i: 2);
3992 unsigned Locality = Op.getConstantOperandVal(i: 3);
3993 unsigned IsData = Op.getConstantOperandVal(i: 4);
3994
3995 bool IsStream = !Locality;
3996 // When the locality number is set
3997 if (Locality) {
3998 // The front-end should have filtered out the out-of-range values
3999 assert(Locality <= 3 && "Prefetch locality out-of-range");
    // A higher locality value targets a closer (faster) cache level, but the
    // prefetch encoding numbers the levels starting at 0 for L1, so invert
    // the value.
4003 Locality = 3 - Locality;
4004 }
4005
  // Build the mask value encoding the expected behavior.
4007 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4008 (!IsData << 3) | // IsDataCache bit
4009 (Locality << 1) | // Cache level bits
4010 (unsigned)IsStream; // Stream bit
4011 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4012 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4013 Op.getOperand(1));
4014}
4015
4016SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4017 SelectionDAG &DAG) const {
4018 EVT VT = Op.getValueType();
4019 if (VT.isScalableVector())
4020 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4021
4022 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
4023 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4024
4025 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4026 return SDValue();
4027}
4028
4029SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4030 SelectionDAG &DAG) const {
4031 EVT VT = Op.getValueType();
4032 if (VT.isScalableVector())
4033 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4034
4035 bool IsStrict = Op->isStrictFPOpcode();
4036 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4037 EVT SrcVT = SrcVal.getValueType();
4038 bool Trunc = Op.getConstantOperandVal(i: IsStrict ? 2 : 1) == 1;
4039
4040 if (useSVEForFixedLengthVectorVT(VT: SrcVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4041 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4042
4043 // Expand cases where the result type is BF16 but we don't have hardware
4044 // instructions to lower it.
4045 if (VT.getScalarType() == MVT::bf16 &&
4046 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4047 Subtarget->hasBF16())) {
4048 SDLoc dl(Op);
4049 SDValue Narrow = SrcVal;
4050 SDValue NaN;
4051 EVT I32 = SrcVT.changeElementType(MVT::i32);
4052 EVT F32 = SrcVT.changeElementType(MVT::f32);
4053 if (SrcVT.getScalarType() == MVT::f32) {
4054 bool NeverSNaN = DAG.isKnownNeverSNaN(Op: Narrow);
4055 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: I32, Operand: Narrow);
4056 if (!NeverSNaN) {
4057 // Set the quiet bit.
4058 NaN = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: I32, N1: Narrow,
4059 N2: DAG.getConstant(Val: 0x400000, DL: dl, VT: I32));
4060 }
4061 } else if (SrcVT.getScalarType() == MVT::f64) {
4062 Narrow = DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL: dl, VT: F32, Operand: Narrow);
4063 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: I32, Operand: Narrow);
4064 } else {
4065 return SDValue();
4066 }
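    // Round to nearest even by adding 0x7fff plus the current lsb of the
    // truncated result. For example (illustrative), the f32 bit pattern
    // 0x3f808000 (exactly half way between two bf16 values) rounds down to
    // 0x3f80, while 0x3f818000 rounds up to 0x3f82.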
4067 if (!Trunc) {
4068 SDValue One = DAG.getConstant(Val: 1, DL: dl, VT: I32);
4069 SDValue Lsb = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: I32, N1: Narrow,
4070 N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL: dl));
4071 Lsb = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: I32, N1: Lsb, N2: One);
4072 SDValue RoundingBias =
4073 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: I32, N1: DAG.getConstant(Val: 0x7fff, DL: dl, VT: I32), N2: Lsb);
4074 Narrow = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: I32, N1: Narrow, N2: RoundingBias);
4075 }
4076
    // Don't round if we had a NaN: we don't want to turn 0x7fffffff into
    // 0x80000000.
4079 if (NaN) {
4080 SDValue IsNaN = DAG.getSetCC(
4081 DL: dl, VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: SrcVT),
4082 LHS: SrcVal, RHS: SrcVal, Cond: ISD::SETUO);
4083 Narrow = DAG.getSelect(DL: dl, VT: I32, Cond: IsNaN, LHS: NaN, RHS: Narrow);
4084 }
4085
4086 // Now that we have rounded, shift the bits into position.
4087 Narrow = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: I32, N1: Narrow,
4088 N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL: dl));
4089 if (VT.isVector()) {
4090 EVT I16 = I32.changeVectorElementType(MVT::i16);
4091 Narrow = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: I16, Operand: Narrow);
4092 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Narrow);
4093 }
4094 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: F32, Operand: Narrow);
4095 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4096 return IsStrict ? DAG.getMergeValues(Ops: {Result, Op.getOperand(i: 0)}, dl)
4097 : Result;
4098 }
4099
4100 if (SrcVT != MVT::f128) {
4101 // Expand cases where the input is a vector bigger than NEON.
4102 if (useSVEForFixedLengthVectorVT(VT: SrcVT))
4103 return SDValue();
4104
4105 // It's legal except when f128 is involved
4106 return Op;
4107 }
4108
4109 return SDValue();
4110}
4111
4112SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4113 SelectionDAG &DAG) const {
4114 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4115 // Any additional optimization in this function should be recorded
4116 // in the cost tables.
4117 bool IsStrict = Op->isStrictFPOpcode();
4118 EVT InVT = Op.getOperand(i: IsStrict ? 1 : 0).getValueType();
4119 EVT VT = Op.getValueType();
4120
4121 if (VT.isScalableVector()) {
4122 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4123 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4124 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4125 return LowerToPredicatedOp(Op, DAG, NewOp: Opcode);
4126 }
4127
4128 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) ||
4129 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4130 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4131
4132 unsigned NumElts = InVT.getVectorNumElements();
4133
  // f16 conversions are promoted to f32 when full fp16 is not supported;
  // bf16 conversions are always promoted.
4135 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4136 InVT.getVectorElementType() == MVT::bf16) {
4137 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4138 SDLoc dl(Op);
4139 if (IsStrict) {
4140 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4141 {Op.getOperand(0), Op.getOperand(1)});
4142 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4143 {Ext.getValue(1), Ext.getValue(0)});
4144 }
4145 return DAG.getNode(
4146 Opcode: Op.getOpcode(), DL: dl, VT: Op.getValueType(),
4147 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: NewVT, Operand: Op.getOperand(i: 0)));
4148 }
4149
4150 uint64_t VTSize = VT.getFixedSizeInBits();
4151 uint64_t InVTSize = InVT.getFixedSizeInBits();
4152 if (VTSize < InVTSize) {
4153 SDLoc dl(Op);
4154 if (IsStrict) {
4155 InVT = InVT.changeVectorElementTypeToInteger();
4156 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4157 {Op.getOperand(0), Op.getOperand(1)});
4158 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Cv);
4159 return DAG.getMergeValues(Ops: {Trunc, Cv.getValue(R: 1)}, dl);
4160 }
4161 SDValue Cv =
4162 DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: InVT.changeVectorElementTypeToInteger(),
4163 Operand: Op.getOperand(i: 0));
4164 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Cv);
4165 }
4166
4167 if (VTSize > InVTSize) {
4168 SDLoc dl(Op);
4169 MVT ExtVT =
4170 MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: VT.getScalarSizeInBits()),
4171 NumElements: VT.getVectorNumElements());
4172 if (IsStrict) {
4173 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4174 {Op.getOperand(0), Op.getOperand(1)});
4175 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4176 {Ext.getValue(1), Ext.getValue(0)});
4177 }
4178 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: ExtVT, Operand: Op.getOperand(i: 0));
4179 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT, Operand: Ext);
4180 }
4181
4182 // Use a scalar operation for conversions between single-element vectors of
4183 // the same size.
4184 if (NumElts == 1) {
4185 SDLoc dl(Op);
4186 SDValue Extract = DAG.getNode(
4187 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4188 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4189 EVT ScalarVT = VT.getScalarType();
4190 if (IsStrict)
4191 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4192 {Op.getOperand(0), Extract});
4193 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: ScalarVT, Operand: Extract);
4194 }
4195
4196 // Type changing conversions are illegal.
4197 return Op;
4198}
4199
4200SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4201 SelectionDAG &DAG) const {
4202 bool IsStrict = Op->isStrictFPOpcode();
4203 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4204
4205 if (SrcVal.getValueType().isVector())
4206 return LowerVectorFP_TO_INT(Op, DAG);
4207
  // f16 conversions are promoted to f32 when full fp16 is not supported;
  // bf16 conversions are always promoted.
4209 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4210 SrcVal.getValueType() == MVT::bf16) {
4211 SDLoc dl(Op);
4212 if (IsStrict) {
4213 SDValue Ext =
4214 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4215 {Op.getOperand(0), SrcVal});
4216 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4217 {Ext.getValue(1), Ext.getValue(0)});
4218 }
4219 return DAG.getNode(
4220 Op.getOpcode(), dl, Op.getValueType(),
4221 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4222 }
4223
4224 if (SrcVal.getValueType() != MVT::f128) {
4225 // It's legal except when f128 is involved
4226 return Op;
4227 }
4228
4229 return SDValue();
4230}
4231
4232SDValue
4233AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4234 SelectionDAG &DAG) const {
4235 // AArch64 FP-to-int conversions saturate to the destination element size, so
4236 // we can lower common saturating conversions to simple instructions.
4237 SDValue SrcVal = Op.getOperand(i: 0);
4238 EVT SrcVT = SrcVal.getValueType();
4239 EVT DstVT = Op.getValueType();
4240 EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
4241
4242 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4243 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4244 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4245 assert(SatWidth <= DstElementWidth &&
4246 "Saturation width cannot exceed result width");
4247
4248 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4249 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4250 // types, so this is hard to reach.
4251 if (DstVT.isScalableVector())
4252 return SDValue();
4253
4254 EVT SrcElementVT = SrcVT.getVectorElementType();
4255
4256 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4257 if ((SrcElementVT == MVT::f16 &&
4258 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4259 SrcElementVT == MVT::bf16) {
4260 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4261 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT: F32VT, Operand: SrcVal);
4262 SrcVT = F32VT;
4263 SrcElementVT = MVT::f32;
4264 SrcElementWidth = 32;
4265 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4266 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4267 return SDValue();
4268
4269 SDLoc DL(Op);
4270 // Cases that we can emit directly.
4271 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4272 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal,
4273 N2: DAG.getValueType(DstVT.getScalarType()));
4274
  // Otherwise emit a cvt that saturates to the wider, native bit width, then
  // clamp the result down to the requested saturation width. This is only
  // valid if the native cvt is at least as wide as the saturate width. For
  // f64, as we don't have vector MIN/MAX for 64-bit elements, it can be
  // simpler to scalarize (at least until sqxtn is selected).
4279 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4280 return SDValue();
4281
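  // For example (illustrative): a v4f32 fptosi.sat to v4i16 becomes a
  // saturating conversion to v4i32 (fcvtzs), an smin with 32767, an smax with
  // -32768, and a truncate to v4i16.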
4282 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4283 SDValue NativeCvt = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: IntVT, N1: SrcVal,
4284 N2: DAG.getValueType(IntVT.getScalarType()));
4285 SDValue Sat;
4286 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4287 SDValue MinC = DAG.getConstant(
4288 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT);
4289 SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC);
4290 SDValue MaxC = DAG.getConstant(
4291 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT);
4292 Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: IntVT, N1: Min, N2: MaxC);
4293 } else {
4294 SDValue MinC = DAG.getConstant(
4295 Val: APInt::getAllOnes(numBits: SatWidth).zext(width: SrcElementWidth), DL, VT: IntVT);
4296 Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC);
4297 }
4298
4299 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat);
4300}
4301
4302SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4303 SelectionDAG &DAG) const {
4304 // AArch64 FP-to-int conversions saturate to the destination register size, so
4305 // we can lower common saturating conversions to simple instructions.
4306 SDValue SrcVal = Op.getOperand(i: 0);
4307 EVT SrcVT = SrcVal.getValueType();
4308
4309 if (SrcVT.isVector())
4310 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4311
4312 EVT DstVT = Op.getValueType();
4313 EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
4314 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4315 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4316 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4317
4318 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4319 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4320 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4321 SrcVT = MVT::f32;
4322 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4323 SrcVT != MVT::bf16)
4324 return SDValue();
4325
4326 SDLoc DL(Op);
4327 // Cases that we can emit directly.
4328 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4329 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4330 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4331 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal,
4332 N2: DAG.getValueType(DstVT));
4333
4334 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4335 // result. This is only valid if the legal cvt is larger than the saturate
4336 // width.
4337 if (DstWidth < SatWidth)
4338 return SDValue();
4339
4340 SDValue NativeCvt =
4341 DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal, N2: DAG.getValueType(DstVT));
4342 SDValue Sat;
4343 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4344 SDValue MinC = DAG.getConstant(
4345 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT);
4346 SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC);
4347 SDValue MaxC = DAG.getConstant(
4348 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT);
4349 Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: DstVT, N1: Min, N2: MaxC);
4350 } else {
4351 SDValue MinC = DAG.getConstant(
4352 Val: APInt::getAllOnes(numBits: SatWidth).zext(width: DstWidth), DL, VT: DstVT);
4353 Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC);
4354 }
4355
4356 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat);
4357}
4358
4359SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4360 SelectionDAG &DAG) const {
4361 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4362 // Any additional optimization in this function should be recorded
4363 // in the cost tables.
4364 bool IsStrict = Op->isStrictFPOpcode();
4365 EVT VT = Op.getValueType();
4366 SDLoc dl(Op);
4367 SDValue In = Op.getOperand(i: IsStrict ? 1 : 0);
4368 EVT InVT = In.getValueType();
4369 unsigned Opc = Op.getOpcode();
4370 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4371
4372 if (VT.isScalableVector()) {
4373 if (InVT.getVectorElementType() == MVT::i1) {
      // There is no direct conversion from an SVE predicate; extend the
      // predicate to an integer vector first.
4375 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4376 EVT CastVT = getPromotedVTForPredicate(VT: InVT);
4377 In = DAG.getNode(Opcode: CastOpc, DL: dl, VT: CastVT, Operand: In);
4378 return DAG.getNode(Opcode: Opc, DL: dl, VT, Operand: In);
4379 }
4380
4381 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4382 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4383 return LowerToPredicatedOp(Op, DAG, NewOp: Opcode);
4384 }
4385
4386 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) ||
4387 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4388 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4389
4390 // Promote bf16 conversions to f32.
4391 if (VT.getVectorElementType() == MVT::bf16) {
4392 EVT F32 = VT.changeElementType(MVT::f32);
4393 if (IsStrict) {
4394 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4395 {Op.getOperand(0), In});
4396 return DAG.getNode(
4397 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4398 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4399 }
4400 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: Op.getValueType(),
4401 N1: DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: F32, Operand: In),
4402 N2: DAG.getIntPtrConstant(Val: 0, DL: dl));
4403 }
4404
4405 uint64_t VTSize = VT.getFixedSizeInBits();
4406 uint64_t InVTSize = InVT.getFixedSizeInBits();
4407 if (VTSize < InVTSize) {
4408 MVT CastVT =
4409 MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: InVT.getScalarSizeInBits()),
4410 NumElements: InVT.getVectorNumElements());
4411 if (IsStrict) {
4412 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4413 {Op.getOperand(0), In});
4414 return DAG.getNode(
4415 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4416 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4417 }
4418 In = DAG.getNode(Opcode: Opc, DL: dl, VT: CastVT, Operand: In);
4419 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT, N1: In,
4420 N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
4421 }
4422
4423 if (VTSize > InVTSize) {
4424 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4425 EVT CastVT = VT.changeVectorElementTypeToInteger();
4426 In = DAG.getNode(Opcode: CastOpc, DL: dl, VT: CastVT, Operand: In);
4427 if (IsStrict)
4428 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4429 return DAG.getNode(Opcode: Opc, DL: dl, VT, Operand: In);
4430 }
4431
4432 // Use a scalar operation for conversions between single-element vectors of
4433 // the same size.
4434 if (VT.getVectorNumElements() == 1) {
4435 SDValue Extract = DAG.getNode(
4436 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4437 In, DAG.getConstant(0, dl, MVT::i64));
4438 EVT ScalarVT = VT.getScalarType();
4439 if (IsStrict)
4440 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4441 {Op.getOperand(0), Extract});
4442 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: ScalarVT, Operand: Extract);
4443 }
4444
4445 return Op;
4446}
4447
4448SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4449 SelectionDAG &DAG) const {
4450 if (Op.getValueType().isVector())
4451 return LowerVectorINT_TO_FP(Op, DAG);
4452
4453 bool IsStrict = Op->isStrictFPOpcode();
4454 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4455
4456 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4457 Op->getOpcode() == ISD::SINT_TO_FP;
4458
4459 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4460 SDLoc dl(Op);
4461 if (IsStrict) {
4462 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4463 {Op.getOperand(0), SrcVal});
4464 return DAG.getNode(
4465 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4466 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4467 }
4468 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: Op.getValueType(),
4469 N1: DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromoteVT, Operand: SrcVal),
4470 N2: DAG.getIntPtrConstant(Val: 0, DL: dl));
4471 };
4472
4473 if (Op.getValueType() == MVT::bf16) {
4474 unsigned MaxWidth = IsSigned
4475 ? DAG.ComputeMaxSignificantBits(Op: SrcVal)
4476 : DAG.computeKnownBits(Op: SrcVal).countMaxActiveBits();
    // bf16 conversions are promoted to f32 when the value fits in f32's
    // 24-bit significand (e.g. when converting from i16).
4478 if (MaxWidth <= 24) {
4479 return IntToFpViaPromotion(MVT::f32);
4480 }
4481
    // bf16 conversions are promoted to f64 when the value fits in f64's
    // 53-bit significand (e.g. when converting from i32).
4483 if (MaxWidth <= 53) {
4484 return IntToFpViaPromotion(MVT::f64);
4485 }
4486
    // We need to be careful about i64 -> bf16 because of double rounding.
    // Consider, by analogy, the i32 value 22216703. It cannot be represented
    // exactly as an f32, so an itofp turns it into 22216704.0, and an fptrunc
    // to bf16 then turns that into 22282240.0. However, the correct bf16
    // result is 22151168.0. We need to use sticky rounding to get this right.
4493 if (SrcVal.getValueType() == MVT::i64) {
4494 SDLoc DL(Op);
4495 // This algorithm is equivalent to the following:
4496 // uint64_t SrcHi = SrcVal & ~0xfffull;
4497 // uint64_t SrcLo = SrcVal & 0xfffull;
4498 // uint64_t Highest = SrcVal >> 53;
4499 // bool HasHighest = Highest != 0;
4500 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4501 // double Rounded = static_cast<double>(ToRound);
4502 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4503 // uint64_t HasLo = SrcLo != 0;
4504 // bool NeedsAdjustment = HasHighest & HasLo;
4505 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4506 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4507 // return static_cast<__bf16>(Adjusted);
4508 //
4509 // Essentially, what happens is that SrcVal either fits perfectly in a
4510 // double-precision value or it is too big. If it is sufficiently small,
4511 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4512 // ensure that u64 -> double has no rounding error by only using the 52
4513 // MSB of the input. The low order bits will get merged into a sticky bit
4514 // which will avoid issues incurred by double rounding.
4515
4516 // Signed conversion is more or less like so:
4517 // copysign((__bf16)abs(SrcVal), SrcVal)
4518 SDValue SignBit;
4519 if (IsSigned) {
4520 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4521 DAG.getConstant(1ull << 63, DL, MVT::i64));
4522 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4523 }
4524 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4525 DAG.getConstant(~0xfffull, DL, MVT::i64));
4526 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4527 DAG.getConstant(0xfffull, DL, MVT::i64));
4528 SDValue Highest =
4529 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4530 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4531 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4532 SDValue ToRound =
4533 DAG.getSelectCC(DL, LHS: Highest, RHS: Zero64, True: SrcHi, False: SrcVal, Cond: ISD::SETNE);
4534 SDValue Rounded =
4535 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4536 {Op.getOperand(0), ToRound})
4537 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4538
4539 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4540 if (SignBit) {
4541 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4542 }
4543
4544 SDValue HasHighest = DAG.getSetCC(
4545 DL,
4546 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4547 Highest, Zero64, ISD::SETNE);
4548
4549 SDValue HasLo = DAG.getSetCC(
4550 DL,
4551 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4552 SrcLo, Zero64, ISD::SETNE);
4553
4554 SDValue NeedsAdjustment =
4555 DAG.getNode(Opcode: ISD::AND, DL, VT: HasLo.getValueType(), N1: HasHighest, N2: HasLo);
4556 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4557
4558 SDValue AdjustedBits =
4559 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4560 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4561 return IsStrict
4562 ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4563 {Op.getValueType(), MVT::Other},
4564 {Rounded.getValue(1), Adjusted,
4565 DAG.getIntPtrConstant(0, DL)})
4566 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4567 DAG.getIntPtrConstant(0, DL, true));
4568 }
4569 }
4570
4571 // f16 conversions are promoted to f32 when full fp16 is not supported.
4572 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4573 return IntToFpViaPromotion(MVT::f32);
4574 }
4575
4576 // i128 conversions are libcalls.
4577 if (SrcVal.getValueType() == MVT::i128)
4578 return SDValue();
4579
4580 // Other conversions are legal, unless it's to the completely software-based
4581 // fp128.
4582 if (Op.getValueType() != MVT::f128)
4583 return Op;
4584 return SDValue();
4585}
4586
4587SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4588 SelectionDAG &DAG) const {
4589 // For iOS, we want to call an alternative entry point: __sincos_stret,
4590 // which returns the values in two S / D registers.
4591 SDLoc dl(Op);
4592 SDValue Arg = Op.getOperand(i: 0);
4593 EVT ArgVT = Arg.getValueType();
4594 Type *ArgTy = ArgVT.getTypeForEVT(Context&: *DAG.getContext());
4595
4596 ArgListTy Args;
4597 ArgListEntry Entry;
4598
4599 Entry.Node = Arg;
4600 Entry.Ty = ArgTy;
4601 Entry.IsSExt = false;
4602 Entry.IsZExt = false;
4603 Args.push_back(x: Entry);
4604
4605 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4606 : RTLIB::SINCOS_STRET_F32;
4607 const char *LibcallName = getLibcallName(Call: LC);
4608 SDValue Callee =
4609 DAG.getExternalSymbol(Sym: LibcallName, VT: getPointerTy(DL: DAG.getDataLayout()));
4610
4611 StructType *RetTy = StructType::get(elt1: ArgTy, elts: ArgTy);
4612 TargetLowering::CallLoweringInfo CLI(DAG);
4613 CLI.setDebugLoc(dl)
4614 .setChain(DAG.getEntryNode())
4615 .setLibCallee(CC: CallingConv::Fast, ResultType: RetTy, Target: Callee, ArgsList: std::move(Args));
4616
4617 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4618 return CallResult.first;
4619}
4620
4621static MVT getSVEContainerType(EVT ContentTy);
4622
4623SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4624 SelectionDAG &DAG) const {
4625 EVT OpVT = Op.getValueType();
4626 EVT ArgVT = Op.getOperand(i: 0).getValueType();
4627
4628 if (useSVEForFixedLengthVectorVT(VT: OpVT))
4629 return LowerFixedLengthBitcastToSVE(Op, DAG);
4630
4631 if (OpVT.isScalableVector()) {
4632 // Bitcasting between unpacked vector types of different element counts is
4633 // not a NOP because the live elements are laid out differently.
4634 // 01234567
4635 // e.g. nxv2i32 = XX??XX??
4636 // nxv4f16 = X?X?X?X?
4637 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4638 return SDValue();
4639
4640 if (isTypeLegal(VT: OpVT) && !isTypeLegal(VT: ArgVT)) {
4641 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4642 "Expected int->fp bitcast!");
4643 SDValue ExtResult =
4644 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SDLoc(Op), VT: getSVEContainerType(ContentTy: ArgVT),
4645 Operand: Op.getOperand(i: 0));
4646 return getSVESafeBitCast(VT: OpVT, Op: ExtResult, DAG);
4647 }
4648 return getSVESafeBitCast(VT: OpVT, Op: Op.getOperand(i: 0), DAG);
4649 }
4650
4651 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4652 return SDValue();
4653
4654 // Bitcasts between f16 and bf16 are legal.
4655 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4656 return Op;
4657
4658 assert(ArgVT == MVT::i16);
4659 SDLoc DL(Op);
4660
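  // Materialize the half-precision value by extending to i32, bitcasting to
  // f32 and extracting the 16-bit 'hsub' sub-register.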
4661 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4662 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4663 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4664}
4665
4666static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4667 if (OrigVT.getSizeInBits() >= 64)
4668 return OrigVT;
4669
4670 assert(OrigVT.isSimple() && "Expecting a simple value type");
4671
4672 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4673 switch (OrigSimpleTy) {
4674 default: llvm_unreachable("Unexpected Vector Type");
4675 case MVT::v2i8:
4676 case MVT::v2i16:
4677 return MVT::v2i32;
4678 case MVT::v4i8:
4679 return MVT::v4i16;
4680 }
4681}
4682
4683static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4684 const EVT &OrigTy,
4685 const EVT &ExtTy,
4686 unsigned ExtOpcode) {
  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  // We expect ExtTy to be 128 bits total. If OrigTy is less than 64 bits we
  // need to insert a new extension so that it will be 64 bits.
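  // For example (illustrative): a v4i8 value that was extended to v4i32 is
  // re-extended to v4i16 here, giving a 64-bit operand suitable for S/UMULL.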
4690 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4691 if (OrigTy.getSizeInBits() >= 64)
4692 return N;
4693
4694 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4695 EVT NewVT = getExtensionTo64Bits(OrigVT: OrigTy);
4696
4697 return DAG.getNode(Opcode: ExtOpcode, DL: SDLoc(N), VT: NewVT, Operand: N);
4698}
4699
4700// Returns lane if Op extracts from a two-element vector and lane is constant
4701// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4702static std::optional<uint64_t>
4703getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4704 SDNode *OpNode = Op.getNode();
4705 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4706 return std::nullopt;
4707
4708 EVT VT = OpNode->getOperand(Num: 0).getValueType();
4709 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: OpNode->getOperand(Num: 1));
4710 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4711 return std::nullopt;
4712
4713 return C->getZExtValue();
4714}
4715
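// Returns true if N is a BUILD_VECTOR of constants that all fit in half the
// element width (as signed or unsigned values, per isSigned), so the build
// vector can be treated as an extended operand when forming S/UMULL.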
4716static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4717 bool isSigned) {
4718 EVT VT = N.getValueType();
4719
4720 if (N.getOpcode() != ISD::BUILD_VECTOR)
4721 return false;
4722
4723 for (const SDValue &Elt : N->op_values()) {
4724 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Elt)) {
4725 unsigned EltSize = VT.getScalarSizeInBits();
4726 unsigned HalfSize = EltSize / 2;
4727 if (isSigned) {
4728 if (!isIntN(N: HalfSize, x: C->getSExtValue()))
4729 return false;
4730 } else {
4731 if (!isUIntN(N: HalfSize, x: C->getZExtValue()))
4732 return false;
4733 }
4734 continue;
4735 }
4736 return false;
4737 }
4738
4739 return true;
4740}
4741
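// Strip the extension from a 128-bit S/UMULL operand, returning the
// corresponding 64-bit half-width vector (looking through extends, truncating
// values whose high half is known zero, and rebuilding constant BUILD_VECTORs
// at the narrower type).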
4742static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4743 EVT VT = N.getValueType();
4744 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4745
4746 unsigned NumElts = VT.getVectorNumElements();
4747 unsigned OrigEltSize = VT.getScalarSizeInBits();
4748 unsigned EltSize = OrigEltSize / 2;
4749 MVT TruncVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: EltSize), NumElements: NumElts);
4750
4751 APInt HiBits = APInt::getHighBitsSet(numBits: OrigEltSize, hiBitsSet: EltSize);
4752 if (DAG.MaskedValueIsZero(Op: N, Mask: HiBits))
4753 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: TruncVT, Operand: N);
4754
4755 if (ISD::isExtOpcode(Opcode: N.getOpcode()))
4756 return addRequiredExtensionForVectorMULL(N: N.getOperand(i: 0), DAG,
4757 OrigTy: N.getOperand(i: 0).getValueType(), ExtTy: VT,
4758 ExtOpcode: N.getOpcode());
4759
4760 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4761 SDLoc dl(N);
4762 SmallVector<SDValue, 8> Ops;
4763 for (unsigned i = 0; i != NumElts; ++i) {
4764 const APInt &CInt = N.getConstantOperandAPInt(i);
4765 // Element types smaller than 32 bits are not legal, so use i32 elements.
4766 // The values are implicitly truncated so sext vs. zext doesn't matter.
4767 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4768 }
4769 return DAG.getBuildVector(VT: TruncVT, DL: dl, Ops);
4770}
4771
4772static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
4773 return N.getOpcode() == ISD::SIGN_EXTEND ||
4774 N.getOpcode() == ISD::ANY_EXTEND ||
4775 isExtendedBUILD_VECTOR(N, DAG, isSigned: true);
4776}
4777
4778static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
4779 return N.getOpcode() == ISD::ZERO_EXTEND ||
4780 N.getOpcode() == ISD::ANY_EXTEND ||
4781 isExtendedBUILD_VECTOR(N, DAG, isSigned: false);
4782}
4783
4784static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
4785 unsigned Opcode = N.getOpcode();
4786 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4787 SDValue N0 = N.getOperand(i: 0);
4788 SDValue N1 = N.getOperand(i: 1);
4789 return N0->hasOneUse() && N1->hasOneUse() &&
4790 isSignExtended(N: N0, DAG) && isSignExtended(N: N1, DAG);
4791 }
4792 return false;
4793}
4794
4795static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
4796 unsigned Opcode = N.getOpcode();
4797 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4798 SDValue N0 = N.getOperand(i: 0);
4799 SDValue N1 = N.getOperand(i: 1);
4800 return N0->hasOneUse() && N1->hasOneUse() &&
4801 isZeroExtended(N: N0, DAG) && isZeroExtended(N: N1, DAG);
4802 }
4803 return false;
4804}
4805
4806SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4807 SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPCR.
  // The mapping from the AArch64 rounding mode value to FLT_ROUNDS is
  // 0->1, 1->2, 2->3, 3->0. The formula we use to implement this is
  // ((FPCR + (1 << 22)) >> 22) & 3, so that the shift and mask can be
  // folded into a bitfield extract.
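  // For example (illustrative): FPCR.RMode == 0b11 (round toward zero) yields
  // ((3 + 1) & 3) == 0, which is FLT_ROUNDS' encoding for round toward zero.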
4812 SDLoc dl(Op);
4813
4814 SDValue Chain = Op.getOperand(i: 0);
4815 SDValue FPCR_64 = DAG.getNode(
4816 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4817 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4818 Chain = FPCR_64.getValue(R: 1);
4819 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4820 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4821 DAG.getConstant(1U << 22, dl, MVT::i32));
4822 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4823 DAG.getConstant(22, dl, MVT::i32));
4824 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4825 DAG.getConstant(3, dl, MVT::i32));
4826 return DAG.getMergeValues(Ops: {AND, Chain}, dl);
4827}
4828
4829SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4830 SelectionDAG &DAG) const {
4831 SDLoc DL(Op);
4832 SDValue Chain = Op->getOperand(Num: 0);
4833 SDValue RMValue = Op->getOperand(Num: 1);
4834
  // The rounding mode is in bits 23:22 of the FPCR.
  // The mapping from the llvm.set.rounding argument to the rounding mode in
  // FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
  // (((arg - 1) & 3) << 22).
  //
  // The argument of llvm.set.rounding must be in the range [0, 3], so
  // NearestTiesToAway (4) is not handled here. It is the responsibility of
  // the code that generated the llvm.set.rounding call to ensure this
  // condition.
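  // For example (illustrative): an argument of 0 (round toward zero) maps to
  // ((0 - 1) & 3) == 3, the FPCR encoding for round toward zero (RZ).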
4843
4844 // Calculate new value of FPCR[23:22].
4845 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4846 DAG.getConstant(1, DL, MVT::i32));
4847 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4848 DAG.getConstant(0x3, DL, MVT::i32));
4849 RMValue =
4850 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4851 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4852 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4853
4854 // Get current value of FPCR.
4855 SDValue Ops[] = {
4856 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4857 SDValue FPCR =
4858 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4859 Chain = FPCR.getValue(R: 1);
4860 FPCR = FPCR.getValue(R: 0);
4861
  // Put the new rounding mode into FPCR[23:22].
4863 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4864 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4865 DAG.getConstant(RMMask, DL, MVT::i64));
4866 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4867 SDValue Ops2[] = {
4868 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4869 FPCR};
4870 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4871}
4872
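// Pick the widening multiply node (AArch64ISD::SMULL or UMULL) that can
// implement a MUL of N0 and N1, possibly rewriting the operands, or return 0
// if none applies. IsMLA is set when the multiply should instead be split into
// a pair of widening multiplies feeding an add/sub.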
4873static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4874 SDLoc DL, bool &IsMLA) {
4875 bool IsN0SExt = isSignExtended(N: N0, DAG);
4876 bool IsN1SExt = isSignExtended(N: N1, DAG);
4877 if (IsN0SExt && IsN1SExt)
4878 return AArch64ISD::SMULL;
4879
4880 bool IsN0ZExt = isZeroExtended(N: N0, DAG);
4881 bool IsN1ZExt = isZeroExtended(N: N1, DAG);
4882
4883 if (IsN0ZExt && IsN1ZExt)
4884 return AArch64ISD::UMULL;
4885
4886 // Select SMULL if we can replace zext with sext.
4887 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4888 !isExtendedBUILD_VECTOR(N: N0, DAG, isSigned: false) &&
4889 !isExtendedBUILD_VECTOR(N: N1, DAG, isSigned: false)) {
4890 SDValue ZextOperand;
4891 if (IsN0ZExt)
4892 ZextOperand = N0.getOperand(i: 0);
4893 else
4894 ZextOperand = N1.getOperand(i: 0);
4895 if (DAG.SignBitIsZero(Op: ZextOperand)) {
4896 SDValue NewSext =
4897 DAG.getSExtOrTrunc(Op: ZextOperand, DL, VT: N0.getValueType());
4898 if (IsN0ZExt)
4899 N0 = NewSext;
4900 else
4901 N1 = NewSext;
4902 return AArch64ISD::SMULL;
4903 }
4904 }
4905
4906 // Select UMULL if we can replace the other operand with an extend.
4907 if (IsN0ZExt || IsN1ZExt) {
4908 EVT VT = N0.getValueType();
4909 APInt Mask = APInt::getHighBitsSet(numBits: VT.getScalarSizeInBits(),
4910 hiBitsSet: VT.getScalarSizeInBits() / 2);
4911 if (DAG.MaskedValueIsZero(Op: IsN0ZExt ? N1 : N0, Mask))
4912 return AArch64ISD::UMULL;
4913 }
4914
4915 if (!IsN1SExt && !IsN1ZExt)
4916 return 0;
4917
4918 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4919 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4920 if (IsN1SExt && isAddSubSExt(N: N0, DAG)) {
4921 IsMLA = true;
4922 return AArch64ISD::SMULL;
4923 }
4924 if (IsN1ZExt && isAddSubZExt(N: N0, DAG)) {
4925 IsMLA = true;
4926 return AArch64ISD::UMULL;
4927 }
4928 if (IsN0ZExt && isAddSubZExt(N: N1, DAG)) {
4929 std::swap(a&: N0, b&: N1);
4930 IsMLA = true;
4931 return AArch64ISD::UMULL;
4932 }
4933 return 0;
4934}
4935
4936SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4937 EVT VT = Op.getValueType();
4938
4939 bool OverrideNEON = !Subtarget->isNeonAvailable();
4940 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4941 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
4942
4943 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
4944 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
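  // For example (illustrative): a v8i16 multiply whose operands are both
  // sign-extended from v8i8 is lowered to an AArch64ISD::SMULL of the two
  // v8i8 values.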
4945 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
4946 "unexpected type for custom-lowering ISD::MUL");
4947 SDValue N0 = Op.getOperand(i: 0);
4948 SDValue N1 = Op.getOperand(i: 1);
4949 bool isMLA = false;
4950 EVT OVT = VT;
4951 if (VT.is64BitVector()) {
4952 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4953 isNullConstant(V: N0.getOperand(i: 1)) &&
4954 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4955 isNullConstant(V: N1.getOperand(i: 1))) {
4956 N0 = N0.getOperand(i: 0);
4957 N1 = N1.getOperand(i: 0);
4958 VT = N0.getValueType();
4959 } else {
4960 if (VT == MVT::v1i64) {
4961 if (Subtarget->hasSVE())
4962 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
4963 // Fall through to expand this. It is not legal.
4964 return SDValue();
4965 } else
4966 // Other vector multiplications are legal.
4967 return Op;
4968 }
4969 }
4970
4971 SDLoc DL(Op);
4972 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, IsMLA&: isMLA);
4973
4974 if (!NewOpc) {
4975 if (VT.getVectorElementType() == MVT::i64) {
4976 // If SVE is available then i64 vector multiplications can also be made
4977 // legal.
4978 if (Subtarget->hasSVE())
4979 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
4980 // Fall through to expand this. It is not legal.
4981 return SDValue();
4982 } else
4983 // Other vector multiplications are legal.
4984 return Op;
4985 }
4986
4987 // Legalize to a S/UMULL instruction
4988 SDValue Op0;
4989 SDValue Op1 = skipExtensionForVectorMULL(N: N1, DAG);
4990 if (!isMLA) {
4991 Op0 = skipExtensionForVectorMULL(N: N0, DAG);
4992 assert(Op0.getValueType().is64BitVector() &&
4993 Op1.getValueType().is64BitVector() &&
4994 "unexpected types for extended operands to VMULL");
4995 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
4996 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
4997 DAG.getConstant(0, DL, MVT::i64));
4998 }
  // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
  // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
  // This holds for CPUs with accumulate forwarding such as Cortex-A53/A57.
5002 SDValue N00 = skipExtensionForVectorMULL(N: N0.getOperand(i: 0), DAG);
5003 SDValue N01 = skipExtensionForVectorMULL(N: N0.getOperand(i: 1), DAG);
5004 EVT Op1VT = Op1.getValueType();
5005 return DAG.getNode(
5006 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5007 DAG.getNode(N0.getOpcode(), DL, VT,
5008 DAG.getNode(NewOpc, DL, VT,
5009 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5010 DAG.getNode(NewOpc, DL, VT,
5011 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5012 DAG.getConstant(0, DL, MVT::i64));
5013}
5014
5015static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5016 int Pattern) {
5017 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5018 return DAG.getConstant(1, DL, MVT::nxv1i1);
5019 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5020 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5021}
5022
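// Lower an SVE while{lo,ls,lt,le} intrinsic with constant operands to a PTRUE
// with an equivalent predicate pattern, when the number of active elements is
// known and does not exceed what the minimum SVE vector length can hold.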
5023static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5024 bool IsSigned, bool IsEqual) {
5025 if (!isa<ConstantSDNode>(Val: Op.getOperand(i: 1)) ||
5026 !isa<ConstantSDNode>(Val: Op.getOperand(i: 2)))
5027 return SDValue();
5028
5029 SDLoc dl(Op);
5030 APInt X = Op.getConstantOperandAPInt(i: 1);
5031 APInt Y = Op.getConstantOperandAPInt(i: 2);
5032 bool Overflow;
5033 APInt NumActiveElems =
5034 IsSigned ? Y.ssub_ov(RHS: X, Overflow) : Y.usub_ov(RHS: X, Overflow);
5035
5036 if (Overflow)
5037 return SDValue();
5038
5039 if (IsEqual) {
5040 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5041 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(RHS: One, Overflow)
5042 : NumActiveElems.uadd_ov(RHS: One, Overflow);
5043 if (Overflow)
5044 return SDValue();
5045 }
5046
5047 std::optional<unsigned> PredPattern =
5048 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5049 unsigned MinSVEVectorSize = std::max(
5050 a: DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), b: 128u);
5051 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5052 if (PredPattern != std::nullopt &&
5053 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5054 return getPTrue(DAG, DL: dl, VT: Op.getValueType(), Pattern: *PredPattern);
5055
5056 return SDValue();
5057}
5058
5059// Returns a safe bitcast between two scalable vector predicates, where
5060// any newly created lanes from a widening bitcast are defined as zero.
5061static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5062 SDLoc DL(Op);
5063 EVT InVT = Op.getValueType();
5064
5065 assert(InVT.getVectorElementType() == MVT::i1 &&
5066 VT.getVectorElementType() == MVT::i1 &&
5067 "Expected a predicate-to-predicate bitcast");
5068 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5069 InVT.isScalableVector() &&
5070 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5071 "Only expect to cast between legal scalable predicate types!");
5072
5073 // Return the operand if the cast isn't changing type,
5074 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5075 if (InVT == VT)
5076 return Op;
5077
5078 SDValue Reinterpret = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op);
5079
5080 // We only have to zero the lanes if new lanes are being defined, e.g. when
5081 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5082 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5083 // we can return here.
5084 if (InVT.bitsGT(VT))
5085 return Reinterpret;
5086
5087 // Check if the other lanes are already known to be zeroed by
5088 // construction.
5089 if (isZeroingInactiveLanes(Op))
5090 return Reinterpret;
5091
5092 // Zero the newly introduced lanes.
5093 SDValue Mask = DAG.getConstant(Val: 1, DL, VT: InVT);
5094 Mask = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Mask);
5095 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Reinterpret, N2: Mask);
5096}
5097
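// Query the runtime value of PSTATE.SM by calling __arm_sme_state and masking
// bit 0 of the first result register.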
5098SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5099 SDValue Chain, SDLoc DL,
5100 EVT VT) const {
5101 SDValue Callee = DAG.getExternalSymbol(Sym: "__arm_sme_state",
5102 VT: getPointerTy(DL: DAG.getDataLayout()));
5103 Type *Int64Ty = Type::getInt64Ty(C&: *DAG.getContext());
5104 Type *RetTy = StructType::get(elt1: Int64Ty, elts: Int64Ty);
5105 TargetLowering::CallLoweringInfo CLI(DAG);
5106 ArgListTy Args;
5107 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5108 CC: CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5109 ResultType: RetTy, Target: Callee, ArgsList: std::move(Args));
5110 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5111 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5112 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5113 Mask);
5114}
5115
5116// Lower an SME LDR/STR ZA intrinsic
5117// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5118// folded into the instruction
5119// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5120// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5121// and tile slice registers
5122// ldr(%tileslice, %ptr, %vecnum)
5123// ->
5124// %svl = rdsvl
5125// %ptr2 = %ptr + %svl * %vecnum
5126// %tileslice2 = %tileslice + %vecnum
5127// ldr [%tileslice2, 0], [%ptr2, 0]
// Case 3: If the vecnum is an immediate out of range, then the same is done as
// case 2, but the base and slice registers are advanced by the largest
// multiple of 16 not exceeding the vecnum, and the remainder (0-15) is folded
// into the instruction. This means that successive loads and stores that are
// offset from each other can share the same base and slice register updates.
// ldr(%tileslice, %ptr, 22)
// ldr(%tileslice, %ptr, 23)
// ->
// %svl = rdsvl
// %ptr2 = %ptr + %svl * 16
// %tileslice2 = %tileslice + 16
// ldr [%tileslice2, 6], [%ptr2, 6]
// ldr [%tileslice2, 7], [%ptr2, 7]
5141// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5142// operand and the immediate can be folded into the instruction, like case 2.
5143// ldr(%tileslice, %ptr, %vecnum + 7)
5144// ldr(%tileslice, %ptr, %vecnum + 8)
5145// ->
5146// %svl = rdsvl
5147// %ptr2 = %ptr + %svl * %vecnum
5148// %tileslice2 = %tileslice + %vecnum
5149// ldr [%tileslice2, 7], [%ptr2, 7]
5150// ldr [%tileslice2, 8], [%ptr2, 8]
5151// Case 5: The vecnum being an add of an immediate out of range is also handled,
5152// in which case the same remainder logic as case 3 is used.
5153SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5154 SDLoc DL(N);
5155
5156 SDValue TileSlice = N->getOperand(Num: 2);
5157 SDValue Base = N->getOperand(Num: 3);
5158 SDValue VecNum = N->getOperand(Num: 4);
5159 int32_t ConstAddend = 0;
5160 SDValue VarAddend = VecNum;
5161
5162 // If the vnum is an add of an immediate, we can fold it into the instruction
5163 if (VecNum.getOpcode() == ISD::ADD &&
5164 isa<ConstantSDNode>(Val: VecNum.getOperand(i: 1))) {
5165 ConstAddend = cast<ConstantSDNode>(Val: VecNum.getOperand(i: 1))->getSExtValue();
5166 VarAddend = VecNum.getOperand(i: 0);
5167 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(Val&: VecNum)) {
5168 ConstAddend = ImmNode->getSExtValue();
5169 VarAddend = SDValue();
5170 }
5171
5172 int32_t ImmAddend = ConstAddend % 16;
5173 if (int32_t C = (ConstAddend - ImmAddend)) {
5174 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5175 VarAddend = VarAddend
5176 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5177 : CVal;
5178 }
5179
5180 if (VarAddend) {
5181 // Get the vector length that will be multiplied by vnum
5182 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5183 DAG.getConstant(1, DL, MVT::i32));
5184
5185 // Multiply SVL and vnum then add it to the base
5186 SDValue Mul = DAG.getNode(
5187 ISD::MUL, DL, MVT::i64,
5188 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5189 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5190 // Just add vnum to the tileslice
5191 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5192 }
5193
5194 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5195 DL, MVT::Other,
5196 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5197 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5198}
5199
5200SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5201 SelectionDAG &DAG) const {
5202 unsigned IntNo = Op.getConstantOperandVal(i: 1);
5203 SDLoc DL(Op);
5204 switch (IntNo) {
5205 default:
5206 return SDValue(); // Don't custom lower most intrinsics.
5207 case Intrinsic::aarch64_prefetch: {
5208 SDValue Chain = Op.getOperand(i: 0);
5209 SDValue Addr = Op.getOperand(i: 2);
5210
5211 unsigned IsWrite = Op.getConstantOperandVal(i: 3);
5212 unsigned Locality = Op.getConstantOperandVal(i: 4);
5213 unsigned IsStream = Op.getConstantOperandVal(i: 5);
5214 unsigned IsData = Op.getConstantOperandVal(i: 6);
5215 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5216 (!IsData << 3) | // IsDataCache bit
5217 (Locality << 1) | // Cache level bits
5218 (unsigned)IsStream; // Stream bit
5219
5220 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5221 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5222 }
5223 case Intrinsic::aarch64_sme_str:
5224 case Intrinsic::aarch64_sme_ldr: {
5225 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5226 }
5227 case Intrinsic::aarch64_sme_za_enable:
5228 return DAG.getNode(
5229 AArch64ISD::SMSTART, DL, MVT::Other,
5230 Op->getOperand(0), // Chain
5231 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5232 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5233 case Intrinsic::aarch64_sme_za_disable:
5234 return DAG.getNode(
5235 AArch64ISD::SMSTOP, DL, MVT::Other,
5236 Op->getOperand(0), // Chain
5237 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5238 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5239 }
5240}
5241
5242SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5243 SelectionDAG &DAG) const {
5244 unsigned IntNo = Op.getConstantOperandVal(i: 1);
5245 SDLoc DL(Op);
5246 switch (IntNo) {
5247 default:
5248 return SDValue(); // Don't custom lower most intrinsics.
5249 case Intrinsic::aarch64_mops_memset_tag: {
5250 auto Node = cast<MemIntrinsicSDNode>(Val: Op.getNode());
5251 SDValue Chain = Node->getChain();
5252 SDValue Dst = Op.getOperand(i: 2);
5253 SDValue Val = Op.getOperand(i: 3);
5254 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5255 SDValue Size = Op.getOperand(i: 4);
5256 auto Alignment = Node->getMemOperand()->getAlign();
5257 bool IsVol = Node->isVolatile();
5258 auto DstPtrInfo = Node->getPointerInfo();
5259
5260 const auto &SDI =
5261 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5262 SDValue MS =
5263 SDI.EmitMOPS(SDOpcode: AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, SrcOrValue: Val,
5264 Size, Alignment, isVolatile: IsVol, DstPtrInfo, SrcPtrInfo: MachinePointerInfo{});
5265
5266 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5267 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5268 // LowerOperationWrapper will complain that the number of results has
5269 // changed.
5270 return DAG.getMergeValues(Ops: {MS.getValue(R: 0), MS.getValue(R: 2)}, dl: DL);
5271 }
5272 }
5273}
5274
5275SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5276 SelectionDAG &DAG) const {
5277 unsigned IntNo = Op.getConstantOperandVal(i: 0);
5278 SDLoc dl(Op);
5279 switch (IntNo) {
5280 default: return SDValue(); // Don't custom lower most intrinsics.
5281 case Intrinsic::thread_pointer: {
5282 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
5283 return DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL: dl, VT: PtrVT);
5284 }
5285 case Intrinsic::aarch64_neon_abs: {
5286 EVT Ty = Op.getValueType();
5287 if (Ty == MVT::i64) {
5288 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5289 Op.getOperand(1));
5290 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5291 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5292 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(VT: Ty)) {
5293 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: Ty, Operand: Op.getOperand(i: 1));
5294 } else {
      report_fatal_error(reason: "Unexpected type for AArch64 NEON intrinsic");
5296 }
5297 }
5298 case Intrinsic::aarch64_neon_pmull64: {
5299 SDValue LHS = Op.getOperand(i: 1);
5300 SDValue RHS = Op.getOperand(i: 2);
5301
5302 std::optional<uint64_t> LHSLane =
5303 getConstantLaneNumOfExtractHalfOperand(Op&: LHS);
5304 std::optional<uint64_t> RHSLane =
5305 getConstantLaneNumOfExtractHalfOperand(Op&: RHS);
5306
5307 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5308 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5309
    // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
    // instructions execute on SIMD registers, so canonicalize i64 to v1i64,
    // which ISel recognizes better. For example, this generates a ldr into d*
    // registers rather than a GPR load followed by a fmov.
5314 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5315 std::optional<uint64_t> OtherLane,
5316 const SDLoc &dl,
5317 SelectionDAG &DAG) -> SDValue {
      // If the operand is a higher half itself, rewrite it to
      // extract_high_v2i64; this way aarch64_neon_pmull64 can re-use the
      // dag-combiner function shared with aarch64_neon_{pmull,smull,umull}.
5321 if (NLane && *NLane == 1)
5322 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5323 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5324
5325 // Operand N is not a higher half but the other operand is.
5326 if (OtherLane && *OtherLane == 1) {
5327 // If this operand is a lower half, rewrite it to
5328 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5329 // align lanes of two operands. A roundtrip sequence (to move from lane
5330 // 1 to lane 0) is like this:
5331 // mov x8, v0.d[1]
5332 // fmov d0, x8
5333 if (NLane && *NLane == 0)
5334 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5335 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5336 N.getOperand(0),
5337 DAG.getConstant(0, dl, MVT::i64)),
5338 DAG.getConstant(1, dl, MVT::i64));
5339
5340 // Otherwise just dup from main to all lanes.
5341 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5342 }
5343
      // Neither operand is an extract of the higher half, so codegen may just
      // use the non-high version of the PMULL instruction. Use v1i64 to
      // represent i64.
5346 assert(N.getValueType() == MVT::i64 &&
5347 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5348 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5349 };
5350
5351 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5352 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5353
5354 return DAG.getNode(Opcode: AArch64ISD::PMULL, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
5355 }
5356 case Intrinsic::aarch64_neon_smax:
5357 return DAG.getNode(Opcode: ISD::SMAX, DL: dl, VT: Op.getValueType(),
5358 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5359 case Intrinsic::aarch64_neon_umax:
5360 return DAG.getNode(Opcode: ISD::UMAX, DL: dl, VT: Op.getValueType(),
5361 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5362 case Intrinsic::aarch64_neon_smin:
5363 return DAG.getNode(Opcode: ISD::SMIN, DL: dl, VT: Op.getValueType(),
5364 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5365 case Intrinsic::aarch64_neon_umin:
5366 return DAG.getNode(Opcode: ISD::UMIN, DL: dl, VT: Op.getValueType(),
5367 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5368 case Intrinsic::aarch64_neon_scalar_sqxtn:
5369 case Intrinsic::aarch64_neon_scalar_sqxtun:
5370 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5371 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5372 if (Op.getValueType() == MVT::i32)
5373 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5374 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5375 Op.getOperand(0),
5376 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5377 Op.getOperand(1))));
5378 return SDValue();
5379 }
5380 case Intrinsic::aarch64_sve_whilelo:
5381 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5382 /*IsEqual=*/false);
5383 case Intrinsic::aarch64_sve_whilelt:
5384 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5385 /*IsEqual=*/false);
5386 case Intrinsic::aarch64_sve_whilels:
5387 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5388 /*IsEqual=*/true);
5389 case Intrinsic::aarch64_sve_whilele:
5390 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5391 /*IsEqual=*/true);
5392 case Intrinsic::aarch64_sve_sunpkhi:
5393 return DAG.getNode(Opcode: AArch64ISD::SUNPKHI, DL: dl, VT: Op.getValueType(),
5394 Operand: Op.getOperand(i: 1));
5395 case Intrinsic::aarch64_sve_sunpklo:
5396 return DAG.getNode(Opcode: AArch64ISD::SUNPKLO, DL: dl, VT: Op.getValueType(),
5397 Operand: Op.getOperand(i: 1));
5398 case Intrinsic::aarch64_sve_uunpkhi:
5399 return DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL: dl, VT: Op.getValueType(),
5400 Operand: Op.getOperand(i: 1));
5401 case Intrinsic::aarch64_sve_uunpklo:
5402 return DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL: dl, VT: Op.getValueType(),
5403 Operand: Op.getOperand(i: 1));
5404 case Intrinsic::aarch64_sve_clasta_n:
5405 return DAG.getNode(Opcode: AArch64ISD::CLASTA_N, DL: dl, VT: Op.getValueType(),
5406 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
5407 case Intrinsic::aarch64_sve_clastb_n:
5408 return DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL: dl, VT: Op.getValueType(),
5409 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
5410 case Intrinsic::aarch64_sve_lasta:
5411 return DAG.getNode(Opcode: AArch64ISD::LASTA, DL: dl, VT: Op.getValueType(),
5412 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5413 case Intrinsic::aarch64_sve_lastb:
5414 return DAG.getNode(Opcode: AArch64ISD::LASTB, DL: dl, VT: Op.getValueType(),
5415 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5416 case Intrinsic::aarch64_sve_rev:
5417 return DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL: dl, VT: Op.getValueType(),
5418 Operand: Op.getOperand(i: 1));
5419 case Intrinsic::aarch64_sve_tbl:
5420 return DAG.getNode(Opcode: AArch64ISD::TBL, DL: dl, VT: Op.getValueType(),
5421 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5422 case Intrinsic::aarch64_sve_trn1:
5423 return DAG.getNode(Opcode: AArch64ISD::TRN1, DL: dl, VT: Op.getValueType(),
5424 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5425 case Intrinsic::aarch64_sve_trn2:
5426 return DAG.getNode(Opcode: AArch64ISD::TRN2, DL: dl, VT: Op.getValueType(),
5427 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5428 case Intrinsic::aarch64_sve_uzp1:
5429 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT: Op.getValueType(),
5430 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5431 case Intrinsic::aarch64_sve_uzp2:
5432 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VT: Op.getValueType(),
5433 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5434 case Intrinsic::aarch64_sve_zip1:
5435 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT: Op.getValueType(),
5436 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5437 case Intrinsic::aarch64_sve_zip2:
5438 return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL: dl, VT: Op.getValueType(),
5439 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5440 case Intrinsic::aarch64_sve_splice:
5441 return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL: dl, VT: Op.getValueType(),
5442 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
5443 case Intrinsic::aarch64_sve_ptrue:
5444 return getPTrue(DAG, DL: dl, VT: Op.getValueType(), Pattern: Op.getConstantOperandVal(i: 1));
5445 case Intrinsic::aarch64_sve_clz:
5446 return DAG.getNode(Opcode: AArch64ISD::CTLZ_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5447 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
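  // The SME counting intrinsics below all start from RDSVL #1, i.e. the
  // streaming vector length in bytes; halfword, word and doubleword counts
  // are formed by shifting that byte count right by 1, 2 and 3 respectively.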
5448 case Intrinsic::aarch64_sme_cntsb:
5449 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5450 DAG.getConstant(1, dl, MVT::i32));
5451 case Intrinsic::aarch64_sme_cntsh: {
5452 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5453 SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL: dl, VT: Op.getValueType(), Operand: One);
5454 return DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: Op.getValueType(), N1: Bytes, N2: One);
5455 }
5456 case Intrinsic::aarch64_sme_cntsw: {
5457 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5458 DAG.getConstant(1, dl, MVT::i32));
5459 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5460 DAG.getConstant(2, dl, MVT::i32));
5461 }
5462 case Intrinsic::aarch64_sme_cntsd: {
5463 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5464 DAG.getConstant(1, dl, MVT::i32));
5465 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5466 DAG.getConstant(3, dl, MVT::i32));
5467 }
5468 case Intrinsic::aarch64_sve_cnt: {
5469 SDValue Data = Op.getOperand(i: 3);
5470 // CTPOP only supports integer operands.
5471 if (Data.getValueType().isFloatingPoint())
5472 Data = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Data);
5473 return DAG.getNode(Opcode: AArch64ISD::CTPOP_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5474 N1: Op.getOperand(i: 2), N2: Data, N3: Op.getOperand(i: 1));
5475 }
5476 case Intrinsic::aarch64_sve_dupq_lane:
5477 return LowerDUPQLane(Op, DAG);
5478 case Intrinsic::aarch64_sve_convert_from_svbool:
5479 if (Op.getValueType() == MVT::aarch64svcount)
5480 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Op.getOperand(i: 1));
5481 return getSVEPredicateBitCast(VT: Op.getValueType(), Op: Op.getOperand(i: 1), DAG);
5482 case Intrinsic::aarch64_sve_convert_to_svbool:
5483 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5484 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5485 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5486 case Intrinsic::aarch64_sve_fneg:
5487 return DAG.getNode(Opcode: AArch64ISD::FNEG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5488 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5489 case Intrinsic::aarch64_sve_frintp:
5490 return DAG.getNode(Opcode: AArch64ISD::FCEIL_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5491 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5492 case Intrinsic::aarch64_sve_frintm:
5493 return DAG.getNode(Opcode: AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5494 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5495 case Intrinsic::aarch64_sve_frinti:
5496 return DAG.getNode(Opcode: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5497 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5498 case Intrinsic::aarch64_sve_frintx:
5499 return DAG.getNode(Opcode: AArch64ISD::FRINT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5500 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5501 case Intrinsic::aarch64_sve_frinta:
5502 return DAG.getNode(Opcode: AArch64ISD::FROUND_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5503 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5504 case Intrinsic::aarch64_sve_frintn:
5505 return DAG.getNode(Opcode: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5506 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5507 case Intrinsic::aarch64_sve_frintz:
5508 return DAG.getNode(Opcode: AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5509 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5510 case Intrinsic::aarch64_sve_ucvtf:
5511 return DAG.getNode(Opcode: AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL: dl,
5512 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5513 N3: Op.getOperand(i: 1));
5514 case Intrinsic::aarch64_sve_scvtf:
5515 return DAG.getNode(Opcode: AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL: dl,
5516 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5517 N3: Op.getOperand(i: 1));
5518 case Intrinsic::aarch64_sve_fcvtzu:
5519 return DAG.getNode(Opcode: AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL: dl,
5520 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5521 N3: Op.getOperand(i: 1));
5522 case Intrinsic::aarch64_sve_fcvtzs:
5523 return DAG.getNode(Opcode: AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL: dl,
5524 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5525 N3: Op.getOperand(i: 1));
5526 case Intrinsic::aarch64_sve_fsqrt:
5527 return DAG.getNode(Opcode: AArch64ISD::FSQRT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5528 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5529 case Intrinsic::aarch64_sve_frecpx:
5530 return DAG.getNode(Opcode: AArch64ISD::FRECPX_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5531 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5532 case Intrinsic::aarch64_sve_frecpe_x:
5533 return DAG.getNode(Opcode: AArch64ISD::FRECPE, DL: dl, VT: Op.getValueType(),
5534 Operand: Op.getOperand(i: 1));
5535 case Intrinsic::aarch64_sve_frecps_x:
5536 return DAG.getNode(Opcode: AArch64ISD::FRECPS, DL: dl, VT: Op.getValueType(),
5537 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5538 case Intrinsic::aarch64_sve_frsqrte_x:
5539 return DAG.getNode(Opcode: AArch64ISD::FRSQRTE, DL: dl, VT: Op.getValueType(),
5540 Operand: Op.getOperand(i: 1));
5541 case Intrinsic::aarch64_sve_frsqrts_x:
5542 return DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL: dl, VT: Op.getValueType(),
5543 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5544 case Intrinsic::aarch64_sve_fabs:
5545 return DAG.getNode(Opcode: AArch64ISD::FABS_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5546 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5547 case Intrinsic::aarch64_sve_abs:
5548 return DAG.getNode(Opcode: AArch64ISD::ABS_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5549 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5550 case Intrinsic::aarch64_sve_neg:
5551 return DAG.getNode(Opcode: AArch64ISD::NEG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5552 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5553 case Intrinsic::aarch64_sve_insr: {
5554 SDValue Scalar = Op.getOperand(i: 2);
5555 EVT ScalarTy = Scalar.getValueType();
5556 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5557 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5558
5559 return DAG.getNode(Opcode: AArch64ISD::INSR, DL: dl, VT: Op.getValueType(),
5560 N1: Op.getOperand(i: 1), N2: Scalar);
5561 }
5562 case Intrinsic::aarch64_sve_rbit:
5563 return DAG.getNode(Opcode: AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL: dl,
5564 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5565 N3: Op.getOperand(i: 1));
5566 case Intrinsic::aarch64_sve_revb:
5567 return DAG.getNode(Opcode: AArch64ISD::BSWAP_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5568 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5569 case Intrinsic::aarch64_sve_revh:
5570 return DAG.getNode(Opcode: AArch64ISD::REVH_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5571 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5572 case Intrinsic::aarch64_sve_revw:
5573 return DAG.getNode(Opcode: AArch64ISD::REVW_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5574 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5575 case Intrinsic::aarch64_sve_revd:
5576 return DAG.getNode(Opcode: AArch64ISD::REVD_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5577 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5578 case Intrinsic::aarch64_sve_sxtb:
5579 return DAG.getNode(
5580 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5581 Op.getOperand(2), Op.getOperand(3),
5582 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5583 Op.getOperand(1));
5584 case Intrinsic::aarch64_sve_sxth:
5585 return DAG.getNode(
5586 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5587 Op.getOperand(2), Op.getOperand(3),
5588 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5589 Op.getOperand(1));
5590 case Intrinsic::aarch64_sve_sxtw:
5591 return DAG.getNode(
5592 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5593 Op.getOperand(2), Op.getOperand(3),
5594 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5595 Op.getOperand(1));
5596 case Intrinsic::aarch64_sve_uxtb:
5597 return DAG.getNode(
5598 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5599 Op.getOperand(2), Op.getOperand(3),
5600 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5601 Op.getOperand(1));
5602 case Intrinsic::aarch64_sve_uxth:
5603 return DAG.getNode(
5604 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5605 Op.getOperand(2), Op.getOperand(3),
5606 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5607 Op.getOperand(1));
5608 case Intrinsic::aarch64_sve_uxtw:
5609 return DAG.getNode(
5610 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5611 Op.getOperand(2), Op.getOperand(3),
5612 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5613 Op.getOperand(1));
5614 case Intrinsic::localaddress: {
5615 const auto &MF = DAG.getMachineFunction();
5616 const auto *RegInfo = Subtarget->getRegisterInfo();
5617 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5618 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl, Reg,
5619 VT: Op.getSimpleValueType());
5620 }
5621
5622 case Intrinsic::eh_recoverfp: {
5623 // FIXME: This needs to be implemented to correctly handle highly aligned
5624 // stack objects. For now we simply return the incoming FP. Refer D53541
5625 // for more details.
5626 SDValue FnOp = Op.getOperand(i: 1);
5627 SDValue IncomingFPOp = Op.getOperand(i: 2);
5628 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: FnOp);
5629 auto *Fn = dyn_cast_or_null<Function>(Val: GSD ? GSD->getGlobal() : nullptr);
5630 if (!Fn)
5631 report_fatal_error(
5632 reason: "llvm.eh.recoverfp must take a function as the first argument");
5633 return IncomingFPOp;
5634 }
5635
5636 case Intrinsic::aarch64_neon_vsri:
5637 case Intrinsic::aarch64_neon_vsli:
5638 case Intrinsic::aarch64_sve_sri:
5639 case Intrinsic::aarch64_sve_sli: {
5640 EVT Ty = Op.getValueType();
5641
5642 if (!Ty.isVector())
5643 report_fatal_error(reason: "Unexpected type for aarch64_neon_vsli");
5644
5645 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5646
5647 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5648 IntNo == Intrinsic::aarch64_sve_sri;
5649 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5650 return DAG.getNode(Opcode, DL: dl, VT: Ty, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2),
5651 N3: Op.getOperand(i: 3));
5652 }
5653
5654 case Intrinsic::aarch64_neon_srhadd:
5655 case Intrinsic::aarch64_neon_urhadd:
5656 case Intrinsic::aarch64_neon_shadd:
5657 case Intrinsic::aarch64_neon_uhadd: {
5658 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5659 IntNo == Intrinsic::aarch64_neon_shadd);
5660 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5661 IntNo == Intrinsic::aarch64_neon_urhadd);
5662 unsigned Opcode = IsSignedAdd
5663 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5664 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5665 return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5666 N2: Op.getOperand(i: 2));
5667 }
5668 case Intrinsic::aarch64_neon_saddlp:
5669 case Intrinsic::aarch64_neon_uaddlp: {
5670 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5671 ? AArch64ISD::UADDLP
5672 : AArch64ISD::SADDLP;
5673 return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), Operand: Op.getOperand(i: 1));
5674 }
5675 case Intrinsic::aarch64_neon_sdot:
5676 case Intrinsic::aarch64_neon_udot:
5677 case Intrinsic::aarch64_sve_sdot:
5678 case Intrinsic::aarch64_sve_udot: {
5679 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5680 IntNo == Intrinsic::aarch64_sve_udot)
5681 ? AArch64ISD::UDOT
5682 : AArch64ISD::SDOT;
5683 return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5684 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
5685 }
5686 case Intrinsic::get_active_lane_mask: {
5687 SDValue ID =
5688 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5689 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: Op.getValueType(), N1: ID,
5690 N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2));
5691 }
5692 case Intrinsic::aarch64_neon_uaddlv: {
5693 EVT OpVT = Op.getOperand(i: 1).getValueType();
5694 EVT ResVT = Op.getValueType();
5695 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5696 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
      // In order to avoid insert_subvector, use v4i32 rather than v2i32.
5698 SDValue UADDLV =
5699 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
5700 SDValue EXTRACT_VEC_ELT =
5701 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
5702 DAG.getConstant(0, dl, MVT::i64));
5703 return EXTRACT_VEC_ELT;
5704 }
5705 return SDValue();
5706 }
5707 case Intrinsic::experimental_cttz_elts: {
5708 SDValue NewCttzElts =
5709 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5710
5711 return DAG.getZExtOrTrunc(Op: NewCttzElts, DL: dl, VT: Op.getValueType());
5712 }
5713 }
5714}
5715
5716bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5717 if (VT.getVectorElementType() == MVT::i8 ||
5718 VT.getVectorElementType() == MVT::i16) {
5719 EltTy = MVT::i32;
5720 return true;
5721 }
5722 return false;
5723}
5724
5725bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5726 EVT DataVT) const {
5727 const EVT IndexVT = Extend.getOperand(i: 0).getValueType();
5728 // SVE only supports implicit extension of 32-bit indices.
5729 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5730 return false;
5731
5732 // Indices cannot be smaller than the main data type.
5733 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5734 return false;
5735
5736 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5737 // element container type, which would violate the previous clause.
5738 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5739}
5740
5741bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5742 EVT ExtVT = ExtVal.getValueType();
5743 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5744 return false;
5745
5746 // It may be worth creating extending masked loads if there are multiple
5747 // masked loads using the same predicate. That way we'll end up creating
5748 // extending masked loads that may then get split by the legaliser. This
5749 // results in just one set of predicate unpacks at the start, instead of
5750 // multiple sets of vector unpacks after each load.
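  // A rough illustration (hypothetical IR): two llvm.masked.load calls that
  // share the same predicate and are each followed by a zext. Turning them
  // into extending masked loads means the predicate is unpacked once up
  // front instead of unpacking every loaded vector afterwards.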
5751 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(Val: ExtVal->getOperand(Num: 0))) {
5752 if (!isLoadExtLegalOrCustom(ExtType: ISD::ZEXTLOAD, ValVT: ExtVT, MemVT: Ld->getValueType(ResNo: 0))) {
5753 // Disable extending masked loads for fixed-width for now, since the code
5754 // quality doesn't look great.
5755 if (!ExtVT.isScalableVector())
5756 return false;
5757
5758 unsigned NumExtMaskedLoads = 0;
5759 for (auto *U : Ld->getMask()->uses())
5760 if (isa<MaskedLoadSDNode>(Val: U))
5761 NumExtMaskedLoads++;
5762
5763 if (NumExtMaskedLoads <= 1)
5764 return false;
5765 }
5766 }
5767
5768 return true;
5769}
5770
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
       AArch64ISD::GLD1_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
       AArch64ISD::GLD1_UXTW_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
       AArch64ISD::GLD1_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
       AArch64ISD::GLD1_SXTW_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
  };
  auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
  return AddrModes.find(Key)->second;
}
5793
5794unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5795 switch (Opcode) {
5796 default:
5797 llvm_unreachable("unimplemented opcode");
5798 return Opcode;
5799 case AArch64ISD::GLD1_MERGE_ZERO:
5800 return AArch64ISD::GLD1S_MERGE_ZERO;
5801 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
5802 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
5803 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
5804 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
5805 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
5806 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
5807 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
5808 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
5809 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
5810 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
5811 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
5812 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
5813 }
5814}
5815
5816SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5817 SelectionDAG &DAG) const {
5818 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Val&: Op);
5819
5820 SDLoc DL(Op);
5821 SDValue Chain = MGT->getChain();
5822 SDValue PassThru = MGT->getPassThru();
5823 SDValue Mask = MGT->getMask();
5824 SDValue BasePtr = MGT->getBasePtr();
5825 SDValue Index = MGT->getIndex();
5826 SDValue Scale = MGT->getScale();
5827 EVT VT = Op.getValueType();
5828 EVT MemVT = MGT->getMemoryVT();
5829 ISD::LoadExtType ExtType = MGT->getExtensionType();
5830 ISD::MemIndexType IndexType = MGT->getIndexType();
5831
  // SVE supports zero (and so undef) passthrough values only; everything else
  // must be handled manually by an explicit select on the load's output.
  if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
    SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
    SDValue Load =
        DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
                            MGT->getMemOperand(), IndexType, ExtType);
    SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
    return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
  }
5842
5843 bool IsScaled = MGT->isIndexScaled();
5844 bool IsSigned = MGT->isIndexSigned();
5845
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
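  // For example, a gather of i64 elements with a scale of 16 takes this
  // path: the index vector is shifted left by 4 and the scale is reset to 1.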
5848 uint64_t ScaleVal = Scale->getAsZExtVal();
5849 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5850 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5851 EVT IndexVT = Index.getValueType();
5852 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index,
5853 N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT));
5854 Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType());
5855
5856 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5857 return DAG.getMaskedGather(VTs: MGT->getVTList(), MemVT, dl: DL, Ops,
5858 MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType);
5859 }
5860
5861 // Lower fixed length gather to a scalable equivalent.
5862 if (VT.isFixedLengthVector()) {
5863 assert(Subtarget->useSVEForFixedLengthVectors() &&
5864 "Cannot lower when not using SVE for fixed vectors!");
5865
5866 // NOTE: Handle floating-point as if integer then bitcast the result.
5867 EVT DataVT = VT.changeVectorElementTypeToInteger();
5868 MemVT = MemVT.changeVectorElementTypeToInteger();
5869
5870 // Find the smallest integer fixed length vector we can use for the gather.
5871 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5872 if (DataVT.getVectorElementType() == MVT::i64 ||
5873 Index.getValueType().getVectorElementType() == MVT::i64 ||
5874 Mask.getValueType().getVectorElementType() == MVT::i64)
5875 PromotedVT = VT.changeVectorElementType(MVT::i64);
5876
5877 // Promote vector operands except for passthrough, which we know is either
5878 // undef or zero, and thus best constructed directly.
5879 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5880 Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index);
5881 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask);
5882
5883 // A promoted result type forces the need for an extending load.
5884 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5885 ExtType = ISD::EXTLOAD;
5886
5887 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT);
5888
5889 // Convert fixed length vector operands to scalable.
5890 MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType());
5891 Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index);
5892 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5893 PassThru = PassThru->isUndef() ? DAG.getUNDEF(VT: ContainerVT)
5894 : DAG.getConstant(Val: 0, DL, VT: ContainerVT);
5895
5896 // Emit equivalent scalable vector gather.
5897 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5898 SDValue Load =
5899 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5900 Ops, MGT->getMemOperand(), IndexType, ExtType);
5901
5902 // Extract fixed length data then convert to the required result type.
5903 SDValue Result = convertFromScalableVector(DAG, VT: PromotedVT, V: Load);
5904 Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DataVT, Operand: Result);
5905 if (VT.isFloatingPoint())
5906 Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Result);
5907
5908 return DAG.getMergeValues(Ops: {Result, Load.getValue(R: 1)}, dl: DL);
5909 }
5910
5911 // Everything else is legal.
5912 return Op;
5913}
5914
5915SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5916 SelectionDAG &DAG) const {
5917 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Val&: Op);
5918
5919 SDLoc DL(Op);
5920 SDValue Chain = MSC->getChain();
5921 SDValue StoreVal = MSC->getValue();
5922 SDValue Mask = MSC->getMask();
5923 SDValue BasePtr = MSC->getBasePtr();
5924 SDValue Index = MSC->getIndex();
5925 SDValue Scale = MSC->getScale();
5926 EVT VT = StoreVal.getValueType();
5927 EVT MemVT = MSC->getMemoryVT();
5928 ISD::MemIndexType IndexType = MSC->getIndexType();
5929 bool Truncating = MSC->isTruncatingStore();
5930
5931 bool IsScaled = MSC->isIndexScaled();
5932 bool IsSigned = MSC->isIndexSigned();
5933
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
5936 uint64_t ScaleVal = Scale->getAsZExtVal();
5937 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5938 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5939 EVT IndexVT = Index.getValueType();
5940 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index,
5941 N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT));
5942 Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType());
5943
5944 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5945 return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops,
5946 MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating);
5947 }
5948
5949 // Lower fixed length scatter to a scalable equivalent.
5950 if (VT.isFixedLengthVector()) {
5951 assert(Subtarget->useSVEForFixedLengthVectors() &&
5952 "Cannot lower when not using SVE for fixed vectors!");
5953
5954 // Once bitcast we treat floating-point scatters as if integer.
5955 if (VT.isFloatingPoint()) {
5956 VT = VT.changeVectorElementTypeToInteger();
5957 MemVT = MemVT.changeVectorElementTypeToInteger();
5958 StoreVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: StoreVal);
5959 }
5960
5961 // Find the smallest integer fixed length vector we can use for the scatter.
5962 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5963 if (VT.getVectorElementType() == MVT::i64 ||
5964 Index.getValueType().getVectorElementType() == MVT::i64 ||
5965 Mask.getValueType().getVectorElementType() == MVT::i64)
5966 PromotedVT = VT.changeVectorElementType(MVT::i64);
5967
5968 // Promote vector operands.
5969 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5970 Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index);
5971 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask);
5972 StoreVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: PromotedVT, Operand: StoreVal);
5973
5974 // A promoted value type forces the need for a truncating store.
5975 if (PromotedVT != VT)
5976 Truncating = true;
5977
5978 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT);
5979
5980 // Convert fixed length vector operands to scalable.
5981 MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType());
5982 Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index);
5983 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5984 StoreVal = convertToScalableVector(DAG, VT: ContainerVT, V: StoreVal);
5985
5986 // Emit equivalent scalable vector scatter.
5987 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5988 return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops,
5989 MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating);
5990 }
5991
5992 // Everything else is legal.
5993 return Op;
5994}
5995
SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
  assert(LoadNode && "Expected custom lowering of a masked load node");
  EVT VT = Op->getValueType(0);

  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
    return LowerFixedLengthVectorMLoadToSVE(Op, DAG);

  SDValue PassThru = LoadNode->getPassThru();
  SDValue Mask = LoadNode->getMask();

  if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
    return Op;

  SDValue Load = DAG.getMaskedLoad(
      VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
      LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
      LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
      LoadNode->getExtensionType());

  SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);

  return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
}
6021
6022// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6023static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6024 EVT VT, EVT MemVT,
6025 SelectionDAG &DAG) {
6026 assert(VT.isVector() && "VT should be a vector type");
6027 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6028
6029 SDValue Value = ST->getValue();
6030
  // First extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
  // the word lane that represents the v4i8 subvector. This optimizes the
  // store to:
6034 //
6035 // xtn v0.8b, v0.8h
6036 // str s0, [x0]
6037
6038 SDValue Undef = DAG.getUNDEF(MVT::i16);
6039 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6040 {Undef, Undef, Undef, Undef});
6041
6042 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6043 Value, UndefVec);
6044 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6045
6046 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6047 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6048 Trunc, DAG.getConstant(0, DL, MVT::i64));
6049
6050 return DAG.getStore(Chain: ST->getChain(), dl: DL, Val: ExtractTrunc,
6051 Ptr: ST->getBasePtr(), MMO: ST->getMemOperand());
6052}
6053
// Custom lowering for any store, vector or scalar, with or without a
// truncate operation. Currently we only custom lower truncating stores from
// vector v4i16 to v4i8 and volatile stores of i128.
6057SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6058 SelectionDAG &DAG) const {
6059 SDLoc Dl(Op);
6060 StoreSDNode *StoreNode = cast<StoreSDNode>(Val&: Op);
6061 assert (StoreNode && "Can only custom lower store nodes");
6062
6063 SDValue Value = StoreNode->getValue();
6064
6065 EVT VT = Value.getValueType();
6066 EVT MemVT = StoreNode->getMemoryVT();
6067
6068 if (VT.isVector()) {
6069 if (useSVEForFixedLengthVectorVT(
6070 VT,
6071 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6072 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6073
6074 unsigned AS = StoreNode->getAddressSpace();
6075 Align Alignment = StoreNode->getAlign();
6076 if (Alignment < MemVT.getStoreSize() &&
6077 !allowsMisalignedMemoryAccesses(VT: MemVT, AddrSpace: AS, Alignment,
6078 Flags: StoreNode->getMemOperand()->getFlags(),
6079 Fast: nullptr)) {
6080 return scalarizeVectorStore(ST: StoreNode, DAG);
6081 }
6082
6083 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6084 MemVT == MVT::v4i8) {
6085 return LowerTruncateVectorStore(DL: Dl, ST: StoreNode, VT, MemVT, DAG);
6086 }
6087 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6088 // the custom lowering, as there are no un-paired non-temporal stores and
6089 // legalization will break up 256 bit inputs.
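    // For example, a non-temporal store of v8i32 is split into two v4i32
    // halves and emitted as a single paired store, e.g. "stnp q0, q1, [x0]".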
6090 ElementCount EC = MemVT.getVectorElementCount();
6091 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6092 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6093 (MemVT.getScalarSizeInBits() == 8u ||
6094 MemVT.getScalarSizeInBits() == 16u ||
6095 MemVT.getScalarSizeInBits() == 32u ||
6096 MemVT.getScalarSizeInBits() == 64u)) {
6097 SDValue Lo =
6098 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6099 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6100 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6101 SDValue Hi =
6102 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6103 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6104 StoreNode->getValue(),
6105 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6106 SDValue Result = DAG.getMemIntrinsicNode(
6107 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6108 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6109 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6110 return Result;
6111 }
6112 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6113 return LowerStore128(Op, DAG);
6114 } else if (MemVT == MVT::i64x8) {
6115 SDValue Value = StoreNode->getValue();
6116 assert(Value->getValueType(0) == MVT::i64x8);
6117 SDValue Chain = StoreNode->getChain();
6118 SDValue Base = StoreNode->getBasePtr();
6119 EVT PtrVT = Base.getValueType();
6120 for (unsigned i = 0; i < 8; i++) {
6121 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6122 Value, DAG.getConstant(i, Dl, MVT::i32));
6123 SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL: Dl, VT: PtrVT, N1: Base,
6124 N2: DAG.getConstant(Val: i * 8, DL: Dl, VT: PtrVT));
6125 Chain = DAG.getStore(Chain, dl: Dl, Val: Part, Ptr, PtrInfo: StoreNode->getPointerInfo(),
6126 Alignment: StoreNode->getOriginalAlign());
6127 }
6128 return Chain;
6129 }
6130
6131 return SDValue();
6132}
6133
6134/// Lower atomic or volatile 128-bit stores to a single STP instruction.
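/// The i128 value is split into two i64 halves (swapped on big-endian
/// targets) and stored with a single STP, or STILP when release ordering is
/// required.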
6135SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6136 SelectionDAG &DAG) const {
6137 MemSDNode *StoreNode = cast<MemSDNode>(Val&: Op);
6138 assert(StoreNode->getMemoryVT() == MVT::i128);
6139 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6140
6141 bool IsStoreRelease =
6142 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6143 if (StoreNode->isAtomic())
6144 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6145 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6146 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6147 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6148
6149 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6150 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6151 ? StoreNode->getOperand(Num: 1)
6152 : StoreNode->getOperand(Num: 2);
6153 SDLoc DL(Op);
6154 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6155 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6156 if (DAG.getDataLayout().isBigEndian())
6157 std::swap(StoreValue.first, StoreValue.second);
6158 SDValue Result = DAG.getMemIntrinsicNode(
6159 Opcode, DL, DAG.getVTList(MVT::Other),
6160 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6161 StoreNode->getBasePtr()},
6162 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6163 return Result;
6164}
6165
6166SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6167 SelectionDAG &DAG) const {
6168 SDLoc DL(Op);
6169 LoadSDNode *LoadNode = cast<LoadSDNode>(Val&: Op);
6170 assert(LoadNode && "Expected custom lowering of a load node");
6171
6172 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6173 SmallVector<SDValue, 8> Ops;
6174 SDValue Base = LoadNode->getBasePtr();
6175 SDValue Chain = LoadNode->getChain();
6176 EVT PtrVT = Base.getValueType();
6177 for (unsigned i = 0; i < 8; i++) {
6178 SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: Base,
6179 N2: DAG.getConstant(Val: i * 8, DL, VT: PtrVT));
6180 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6181 LoadNode->getPointerInfo(),
6182 LoadNode->getOriginalAlign());
6183 Ops.push_back(Elt: Part);
6184 Chain = SDValue(Part.getNode(), 1);
6185 }
6186 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6187 return DAG.getMergeValues(Ops: {Loaded, Chain}, dl: DL);
6188 }
6189
6190 // Custom lowering for extending v4i8 vector loads.
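  // The approach, in outline: load the four bytes as an f32 scalar, bitcast
  // the containing 64-bit vector to v8i8, extend it to v8i16 and keep the low
  // v4i16 half, extending once more to v4i32 when that is the result type.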
6191 EVT VT = Op->getValueType(ResNo: 0);
6192 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6193
6194 if (LoadNode->getMemoryVT() != MVT::v4i8)
6195 return SDValue();
6196
6197 unsigned ExtType;
6198 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6199 ExtType = ISD::SIGN_EXTEND;
6200 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6201 LoadNode->getExtensionType() == ISD::EXTLOAD)
6202 ExtType = ISD::ZERO_EXTEND;
6203 else
6204 return SDValue();
6205
6206 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6207 LoadNode->getBasePtr(), MachinePointerInfo());
6208 SDValue Chain = Load.getValue(R: 1);
6209 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6210 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6211 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6212 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6213 DAG.getConstant(0, DL, MVT::i64));
6214 if (VT == MVT::v4i32)
6215 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6216 return DAG.getMergeValues(Ops: {Ext, Chain}, dl: DL);
6217}
6218
6219// Generate SUBS and CSEL for integer abs.
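// Conceptually abs(x) == (x >= 0) ? x : (0 - x): SUBS sets the flags for the
// comparison against zero and CSEL selects on the PL condition.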
6220SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6221 MVT VT = Op.getSimpleValueType();
6222
6223 if (VT.isVector())
6224 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABS_MERGE_PASSTHRU);
6225
6226 SDLoc DL(Op);
6227 SDValue Neg = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
6228 N2: Op.getOperand(i: 0));
6229 // Generate SUBS & CSEL.
6230 SDValue Cmp =
6231 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6232 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6233 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6234 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6235 Cmp.getValue(1));
6236}
6237
6238static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6239 SDValue Chain = Op.getOperand(i: 0);
6240 SDValue Cond = Op.getOperand(i: 1);
6241 SDValue Dest = Op.getOperand(i: 2);
6242
6243 AArch64CC::CondCode CC;
6244 if (SDValue Cmp = emitConjunction(DAG, Val: Cond, OutCC&: CC)) {
6245 SDLoc dl(Op);
6246 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6247 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6248 Cmp);
6249 }
6250
6251 return SDValue();
6252}
6253
// Treat FSHR with constant shifts as a legal operation; otherwise it is
// expanded. FSHL is converted to FSHR before deciding what to do with it.
6256static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
6257 SDValue Shifts = Op.getOperand(i: 2);
  // Check if the shift amount is a constant; if the opcode is FSHL, convert
  // it to FSHR first.
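  // For example, on i64 "fshl(a, b, 3)" is rewritten as "fshr(a, b, 61)".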
6260 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Val&: Shifts)) {
6261 SDLoc DL(Op);
6262 MVT VT = Op.getSimpleValueType();
6263
6264 if (Op.getOpcode() == ISD::FSHL) {
6265 unsigned int NewShiftNo =
6266 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6267 return DAG.getNode(
6268 Opcode: ISD::FSHR, DL, VT, N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1),
6269 N3: DAG.getConstant(Val: NewShiftNo, DL, VT: Shifts.getValueType()));
6270 } else if (Op.getOpcode() == ISD::FSHR) {
6271 return Op;
6272 }
6273 }
6274
6275 return SDValue();
6276}
6277
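// Lower a scalar FLDEXP by moving the operands into lane 0 of SVE vectors,
// applying the aarch64.sve.fscale intrinsic under an all-true predicate and
// extracting lane 0 of the result; f16/bf16 inputs are extended to f32 first
// and rounded back at the end.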
6278static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
6279 SDValue X = Op.getOperand(i: 0);
6280 EVT XScalarTy = X.getValueType();
6281 SDValue Exp = Op.getOperand(i: 1);
6282
6283 SDLoc DL(Op);
6284 EVT XVT, ExpVT;
6285 switch (Op.getSimpleValueType().SimpleTy) {
6286 default:
6287 return SDValue();
6288 case MVT::bf16:
6289 case MVT::f16:
6290 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6291 [[fallthrough]];
6292 case MVT::f32:
6293 XVT = MVT::nxv4f32;
6294 ExpVT = MVT::nxv4i32;
6295 break;
6296 case MVT::f64:
6297 XVT = MVT::nxv2f64;
6298 ExpVT = MVT::nxv2i64;
6299 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6300 break;
6301 }
6302
6303 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6304 SDValue VX =
6305 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: XVT, N1: DAG.getUNDEF(VT: XVT), N2: X, N3: Zero);
6306 SDValue VExp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ExpVT,
6307 N1: DAG.getUNDEF(VT: ExpVT), N2: Exp, N3: Zero);
6308 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6309 AArch64SVEPredPattern::all);
6310 SDValue FScale =
6311 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
6312 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6313 VPg, VX, VExp);
6314 SDValue Final =
6315 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: X.getValueType(), N1: FScale, N2: Zero);
6316 if (X.getValueType() != XScalarTy)
6317 Final = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: XScalarTy, N1: Final,
6318 N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(Op)));
6319 return Final;
6320}
6321
6322SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
6323 SelectionDAG &DAG) const {
6324 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6325 LLVM_DEBUG(Op.dump());
6326
6327 switch (Op.getOpcode()) {
6328 default:
6329 llvm_unreachable("unimplemented operand");
6330 return SDValue();
6331 case ISD::BITCAST:
6332 return LowerBITCAST(Op, DAG);
6333 case ISD::GlobalAddress:
6334 return LowerGlobalAddress(Op, DAG);
6335 case ISD::GlobalTLSAddress:
6336 return LowerGlobalTLSAddress(Op, DAG);
6337 case ISD::SETCC:
6338 case ISD::STRICT_FSETCC:
6339 case ISD::STRICT_FSETCCS:
6340 return LowerSETCC(Op, DAG);
6341 case ISD::SETCCCARRY:
6342 return LowerSETCCCARRY(Op, DAG);
6343 case ISD::BRCOND:
6344 return LowerBRCOND(Op, DAG);
6345 case ISD::BR_CC:
6346 return LowerBR_CC(Op, DAG);
6347 case ISD::SELECT:
6348 return LowerSELECT(Op, DAG);
6349 case ISD::SELECT_CC:
6350 return LowerSELECT_CC(Op, DAG);
6351 case ISD::JumpTable:
6352 return LowerJumpTable(Op, DAG);
6353 case ISD::BR_JT:
6354 return LowerBR_JT(Op, DAG);
6355 case ISD::ConstantPool:
6356 return LowerConstantPool(Op, DAG);
6357 case ISD::BlockAddress:
6358 return LowerBlockAddress(Op, DAG);
6359 case ISD::VASTART:
6360 return LowerVASTART(Op, DAG);
6361 case ISD::VACOPY:
6362 return LowerVACOPY(Op, DAG);
6363 case ISD::VAARG:
6364 return LowerVAARG(Op, DAG);
6365 case ISD::UADDO_CARRY:
6366 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: false /*unsigned*/);
6367 case ISD::USUBO_CARRY:
6368 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: false /*unsigned*/);
6369 case ISD::SADDO_CARRY:
6370 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: true /*signed*/);
6371 case ISD::SSUBO_CARRY:
6372 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: true /*signed*/);
6373 case ISD::SADDO:
6374 case ISD::UADDO:
6375 case ISD::SSUBO:
6376 case ISD::USUBO:
6377 case ISD::SMULO:
6378 case ISD::UMULO:
6379 return LowerXALUO(Op, DAG);
6380 case ISD::FADD:
6381 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FADD_PRED);
6382 case ISD::FSUB:
6383 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSUB_PRED);
6384 case ISD::FMUL:
6385 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMUL_PRED);
6386 case ISD::FMA:
6387 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMA_PRED);
6388 case ISD::FDIV:
6389 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FDIV_PRED);
6390 case ISD::FNEG:
6391 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEG_MERGE_PASSTHRU);
6392 case ISD::FCEIL:
6393 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FCEIL_MERGE_PASSTHRU);
6394 case ISD::FFLOOR:
6395 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6396 case ISD::FNEARBYINT:
6397 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6398 case ISD::FRINT:
6399 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FRINT_MERGE_PASSTHRU);
6400 case ISD::FROUND:
6401 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUND_MERGE_PASSTHRU);
6402 case ISD::FROUNDEVEN:
6403 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6404 case ISD::FTRUNC:
6405 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6406 case ISD::FSQRT:
6407 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSQRT_MERGE_PASSTHRU);
6408 case ISD::FABS:
6409 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FABS_MERGE_PASSTHRU);
6410 case ISD::FP_ROUND:
6411 case ISD::STRICT_FP_ROUND:
6412 return LowerFP_ROUND(Op, DAG);
6413 case ISD::FP_EXTEND:
6414 return LowerFP_EXTEND(Op, DAG);
6415 case ISD::FRAMEADDR:
6416 return LowerFRAMEADDR(Op, DAG);
6417 case ISD::SPONENTRY:
6418 return LowerSPONENTRY(Op, DAG);
6419 case ISD::RETURNADDR:
6420 return LowerRETURNADDR(Op, DAG);
6421 case ISD::ADDROFRETURNADDR:
6422 return LowerADDROFRETURNADDR(Op, DAG);
6423 case ISD::CONCAT_VECTORS:
6424 return LowerCONCAT_VECTORS(Op, DAG);
6425 case ISD::INSERT_VECTOR_ELT:
6426 return LowerINSERT_VECTOR_ELT(Op, DAG);
6427 case ISD::EXTRACT_VECTOR_ELT:
6428 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6429 case ISD::BUILD_VECTOR:
6430 return LowerBUILD_VECTOR(Op, DAG);
6431 case ISD::ZERO_EXTEND_VECTOR_INREG:
6432 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6433 case ISD::VECTOR_SHUFFLE:
6434 return LowerVECTOR_SHUFFLE(Op, DAG);
6435 case ISD::SPLAT_VECTOR:
6436 return LowerSPLAT_VECTOR(Op, DAG);
6437 case ISD::EXTRACT_SUBVECTOR:
6438 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6439 case ISD::INSERT_SUBVECTOR:
6440 return LowerINSERT_SUBVECTOR(Op, DAG);
6441 case ISD::SDIV:
6442 case ISD::UDIV:
6443 return LowerDIV(Op, DAG);
6444 case ISD::SMIN:
6445 case ISD::UMIN:
6446 case ISD::SMAX:
6447 case ISD::UMAX:
6448 return LowerMinMax(Op, DAG);
6449 case ISD::SRA:
6450 case ISD::SRL:
6451 case ISD::SHL:
6452 return LowerVectorSRA_SRL_SHL(Op, DAG);
6453 case ISD::SHL_PARTS:
6454 case ISD::SRL_PARTS:
6455 case ISD::SRA_PARTS:
6456 return LowerShiftParts(Op, DAG);
6457 case ISD::CTPOP:
6458 case ISD::PARITY:
6459 return LowerCTPOP_PARITY(Op, DAG);
6460 case ISD::FCOPYSIGN:
6461 return LowerFCOPYSIGN(Op, DAG);
6462 case ISD::OR:
6463 return LowerVectorOR(Op, DAG);
6464 case ISD::XOR:
6465 return LowerXOR(Op, DAG);
6466 case ISD::PREFETCH:
6467 return LowerPREFETCH(Op, DAG);
6468 case ISD::SINT_TO_FP:
6469 case ISD::UINT_TO_FP:
6470 case ISD::STRICT_SINT_TO_FP:
6471 case ISD::STRICT_UINT_TO_FP:
6472 return LowerINT_TO_FP(Op, DAG);
6473 case ISD::FP_TO_SINT:
6474 case ISD::FP_TO_UINT:
6475 case ISD::STRICT_FP_TO_SINT:
6476 case ISD::STRICT_FP_TO_UINT:
6477 return LowerFP_TO_INT(Op, DAG);
6478 case ISD::FP_TO_SINT_SAT:
6479 case ISD::FP_TO_UINT_SAT:
6480 return LowerFP_TO_INT_SAT(Op, DAG);
6481 case ISD::FSINCOS:
6482 return LowerFSINCOS(Op, DAG);
6483 case ISD::GET_ROUNDING:
6484 return LowerGET_ROUNDING(Op, DAG);
6485 case ISD::SET_ROUNDING:
6486 return LowerSET_ROUNDING(Op, DAG);
6487 case ISD::MUL:
6488 return LowerMUL(Op, DAG);
6489 case ISD::MULHS:
6490 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHS_PRED);
6491 case ISD::MULHU:
6492 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHU_PRED);
6493 case ISD::INTRINSIC_W_CHAIN:
6494 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6495 case ISD::INTRINSIC_WO_CHAIN:
6496 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6497 case ISD::INTRINSIC_VOID:
6498 return LowerINTRINSIC_VOID(Op, DAG);
6499 case ISD::ATOMIC_STORE:
6500 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6501 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6502 return LowerStore128(Op, DAG);
6503 }
6504 return SDValue();
6505 case ISD::STORE:
6506 return LowerSTORE(Op, DAG);
6507 case ISD::MSTORE:
6508 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6509 case ISD::MGATHER:
6510 return LowerMGATHER(Op, DAG);
6511 case ISD::MSCATTER:
6512 return LowerMSCATTER(Op, DAG);
6513 case ISD::VECREDUCE_SEQ_FADD:
6514 return LowerVECREDUCE_SEQ_FADD(ScalarOp: Op, DAG);
6515 case ISD::VECREDUCE_ADD:
6516 case ISD::VECREDUCE_AND:
6517 case ISD::VECREDUCE_OR:
6518 case ISD::VECREDUCE_XOR:
6519 case ISD::VECREDUCE_SMAX:
6520 case ISD::VECREDUCE_SMIN:
6521 case ISD::VECREDUCE_UMAX:
6522 case ISD::VECREDUCE_UMIN:
6523 case ISD::VECREDUCE_FADD:
6524 case ISD::VECREDUCE_FMAX:
6525 case ISD::VECREDUCE_FMIN:
6526 case ISD::VECREDUCE_FMAXIMUM:
6527 case ISD::VECREDUCE_FMINIMUM:
6528 return LowerVECREDUCE(Op, DAG);
6529 case ISD::ATOMIC_LOAD_AND:
6530 return LowerATOMIC_LOAD_AND(Op, DAG);
6531 case ISD::DYNAMIC_STACKALLOC:
6532 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6533 case ISD::VSCALE:
6534 return LowerVSCALE(Op, DAG);
6535 case ISD::ANY_EXTEND:
6536 case ISD::SIGN_EXTEND:
6537 case ISD::ZERO_EXTEND:
6538 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6539 case ISD::SIGN_EXTEND_INREG: {
6540 // Only custom lower when ExtraVT has a legal byte based element type.
6541 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
6542 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6543 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6544 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6545 return SDValue();
6546
6547 return LowerToPredicatedOp(Op, DAG,
6548 NewOp: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
6549 }
6550 case ISD::TRUNCATE:
6551 return LowerTRUNCATE(Op, DAG);
6552 case ISD::MLOAD:
6553 return LowerMLOAD(Op, DAG);
6554 case ISD::LOAD:
6555 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
6556 OverrideNEON: !Subtarget->isNeonAvailable()))
6557 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6558 return LowerLOAD(Op, DAG);
6559 case ISD::ADD:
6560 case ISD::AND:
6561 case ISD::SUB:
6562 return LowerToScalableOp(Op, DAG);
6563 case ISD::FMAXIMUM:
6564 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAX_PRED);
6565 case ISD::FMAXNUM:
6566 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAXNM_PRED);
6567 case ISD::FMINIMUM:
6568 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMIN_PRED);
6569 case ISD::FMINNUM:
6570 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMINNM_PRED);
6571 case ISD::VSELECT:
6572 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6573 case ISD::ABS:
6574 return LowerABS(Op, DAG);
6575 case ISD::ABDS:
6576 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDS_PRED);
6577 case ISD::ABDU:
6578 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDU_PRED);
6579 case ISD::AVGFLOORS:
6580 return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDS_PRED);
6581 case ISD::AVGFLOORU:
6582 return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDU_PRED);
6583 case ISD::AVGCEILS:
6584 return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDS_PRED);
6585 case ISD::AVGCEILU:
6586 return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDU_PRED);
6587 case ISD::BITREVERSE:
6588 return LowerBitreverse(Op, DAG);
6589 case ISD::BSWAP:
6590 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BSWAP_MERGE_PASSTHRU);
6591 case ISD::CTLZ:
6592 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTLZ_MERGE_PASSTHRU);
6593 case ISD::CTTZ:
6594 return LowerCTTZ(Op, DAG);
6595 case ISD::VECTOR_SPLICE:
6596 return LowerVECTOR_SPLICE(Op, DAG);
6597 case ISD::VECTOR_DEINTERLEAVE:
6598 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6599 case ISD::VECTOR_INTERLEAVE:
6600 return LowerVECTOR_INTERLEAVE(Op, DAG);
6601 case ISD::LROUND:
6602 case ISD::LLROUND:
6603 case ISD::LRINT:
6604 case ISD::LLRINT: {
6605 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6606 Op.getOperand(0).getValueType() == MVT::bf16) &&
6607 "Expected custom lowering of rounding operations only for f16");
6608 SDLoc DL(Op);
6609 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6610 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: Op.getValueType(), Operand: Ext);
6611 }
6612 case ISD::STRICT_LROUND:
6613 case ISD::STRICT_LLROUND:
6614 case ISD::STRICT_LRINT:
6615 case ISD::STRICT_LLRINT: {
6616 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6617 Op.getOperand(1).getValueType() == MVT::bf16) &&
6618 "Expected custom lowering of rounding operations only for f16");
6619 SDLoc DL(Op);
6620 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6621 {Op.getOperand(0), Op.getOperand(1)});
6622 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6623 {Ext.getValue(1), Ext.getValue(0)});
6624 }
6625 case ISD::WRITE_REGISTER: {
6626 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6627 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6628 SDLoc DL(Op);
6629
6630 SDValue Chain = Op.getOperand(i: 0);
6631 SDValue SysRegName = Op.getOperand(i: 1);
6632 std::pair<SDValue, SDValue> Pair =
6633 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6634
6635 // chain = MSRR(chain, sysregname, lo, hi)
6636 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6637 SysRegName, Pair.first, Pair.second);
6638
6639 return Result;
6640 }
6641 case ISD::FSHL:
6642 case ISD::FSHR:
6643 return LowerFunnelShift(Op, DAG);
6644 case ISD::FLDEXP:
6645 return LowerFLDEXP(Op, DAG);
6646 }
6647}
6648
6649bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
6650 return !Subtarget->useSVEForFixedLengthVectors();
6651}
6652
6653bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
6654 EVT VT, bool OverrideNEON) const {
6655 if (!VT.isFixedLengthVector() || !VT.isSimple())
6656 return false;
6657
6658 // Don't use SVE for vectors we cannot scalarize if required.
6659 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6660 // Fixed length predicates should be promoted to i8.
6661 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
6662 case MVT::i1:
6663 default:
6664 return false;
6665 case MVT::i8:
6666 case MVT::i16:
6667 case MVT::i32:
6668 case MVT::i64:
6669 case MVT::f16:
6670 case MVT::f32:
6671 case MVT::f64:
6672 break;
6673 }
6674
6675 // NEON-sized vectors can be emulated using SVE instructions.
6676 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6677 return Subtarget->hasSVEorSME();
6678
6679 // Ensure NEON MVTs only belong to a single register class.
6680 if (VT.getFixedSizeInBits() <= 128)
6681 return false;
6682
6683 // Ensure wider than NEON code generation is enabled.
6684 if (!Subtarget->useSVEForFixedLengthVectors())
6685 return false;
6686
6687 // Don't use SVE for types that don't fit.
6688 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6689 return false;
6690
6691 // TODO: Perhaps an artificial restriction, but worth having whilst getting
6692 // the base fixed length SVE support in place.
6693 if (!VT.isPow2VectorType())
6694 return false;
6695
6696 return true;
6697}
6698
6699//===----------------------------------------------------------------------===//
6700// Calling Convention Implementation
6701//===----------------------------------------------------------------------===//
6702
6703static unsigned getIntrinsicID(const SDNode *N) {
6704 unsigned Opcode = N->getOpcode();
6705 switch (Opcode) {
6706 default:
6707 return Intrinsic::not_intrinsic;
6708 case ISD::INTRINSIC_WO_CHAIN: {
6709 unsigned IID = N->getConstantOperandVal(Num: 0);
6710 if (IID < Intrinsic::num_intrinsics)
6711 return IID;
6712 return Intrinsic::not_intrinsic;
6713 }
6714 }
6715}
6716
6717bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
6718 SDValue N1) const {
6719 if (!N0.hasOneUse())
6720 return false;
6721
6722 unsigned IID = getIntrinsicID(N: N1.getNode());
6723 // Avoid reassociating expressions that can be lowered to smlal/umlal.
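  // For example, "add(x, umull(a, b))" can be selected as a single UMLAL;
  // reassociating the addend chain would break that pairing.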
6724 if (IID == Intrinsic::aarch64_neon_umull ||
6725 N1.getOpcode() == AArch64ISD::UMULL ||
6726 IID == Intrinsic::aarch64_neon_smull ||
6727 N1.getOpcode() == AArch64ISD::SMULL)
6728 return N0.getOpcode() != ISD::ADD;
6729
6730 return true;
6731}
6732
6733/// Selects the correct CCAssignFn for a given CallingConvention value.
6734CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
6735 bool IsVarArg) const {
6736 switch (CC) {
6737 default:
6738 report_fatal_error(reason: "Unsupported calling convention.");
6739 case CallingConv::GHC:
6740 return CC_AArch64_GHC;
6741 case CallingConv::C:
6742 case CallingConv::Fast:
6743 case CallingConv::PreserveMost:
6744 case CallingConv::PreserveAll:
6745 case CallingConv::CXX_FAST_TLS:
6746 case CallingConv::Swift:
6747 case CallingConv::SwiftTail:
6748 case CallingConv::Tail:
6749 case CallingConv::GRAAL:
6750 if (Subtarget->isTargetWindows()) {
6751 if (IsVarArg) {
6752 if (Subtarget->isWindowsArm64EC())
6753 return CC_AArch64_Arm64EC_VarArg;
6754 return CC_AArch64_Win64_VarArg;
6755 }
6756 return CC_AArch64_Win64PCS;
6757 }
6758 if (!Subtarget->isTargetDarwin())
6759 return CC_AArch64_AAPCS;
6760 if (!IsVarArg)
6761 return CC_AArch64_DarwinPCS;
6762 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
6763 : CC_AArch64_DarwinPCS_VarArg;
6764 case CallingConv::Win64:
6765 if (IsVarArg) {
6766 if (Subtarget->isWindowsArm64EC())
6767 return CC_AArch64_Arm64EC_VarArg;
6768 return CC_AArch64_Win64_VarArg;
6769 }
6770 return CC_AArch64_Win64PCS;
6771 case CallingConv::CFGuard_Check:
6772 if (Subtarget->isWindowsArm64EC())
6773 return CC_AArch64_Arm64EC_CFGuard_Check;
6774 return CC_AArch64_Win64_CFGuard_Check;
6775 case CallingConv::AArch64_VectorCall:
6776 case CallingConv::AArch64_SVE_VectorCall:
6777 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
6778 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
6779 return CC_AArch64_AAPCS;
6780 case CallingConv::ARM64EC_Thunk_X64:
6781 return CC_AArch64_Arm64EC_Thunk;
6782 case CallingConv::ARM64EC_Thunk_Native:
6783 return CC_AArch64_Arm64EC_Thunk_Native;
6784 }
6785}
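// For example, a C function on Darwin selects CC_AArch64_DarwinPCS, while a
// variadic one selects CC_AArch64_DarwinPCS_VarArg (or its ILP32 variant),
// because Darwin's ABI passes anonymous (variadic) arguments on the stack
// rather than following the AAPCS register rules for them.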
6786
6787CCAssignFn *
6788AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
6789 switch (CC) {
6790 default:
6791 return RetCC_AArch64_AAPCS;
6792 case CallingConv::ARM64EC_Thunk_X64:
6793 return RetCC_AArch64_Arm64EC_Thunk;
6794 case CallingConv::CFGuard_Check:
6795 if (Subtarget->isWindowsArm64EC())
6796 return RetCC_AArch64_Arm64EC_CFGuard_Check;
6797 return RetCC_AArch64_AAPCS;
6798 }
6799}
6800
6802unsigned
6803AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6804 SelectionDAG &DAG) const {
6805 MachineFunction &MF = DAG.getMachineFunction();
6806 MachineFrameInfo &MFI = MF.getFrameInfo();
6807
6808 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
6809 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6810 DAG.getConstant(1, DL, MVT::i32));
6811 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6812 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6813 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6814 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6815 Chain = Buffer.getValue(R: 1);
6816 MFI.CreateVariableSizedObject(Alignment: Align(1), Alloca: nullptr);
6817
6818 // Allocate an additional TPIDR2 object on the stack (16 bytes)
6819 unsigned TPIDR2Obj = MFI.CreateStackObject(Size: 16, Alignment: Align(16), isSpillSlot: false);
6820
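  // Layout of the 16-byte TPIDR2 block built below (per the SME lazy-save
  // scheme):
  //   bytes 0-7    pointer to the lazy-save buffer
  //   bytes 8-9    number of ZA save slices (written at call sites)
  //   bytes 10-15  reserved, must be zero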
6821 // Store the buffer pointer to the TPIDR2 stack object.
6822 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, Offset: TPIDR2Obj);
6823 SDValue Ptr = DAG.getFrameIndex(
6824 FI: TPIDR2Obj,
6825 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
6826 Chain = DAG.getStore(Chain, dl: DL, Val: Buffer, Ptr, PtrInfo: MPI);
6827
6828 // Set the reserved bytes (10-15) to zero
6829 EVT PtrTy = Ptr.getValueType();
6830 SDValue ReservedPtr =
6831 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrTy, N1: Ptr, N2: DAG.getConstant(Val: 10, DL, VT: PtrTy));
6832 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
6833 MPI);
6834 ReservedPtr =
6835 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrTy, N1: Ptr, N2: DAG.getConstant(Val: 12, DL, VT: PtrTy));
6836 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
6837 MPI);
6838
6839 return TPIDR2Obj;
6840}
6841
6842static bool isPassedInFPR(EVT VT) {
6843 return VT.isFixedLengthVector() ||
6844 (VT.isFloatingPoint() && !VT.isScalableVector());
6845}
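// For example, f64, v4f32 and v16i8 report true here (they travel in
// FPR/NEON registers), whereas i64 and scalable types such as nxv4f32
// report false.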
6846
6847SDValue AArch64TargetLowering::LowerFormalArguments(
6848 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6849 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6850 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6851 MachineFunction &MF = DAG.getMachineFunction();
6852 const Function &F = MF.getFunction();
6853 MachineFrameInfo &MFI = MF.getFrameInfo();
6854 bool IsWin64 = Subtarget->isCallingConvWin64(CC: F.getCallingConv());
6855 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
6856 (isVarArg && Subtarget->isWindowsArm64EC());
6857 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6858
6859 SmallVector<ISD::OutputArg, 4> Outs;
6860 GetReturnInfo(CC: CallConv, ReturnType: F.getReturnType(), attr: F.getAttributes(), Outs,
6861 TLI: DAG.getTargetLoweringInfo(), DL: MF.getDataLayout());
6862 if (any_of(Range&: Outs, P: [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
6863 FuncInfo->setIsSVECC(true);
6864
6865 // Assign locations to all of the incoming arguments.
6866 SmallVector<CCValAssign, 16> ArgLocs;
6867 DenseMap<unsigned, SDValue> CopiedRegs;
6868 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6869
6870 // At this point, Ins[].VT may already be promoted to i32. To correctly
6871 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6872 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6873 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6874 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6875 // LocVT.
6876 unsigned NumArgs = Ins.size();
6877 Function::const_arg_iterator CurOrigArg = F.arg_begin();
6878 unsigned CurArgIdx = 0;
6879 for (unsigned i = 0; i != NumArgs; ++i) {
6880 MVT ValVT = Ins[i].VT;
6881 if (Ins[i].isOrigArg()) {
6882 std::advance(i&: CurOrigArg, n: Ins[i].getOrigArgIndex() - CurArgIdx);
6883 CurArgIdx = Ins[i].getOrigArgIndex();
6884
6885 // Get type of the original argument.
6886 EVT ActualVT = getValueType(DL: DAG.getDataLayout(), Ty: CurOrigArg->getType(),
6887 /*AllowUnknown*/ true);
6888 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6889 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6890 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6891 ValVT = MVT::i8;
6892 else if (ActualMVT == MVT::i16)
6893 ValVT = MVT::i16;
6894 }
6895 bool UseVarArgCC = false;
6896 if (IsWin64)
6897 UseVarArgCC = isVarArg;
6898 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: UseVarArgCC);
6899 bool Res =
6900 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6901 assert(!Res && "Call operand has unhandled type");
6902 (void)Res;
6903 }
6904
6905 SMEAttrs Attrs(MF.getFunction());
6906 bool IsLocallyStreaming =
6907 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6908 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6909 SDValue Glue = Chain.getValue(R: 1);
6910
6911 SmallVector<SDValue, 16> ArgValues;
6912 unsigned ExtraArgLocs = 0;
6913 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
6914 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6915
6916 if (Ins[i].Flags.isByVal()) {
6917 // Byval is used for HFAs in the PCS, but the system should work in a
6918 // non-compliant manner for larger structs.
6919 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
6920 int Size = Ins[i].Flags.getByValSize();
6921 unsigned NumRegs = (Size + 7) / 8;
6922
6923       // FIXME: This works on big-endian for composite byvals, which are the
6924       // common case. It should also work for fundamental types too.
6925 unsigned FrameIdx =
6926 MFI.CreateFixedObject(Size: 8 * NumRegs, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
6927 SDValue FrameIdxN = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
6928 InVals.push_back(Elt: FrameIdxN);
6929
6930 continue;
6931 }
6932
6933 if (Ins[i].Flags.isSwiftAsync())
6934 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6935
6936 SDValue ArgValue;
6937 if (VA.isRegLoc()) {
6938 // Arguments stored in registers.
6939 EVT RegVT = VA.getLocVT();
6940 const TargetRegisterClass *RC;
6941
6942 if (RegVT == MVT::i32)
6943 RC = &AArch64::GPR32RegClass;
6944 else if (RegVT == MVT::i64)
6945 RC = &AArch64::GPR64RegClass;
6946 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
6947 RC = &AArch64::FPR16RegClass;
6948 else if (RegVT == MVT::f32)
6949 RC = &AArch64::FPR32RegClass;
6950 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
6951 RC = &AArch64::FPR64RegClass;
6952 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
6953 RC = &AArch64::FPR128RegClass;
6954 else if (RegVT.isScalableVector() &&
6955 RegVT.getVectorElementType() == MVT::i1) {
6956 FuncInfo->setIsSVECC(true);
6957 RC = &AArch64::PPRRegClass;
6958 } else if (RegVT == MVT::aarch64svcount) {
6959 FuncInfo->setIsSVECC(true);
6960 RC = &AArch64::PPRRegClass;
6961 } else if (RegVT.isScalableVector()) {
6962 FuncInfo->setIsSVECC(true);
6963 RC = &AArch64::ZPRRegClass;
6964 } else
6965 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
6966
6967 // Transform the arguments in physical registers into virtual ones.
6968 Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
6969
6970 if (IsLocallyStreaming) {
6971 // LocallyStreamingFunctions must insert the SMSTART in the correct
6972 // position, so we use Glue to ensure no instructions can be scheduled
6973 // between the chain of:
6974 // t0: ch,glue = EntryNode
6975 // t1: res,ch,glue = CopyFromReg
6976 // ...
6977 // tn: res,ch,glue = CopyFromReg t(n-1), ..
6978 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
6979 // ^^^^^^
6980 // This will be the new Chain/Root node.
6981 ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT, Glue);
6982 Glue = ArgValue.getValue(R: 2);
6983 if (isPassedInFPR(VT: ArgValue.getValueType())) {
6984 ArgValue =
6985 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
6986 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
6987 {ArgValue, Glue});
6988 Glue = ArgValue.getValue(R: 1);
6989 }
6990 } else
6991 ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT);
6992
6993 // If this is an 8, 16 or 32-bit value, it is really passed promoted
6994 // to 64 bits. Insert an assert[sz]ext to capture this, then
6995 // truncate to the right size.
6996 switch (VA.getLocInfo()) {
6997 default:
6998 llvm_unreachable("Unknown loc info!");
6999 case CCValAssign::Full:
7000 break;
7001 case CCValAssign::Indirect:
7002 assert(
7003 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7004 "Indirect arguments should be scalable on most subtargets");
7005 break;
7006 case CCValAssign::BCvt:
7007 ArgValue = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: ArgValue);
7008 break;
7009 case CCValAssign::AExt:
7010 case CCValAssign::SExt:
7011 case CCValAssign::ZExt:
7012 break;
7013 case CCValAssign::AExtUpper:
7014 ArgValue = DAG.getNode(Opcode: ISD::SRL, DL, VT: RegVT, N1: ArgValue,
7015 N2: DAG.getConstant(Val: 32, DL, VT: RegVT));
7016 ArgValue = DAG.getZExtOrTrunc(Op: ArgValue, DL, VT: VA.getValVT());
7017 break;
7018 }
7019 } else { // VA.isRegLoc()
7020 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7021 unsigned ArgOffset = VA.getLocMemOffset();
7022 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7023 ? VA.getLocVT().getSizeInBits()
7024 : VA.getValVT().getSizeInBits()) / 8;
7025
7026 uint32_t BEAlign = 0;
7027 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7028 !Ins[i].Flags.isInConsecutiveRegs())
7029 BEAlign = 8 - ArgSize;
7030
7031 SDValue FIN;
7032 MachinePointerInfo PtrInfo;
7033 if (StackViaX4) {
7034 // In both the ARM64EC varargs convention and the thunk convention,
7035 // arguments on the stack are accessed relative to x4, not sp. In
7036 // the thunk convention, there's an additional offset of 32 bytes
7037 // to account for the shadow store.
7038 unsigned ObjOffset = ArgOffset + BEAlign;
7039 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7040 ObjOffset += 32;
7041 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7042 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7043 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7044 DAG.getConstant(ObjOffset, DL, MVT::i64));
7045 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
7046 } else {
7047 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset + BEAlign, IsImmutable: true);
7048
7049 // Create load nodes to retrieve arguments from the stack.
7050 FIN = DAG.getFrameIndex(FI, VT: getPointerTy(DL: DAG.getDataLayout()));
7051 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7052 }
7053
7054       // For NON_EXTLOAD, the generic code in getLoad asserts ValVT == MemVT.
7055 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
7056 MVT MemVT = VA.getValVT();
7057
7058 switch (VA.getLocInfo()) {
7059 default:
7060 break;
7061 case CCValAssign::Trunc:
7062 case CCValAssign::BCvt:
7063 MemVT = VA.getLocVT();
7064 break;
7065 case CCValAssign::Indirect:
7066 assert((VA.getValVT().isScalableVector() ||
7067 Subtarget->isWindowsArm64EC()) &&
7068 "Indirect arguments should be scalable on most subtargets");
7069 MemVT = VA.getLocVT();
7070 break;
7071 case CCValAssign::SExt:
7072 ExtType = ISD::SEXTLOAD;
7073 break;
7074 case CCValAssign::ZExt:
7075 ExtType = ISD::ZEXTLOAD;
7076 break;
7077 case CCValAssign::AExt:
7078 ExtType = ISD::EXTLOAD;
7079 break;
7080 }
7081
7082 ArgValue = DAG.getExtLoad(ExtType, dl: DL, VT: VA.getLocVT(), Chain, Ptr: FIN, PtrInfo,
7083 MemVT);
7084 }
7085
7086 if (VA.getLocInfo() == CCValAssign::Indirect) {
7087 assert((VA.getValVT().isScalableVT() ||
7088 Subtarget->isWindowsArm64EC()) &&
7089 "Indirect arguments should be scalable on most subtargets");
7090
7091 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7092 unsigned NumParts = 1;
7093 if (Ins[i].Flags.isInConsecutiveRegs()) {
7094 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
7095 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7096 ++NumParts;
7097 }
7098
7099 MVT PartLoad = VA.getValVT();
7100 SDValue Ptr = ArgValue;
7101
7102 // Ensure we generate all loads for each tuple part, whilst updating the
7103 // pointer after each load correctly using vscale.
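      // For example, an SVE tuple such as svint32x2_t arrives as two indirect
      // parts: each load reads PartSize == 16 known-min bytes and the pointer
      // is advanced by vscale * 16 bytes between the parts.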
7104 while (NumParts > 0) {
7105 ArgValue = DAG.getLoad(VT: PartLoad, dl: DL, Chain, Ptr, PtrInfo: MachinePointerInfo());
7106 InVals.push_back(Elt: ArgValue);
7107 NumParts--;
7108 if (NumParts > 0) {
7109 SDValue BytesIncrement;
7110 if (PartLoad.isScalableVector()) {
7111 BytesIncrement = DAG.getVScale(
7112 DL, VT: Ptr.getValueType(),
7113 MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7114 } else {
7115 BytesIncrement = DAG.getConstant(
7116 Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7117 VT: Ptr.getValueType());
7118 }
7119 SDNodeFlags Flags;
7120 Flags.setNoUnsignedWrap(true);
7121 Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr,
7122 N2: BytesIncrement, Flags);
7123 ExtraArgLocs++;
7124 i++;
7125 }
7126 }
7127 } else {
7128 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7129 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7130 ArgValue, DAG.getValueType(MVT::i32));
7131
7132 // i1 arguments are zero-extended to i8 by the caller. Emit a
7133 // hint to reflect this.
7134 if (Ins[i].isOrigArg()) {
7135 Argument *OrigArg = F.getArg(i: Ins[i].getOrigArgIndex());
7136 if (OrigArg->getType()->isIntegerTy(Bitwidth: 1)) {
7137 if (!Ins[i].Flags.isZExt()) {
7138 ArgValue = DAG.getNode(Opcode: AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7139 VT: ArgValue.getValueType(), Operand: ArgValue);
7140 }
7141 }
7142 }
7143
7144 InVals.push_back(Elt: ArgValue);
7145 }
7146 }
7147 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7148
7149 // Insert the SMSTART if this is a locally streaming function and
7150 // make sure it is Glued to the last CopyFromReg value.
7151 if (IsLocallyStreaming) {
7152 SDValue PStateSM;
7153 if (Attrs.hasStreamingCompatibleInterface()) {
7154 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7155 Register Reg = MF.getRegInfo().createVirtualRegister(
7156 RegClass: getRegClassFor(VT: PStateSM.getValueType().getSimpleVT()));
7157 FuncInfo->setPStateSMReg(Reg);
7158 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: PStateSM);
7159 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue,
7160 Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM);
7161 } else
7162 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue,
7163 Condition: AArch64SME::Always);
7164
7165     // Re-copy each argument value through the post-SMSTART Chain so that the
7166     // SMSTART is ordered before any use of the incoming argument values.
7167     for (unsigned I = 0; I < InVals.size(); ++I) {
7168 Register Reg = MF.getRegInfo().createVirtualRegister(
7169 RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT()));
7170 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: InVals[I]);
7171 InVals[I] = DAG.getCopyFromReg(Chain, dl: DL, Reg,
7172 VT: InVals[I].getValueType());
7173 }
7174 }
7175
7176 // varargs
7177 if (isVarArg) {
7178 if (!Subtarget->isTargetDarwin() || IsWin64) {
7179 // The AAPCS variadic function ABI is identical to the non-variadic
7180 // one. As a result there may be more arguments in registers and we should
7181 // save them for future reference.
7182 // Win64 variadic functions also pass arguments in registers, but all float
7183 // arguments are passed in integer registers.
7184 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7185 }
7186
7187 // This will point to the next argument passed via stack.
7188 unsigned VarArgsOffset = CCInfo.getStackSize();
7189 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7190 VarArgsOffset = alignTo(Value: VarArgsOffset, Align: Subtarget->isTargetILP32() ? 4 : 8);
7191 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7192 FuncInfo->setVarArgsStackIndex(
7193 MFI.CreateFixedObject(Size: 4, SPOffset: VarArgsOffset, IsImmutable: true));
7194
7195 if (MFI.hasMustTailInVarArgFunc()) {
7196 SmallVector<MVT, 2> RegParmTypes;
7197 RegParmTypes.push_back(MVT::i64);
7198 RegParmTypes.push_back(MVT::f128);
7199 // Compute the set of forwarded registers. The rest are scratch.
7200 SmallVectorImpl<ForwardedRegister> &Forwards =
7201 FuncInfo->getForwardedMustTailRegParms();
7202 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7203 Fn: CC_AArch64_AAPCS);
7204
7205 // Conservatively forward X8, since it might be used for aggregate return.
7206 if (!CCInfo.isAllocated(AArch64::X8)) {
7207 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7208 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7209 }
7210 }
7211 }
7212
7213 // On Windows, InReg pointers must be returned, so record the pointer in a
7214 // virtual register at the start of the function so it can be returned in the
7215 // epilogue.
7216 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7217 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7218 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7219 Ins[I].Flags.isInReg()) &&
7220 Ins[I].Flags.isSRet()) {
7221 assert(!FuncInfo->getSRetReturnReg());
7222
7223 MVT PtrTy = getPointerTy(DL: DAG.getDataLayout());
7224 Register Reg =
7225 MF.getRegInfo().createVirtualRegister(RegClass: getRegClassFor(VT: PtrTy));
7226 FuncInfo->setSRetReturnReg(Reg);
7227
7228 SDValue Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg, N: InVals[I]);
7229 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7230 break;
7231 }
7232 }
7233 }
7234
7235 unsigned StackArgSize = CCInfo.getStackSize();
7236 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7237 if (DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt)) {
7238 // This is a non-standard ABI so by fiat I say we're allowed to make full
7239 // use of the stack area to be popped, which must be aligned to 16 bytes in
7240 // any case:
7241 StackArgSize = alignTo(Value: StackArgSize, Align: 16);
7242
7243 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7244 // a multiple of 16.
7245 FuncInfo->setArgumentStackToRestore(StackArgSize);
7246
7247 // This realignment carries over to the available bytes below. Our own
7248 // callers will guarantee the space is free by giving an aligned value to
7249 // CALLSEQ_START.
7250 }
7251 // Even if we're not expected to free up the space, it's useful to know how
7252 // much is there while considering tail calls (because we can reuse it).
7253 FuncInfo->setBytesInStackArgArea(StackArgSize);
7254
7255 if (Subtarget->hasCustomCallingConv())
7256 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
7257
7258 // Conservatively assume the function requires the lazy-save mechanism.
7259 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7260 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7261 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7262 }
7263
7264 return Chain;
7265}
7266
7267void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7268 SelectionDAG &DAG,
7269 const SDLoc &DL,
7270 SDValue &Chain) const {
7271 MachineFunction &MF = DAG.getMachineFunction();
7272 MachineFrameInfo &MFI = MF.getFrameInfo();
7273 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7274 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
7275 bool IsWin64 = Subtarget->isCallingConvWin64(CC: MF.getFunction().getCallingConv());
7276
7277 SmallVector<SDValue, 8> MemOps;
7278
7279 auto GPRArgRegs = AArch64::getGPRArgRegs();
7280 unsigned NumGPRArgRegs = GPRArgRegs.size();
7281 if (Subtarget->isWindowsArm64EC()) {
7282 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7283 // functions.
7284 NumGPRArgRegs = 4;
7285 }
7286 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(Regs: GPRArgRegs);
7287
7288 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7289 int GPRIdx = 0;
7290 if (GPRSaveSize != 0) {
7291 if (IsWin64) {
7292 GPRIdx = MFI.CreateFixedObject(Size: GPRSaveSize, SPOffset: -(int)GPRSaveSize, IsImmutable: false);
7293 if (GPRSaveSize & 15)
7294 // The extra size here, if triggered, will always be 8.
7295 MFI.CreateFixedObject(Size: 16 - (GPRSaveSize & 15), SPOffset: -(int)alignTo(Value: GPRSaveSize, Align: 16), IsImmutable: false);
7296 } else
7297 GPRIdx = MFI.CreateStackObject(Size: GPRSaveSize, Alignment: Align(8), isSpillSlot: false);
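    // For example, in an AAPCS varargs function with three named GPR
    // arguments, FirstVariadicGPR is 3 and x3-x7 are saved, so GPRSaveSize is
    // 40 bytes; on Win64 the fixed object is padded with a further 8 bytes so
    // the save area stays 16-byte aligned.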
7298
7299 SDValue FIN;
7300 if (Subtarget->isWindowsArm64EC()) {
7301 // With the Arm64EC ABI, we reserve the save area as usual, but we
7302 // compute its address relative to x4. For a normal AArch64->AArch64
7303 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7304 // different address.
7305 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7306 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7307 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7308 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7309 } else {
7310 FIN = DAG.getFrameIndex(FI: GPRIdx, VT: PtrVT);
7311 }
7312
7313 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7314 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7315 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7316 SDValue Store =
7317 DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
7318 PtrInfo: IsWin64 ? MachinePointerInfo::getFixedStack(
7319 MF, FI: GPRIdx, Offset: (i - FirstVariadicGPR) * 8)
7320 : MachinePointerInfo::getStack(MF, Offset: i * 8));
7321 MemOps.push_back(Elt: Store);
7322 FIN =
7323 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN, N2: DAG.getConstant(Val: 8, DL, VT: PtrVT));
7324 }
7325 }
7326 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7327 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7328
7329 if (Subtarget->hasFPARMv8() && !IsWin64) {
7330 auto FPRArgRegs = AArch64::getFPRArgRegs();
7331 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7332 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(Regs: FPRArgRegs);
7333
7334 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7335 int FPRIdx = 0;
7336 if (FPRSaveSize != 0) {
7337 FPRIdx = MFI.CreateStackObject(Size: FPRSaveSize, Alignment: Align(16), isSpillSlot: false);
7338
7339 SDValue FIN = DAG.getFrameIndex(FI: FPRIdx, VT: PtrVT);
7340
7341 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7342 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7343 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7344
7345 SDValue Store = DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
7346 PtrInfo: MachinePointerInfo::getStack(MF, Offset: i * 16));
7347 MemOps.push_back(Elt: Store);
7348 FIN = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN,
7349 N2: DAG.getConstant(Val: 16, DL, VT: PtrVT));
7350 }
7351 }
7352 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7353 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7354 }
7355
7356 if (!MemOps.empty()) {
7357 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7358 }
7359}
7360
7361/// LowerCallResult - Lower the result values of a call into the
7362/// appropriate copies out of appropriate physical registers.
7363SDValue AArch64TargetLowering::LowerCallResult(
7364 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7365 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7366 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7367 SDValue ThisVal, bool RequiresSMChange) const {
7368 DenseMap<unsigned, SDValue> CopiedRegs;
7369 // Copy all of the result registers out of their specified physreg.
7370 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7371 CCValAssign VA = RVLocs[i];
7372
7373 // Pass 'this' value directly from the argument to return value, to avoid
7374 // reg unit interference
7375 if (i == 0 && isThisReturn) {
7376 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7377 "unexpected return calling convention register assignment");
7378 InVals.push_back(Elt: ThisVal);
7379 continue;
7380 }
7381
7382 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7383 // allows one use of a physreg per block.
7384 SDValue Val = CopiedRegs.lookup(Val: VA.getLocReg());
7385 if (!Val) {
7386 Val =
7387 DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
7388 Chain = Val.getValue(R: 1);
7389 InGlue = Val.getValue(R: 2);
7390 CopiedRegs[VA.getLocReg()] = Val;
7391 }
7392
7393 switch (VA.getLocInfo()) {
7394 default:
7395 llvm_unreachable("Unknown loc info!");
7396 case CCValAssign::Full:
7397 break;
7398 case CCValAssign::BCvt:
7399 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
7400 break;
7401 case CCValAssign::AExtUpper:
7402 Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: VA.getLocVT(), N1: Val,
7403 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
7404 [[fallthrough]];
7405 case CCValAssign::AExt:
7406 [[fallthrough]];
7407 case CCValAssign::ZExt:
7408 Val = DAG.getZExtOrTrunc(Op: Val, DL, VT: VA.getValVT());
7409 break;
7410 }
7411
7412 if (RequiresSMChange && isPassedInFPR(VT: VA.getValVT()))
7413 Val = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL, VT: Val.getValueType(),
7414 Operand: Val);
7415
7416 InVals.push_back(Elt: Val);
7417 }
7418
7419 return Chain;
7420}
7421
7422/// Return true if the calling convention is one that we can guarantee TCO for.
7423static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7424 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7425 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
7426}
7427
7428/// Return true if we might ever do TCO for calls with this calling convention.
7429static bool mayTailCallThisCC(CallingConv::ID CC) {
7430 switch (CC) {
7431 case CallingConv::C:
7432 case CallingConv::AArch64_SVE_VectorCall:
7433 case CallingConv::PreserveMost:
7434 case CallingConv::PreserveAll:
7435 case CallingConv::Swift:
7436 case CallingConv::SwiftTail:
7437 case CallingConv::Tail:
7438 case CallingConv::Fast:
7439 return true;
7440 default:
7441 return false;
7442 }
7443}
7444
7445static void analyzeCallOperands(const AArch64TargetLowering &TLI,
7446 const AArch64Subtarget *Subtarget,
7447 const TargetLowering::CallLoweringInfo &CLI,
7448 CCState &CCInfo) {
7449 const SelectionDAG &DAG = CLI.DAG;
7450 CallingConv::ID CalleeCC = CLI.CallConv;
7451 bool IsVarArg = CLI.IsVarArg;
7452 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7453 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CC: CalleeCC);
7454
7455 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7456 // for the shadow store.
7457 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7458 CCInfo.AllocateStack(Size: 32, Alignment: Align(16));
7459
7460 unsigned NumArgs = Outs.size();
7461 for (unsigned i = 0; i != NumArgs; ++i) {
7462 MVT ArgVT = Outs[i].VT;
7463 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7464
7465 bool UseVarArgCC = false;
7466 if (IsVarArg) {
7467 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7468 // too, so use the vararg CC to force them to integer registers.
7469 if (IsCalleeWin64) {
7470 UseVarArgCC = true;
7471 } else {
7472 UseVarArgCC = !Outs[i].IsFixed;
7473 }
7474 }
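    // For example, when calling printf on Win64 a double argument is passed
    // in x1 rather than d0, because the vararg convention forces
    // floating-point values into integer registers.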
7475
7476 if (!UseVarArgCC) {
7477 // Get type of the original argument.
7478 EVT ActualVT =
7479 TLI.getValueType(DL: DAG.getDataLayout(), Ty: CLI.Args[Outs[i].OrigArgIndex].Ty,
7480 /*AllowUnknown*/ true);
7481 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7482 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7483 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7484 ArgVT = MVT::i8;
7485 else if (ActualMVT == MVT::i16)
7486 ArgVT = MVT::i16;
7487 }
7488
7489 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC: CalleeCC, IsVarArg: UseVarArgCC);
7490 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7491 assert(!Res && "Call operand has unhandled type");
7492 (void)Res;
7493 }
7494}
7495
7496bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7497 const CallLoweringInfo &CLI) const {
7498 CallingConv::ID CalleeCC = CLI.CallConv;
7499 if (!mayTailCallThisCC(CC: CalleeCC))
7500 return false;
7501
7502 SDValue Callee = CLI.Callee;
7503 bool IsVarArg = CLI.IsVarArg;
7504 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7505 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7506 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7507 const SelectionDAG &DAG = CLI.DAG;
7508 MachineFunction &MF = DAG.getMachineFunction();
7509 const Function &CallerF = MF.getFunction();
7510 CallingConv::ID CallerCC = CallerF.getCallingConv();
7511
7512 // SME Streaming functions are not eligible for TCO as they may require
7513 // the streaming mode or ZA to be restored after returning from the call.
7514 SMEAttrs CallerAttrs(MF.getFunction());
7515 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7516 if (CallerAttrs.requiresSMChange(Callee: CalleeAttrs) ||
7517 CallerAttrs.requiresLazySave(Callee: CalleeAttrs) ||
7518 CallerAttrs.hasStreamingBody())
7519 return false;
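  // For example, a streaming-compatible caller invoking a non-streaming
  // callee may need to smstop before the call and smstart after it returns
  // (and possibly restore ZA), which is incompatible with a tail call.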
7520
7521 // Functions using the C or Fast calling convention that have an SVE signature
7522 // preserve more registers and should assume the SVE_VectorCall CC.
7523 // The check for matching callee-saved regs will determine whether it is
7524 // eligible for TCO.
7525 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7526 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7527 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7528
7529 bool CCMatch = CallerCC == CalleeCC;
7530
7531 // When using the Windows calling convention on a non-windows OS, we want
7532 // to back up and restore X18 in such functions; we can't do a tail call
7533 // from those functions.
7534 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7535 CalleeCC != CallingConv::Win64)
7536 return false;
7537
7538 // Byval parameters hand the function a pointer directly into the stack area
7539 // we want to reuse during a tail call. Working around this *is* possible (see
7540 // X86) but less efficient and uglier in LowerCall.
7541 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7542 e = CallerF.arg_end();
7543 i != e; ++i) {
7544 if (i->hasByValAttr())
7545 return false;
7546
7547 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7548 // In this case, it is necessary to save/restore X0 in the callee. Tail
7549 // call opt interferes with this. So we disable tail call opt when the
7550 // caller has an argument with "inreg" attribute.
7551
7552 // FIXME: Check whether the callee also has an "inreg" argument.
7553 if (i->hasInRegAttr())
7554 return false;
7555 }
7556
7557 if (canGuaranteeTCO(CC: CalleeCC, GuaranteeTailCalls: getTargetMachine().Options.GuaranteedTailCallOpt))
7558 return CCMatch;
7559
7560 // Externally-defined functions with weak linkage should not be
7561 // tail-called on AArch64 when the OS does not support dynamic
7562 // pre-emption of symbols, as the AAELF spec requires normal calls
7563 // to undefined weak functions to be replaced with a NOP or jump to the
7564 // next instruction. The behaviour of branch instructions in this
7565 // situation (as used for tail calls) is implementation-defined, so we
7566 // cannot rely on the linker replacing the tail call with a return.
7567 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
7568 const GlobalValue *GV = G->getGlobal();
7569 const Triple &TT = getTargetMachine().getTargetTriple();
7570 if (GV->hasExternalWeakLinkage() &&
7571 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7572 return false;
7573 }
7574
7575 // Now we search for cases where we can use a tail call without changing the
7576 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7577 // concept.
7578
7579 // I want anyone implementing a new calling convention to think long and hard
7580 // about this assert.
7581 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7582 "Unexpected variadic calling convention");
7583
7584 LLVMContext &C = *DAG.getContext();
7585 // Check that the call results are passed in the same way.
7586 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7587 CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
7588 CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
7589 return false;
7590 // The callee has to preserve all registers the caller needs to preserve.
7591 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7592 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7593 if (!CCMatch) {
7594 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7595 if (Subtarget->hasCustomCallingConv()) {
7596 TRI->UpdateCustomCallPreservedMask(MF, Mask: &CallerPreserved);
7597 TRI->UpdateCustomCallPreservedMask(MF, Mask: &CalleePreserved);
7598 }
7599 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7600 return false;
7601 }
7602
7603 // Nothing more to check if the callee is taking no arguments
7604 if (Outs.empty())
7605 return true;
7606
7607 SmallVector<CCValAssign, 16> ArgLocs;
7608 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7609
7610 analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo);
7611
7612 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7613     // Musttail calls are validated elsewhere and skip this check.
7614 // At least two cases here: if caller is fastcc then we can't have any
7615 // memory arguments (we'd be expected to clean up the stack afterwards). If
7616 // caller is C then we could potentially use its argument area.
7617
7618 // FIXME: for now we take the most conservative of these in both cases:
7619 // disallow all variadic memory operands.
7620 for (const CCValAssign &ArgLoc : ArgLocs)
7621 if (!ArgLoc.isRegLoc())
7622 return false;
7623 }
7624
7625 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7626
7627   // If any of the arguments is passed indirectly, it must be SVE, so the
7628   // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7629   // allocate space on the stack. That is why we explicitly decide here that
7630   // such a call cannot be a tail call.
7631 if (llvm::any_of(Range&: ArgLocs, P: [&](CCValAssign &A) {
7632 assert((A.getLocInfo() != CCValAssign::Indirect ||
7633 A.getValVT().isScalableVector() ||
7634 Subtarget->isWindowsArm64EC()) &&
7635 "Expected value to be scalable");
7636 return A.getLocInfo() == CCValAssign::Indirect;
7637 }))
7638 return false;
7639
7640 // If the stack arguments for this call do not fit into our own save area then
7641 // the call cannot be made tail.
7642 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7643 return false;
7644
7645 const MachineRegisterInfo &MRI = MF.getRegInfo();
7646 if (!parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals))
7647 return false;
7648
7649 return true;
7650}
7651
7652SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7653 SelectionDAG &DAG,
7654 MachineFrameInfo &MFI,
7655 int ClobberedFI) const {
7656 SmallVector<SDValue, 8> ArgChains;
7657 int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
7658 int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;
7659
7660 // Include the original chain at the beginning of the list. When this is
7661 // used by target LowerCall hooks, this helps legalize find the
7662 // CALLSEQ_BEGIN node.
7663 ArgChains.push_back(Elt: Chain);
7664
7665   // Add a chain value for each stack-argument load that overlaps ClobberedFI.
7666 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7667 if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U))
7668 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr()))
7669 if (FI->getIndex() < 0) {
7670 int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
7671 int64_t InLastByte = InFirstByte;
7672 InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;
7673
7674 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7675 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7676 ArgChains.push_back(Elt: SDValue(L, 1));
7677 }
7678
7679 // Build a tokenfactor for all the chains.
7680 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7681}
7682
7683bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7684 bool TailCallOpt) const {
7685 return (CallCC == CallingConv::Fast && TailCallOpt) ||
7686 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7687}
7688
7689// Check if the value is zero-extended from i1 to i8
7690static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7691 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7692 if (SizeInBits < 8)
7693 return false;
7694
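  // Only bit 0 of the low byte may be unknown or set: if bits [7:1] (mask
  // 0xFE) are all known to be zero, the value already looks like a
  // zero-extended i1 and the caller need not emit an explicit zero-extension.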
7695   APInt RequiredZero(SizeInBits, 0xFE);
7696   KnownBits Bits = DAG.computeKnownBits(Op: Arg, Depth: 4);
7697   bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7698 return ZExtBool;
7699}
7700
7701void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7702 SDNode *Node) const {
7703 // Live-in physreg copies that are glued to SMSTART are applied as
7704 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
7705 // register allocator to pass call args in callee saved regs, without extra
7706 // copies to avoid these fake clobbers of actually-preserved GPRs.
7707 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7708 MI.getOpcode() == AArch64::MSRpstatePseudo) {
7709 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7710 if (MachineOperand &MO = MI.getOperand(i: I);
7711 MO.isReg() && MO.isImplicit() && MO.isDef() &&
7712 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7713 AArch64::GPR64RegClass.contains(MO.getReg())))
7714 MI.removeOperand(OpNo: I);
7715
7716 // The SVE vector length can change when entering/leaving streaming mode.
7717 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
7718 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
7719 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7720 /*IsImplicit=*/true));
7721 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
7722 /*IsImplicit=*/true));
7723 }
7724 }
7725
7726 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
7727 // have nothing to do with VG, were it not that they are used to materialise a
7728 // frame-address. If they contain a frame-index to a scalable vector, this
7729 // will likely require an ADDVL instruction to materialise the address, thus
7730 // reading VG.
7731 const MachineFunction &MF = *MI.getMF();
7732 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
7733 (MI.getOpcode() == AArch64::ADDXri ||
7734 MI.getOpcode() == AArch64::SUBXri)) {
7735 const MachineOperand &MO = MI.getOperand(i: 1);
7736 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
7737 TargetStackID::ScalableVector)
7738 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7739 /*IsImplicit=*/true));
7740 }
7741}
7742
7743SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
7744 bool Enable, SDValue Chain,
7745 SDValue InGlue,
7746 unsigned Condition,
7747 SDValue PStateSM) const {
7748 MachineFunction &MF = DAG.getMachineFunction();
7749 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7750 FuncInfo->setHasStreamingModeChanges(true);
7751
7752 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7753 SDValue RegMask = DAG.getRegisterMask(RegMask: TRI->getSMStartStopCallPreservedMask());
7754 SDValue MSROp =
7755 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7756 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
7757 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
7758 if (Condition != AArch64SME::Always) {
7759 assert(PStateSM && "PStateSM should be defined");
7760 Ops.push_back(Elt: PStateSM);
7761 }
7762 Ops.push_back(Elt: RegMask);
7763
7764 if (InGlue)
7765 Ops.push_back(Elt: InGlue);
7766
7767 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7768 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7769}
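// The node built above is typically selected to one of the MSR pstate pseudos
// named earlier (MSRpstatesvcrImm1 or, when conditional on PStateSM,
// MSRpstatePseudo), which ultimately become smstart sm / smstop sm; the
// attached register mask models the registers clobbered by the mode switch.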
7770
7771static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
7772 const SMEAttrs &CalleeAttrs) {
7773 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
7774 CallerAttrs.hasStreamingBody())
7775 return AArch64SME::Always;
7776 if (CalleeAttrs.hasNonStreamingInterface())
7777 return AArch64SME::IfCallerIsStreaming;
7778 if (CalleeAttrs.hasStreamingInterface())
7779 return AArch64SME::IfCallerIsNonStreaming;
7780
7781 llvm_unreachable("Unsupported attributes");
7782}
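// For example, a streaming-compatible caller calling a streaming callee
// returns IfCallerIsNonStreaming: the smstart is only needed when the caller
// happened to be in non-streaming mode at run time (as recorded in PStateSM).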
7783
7784/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7785/// and add input and output parameter nodes.
7786SDValue
7787AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7788 SmallVectorImpl<SDValue> &InVals) const {
7789 SelectionDAG &DAG = CLI.DAG;
7790 SDLoc &DL = CLI.DL;
7791 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7792 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7793 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7794 SDValue Chain = CLI.Chain;
7795 SDValue Callee = CLI.Callee;
7796 bool &IsTailCall = CLI.IsTailCall;
7797 CallingConv::ID &CallConv = CLI.CallConv;
7798 bool IsVarArg = CLI.IsVarArg;
7799
7800 MachineFunction &MF = DAG.getMachineFunction();
7801 MachineFunction::CallSiteInfo CSInfo;
7802 bool IsThisReturn = false;
7803
7804 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7805 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7806 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7807 bool IsSibCall = false;
7808 bool GuardWithBTI = false;
7809
7810 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
7811 !Subtarget->noBTIAtReturnTwice()) {
7812 GuardWithBTI = FuncInfo->branchTargetEnforcement();
7813 }
7814
7815 // Analyze operands of the call, assigning locations to each operand.
7816 SmallVector<CCValAssign, 16> ArgLocs;
7817 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7818
7819 if (IsVarArg) {
7820 unsigned NumArgs = Outs.size();
7821
7822 for (unsigned i = 0; i != NumArgs; ++i) {
7823 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7824 report_fatal_error(reason: "Passing SVE types to variadic functions is "
7825 "currently not supported");
7826 }
7827 }
7828
7829 analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo);
7830
7831 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
7832 // Assign locations to each value returned by this call.
7833 SmallVector<CCValAssign, 16> RVLocs;
7834 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7835 *DAG.getContext());
7836 RetCCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
7837
7838 // Check callee args/returns for SVE registers and set calling convention
7839 // accordingly.
7840 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7841 auto HasSVERegLoc = [](CCValAssign &Loc) {
7842 if (!Loc.isRegLoc())
7843 return false;
7844 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7845 AArch64::PPRRegClass.contains(Loc.getLocReg());
7846 };
7847 if (any_of(Range&: RVLocs, P: HasSVERegLoc) || any_of(Range&: ArgLocs, P: HasSVERegLoc))
7848 CallConv = CallingConv::AArch64_SVE_VectorCall;
7849 }
7850
7851 if (IsTailCall) {
7852 // Check if it's really possible to do a tail call.
7853 IsTailCall = isEligibleForTailCallOptimization(CLI);
7854
7855 // A sibling call is one where we're under the usual C ABI and not planning
7856 // to change that but can still do a tail call:
7857 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
7858 CallConv != CallingConv::SwiftTail)
7859 IsSibCall = true;
7860
7861 if (IsTailCall)
7862 ++NumTailCalls;
7863 }
7864
7865 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
7866 report_fatal_error(reason: "failed to perform tail call elimination on a call "
7867 "site marked musttail");
7868
7869 // Get a count of how many bytes are to be pushed on the stack.
7870 unsigned NumBytes = CCInfo.getStackSize();
7871
7872 if (IsSibCall) {
7873 // Since we're not changing the ABI to make this a tail call, the memory
7874 // operands are already available in the caller's incoming argument space.
7875 NumBytes = 0;
7876 }
7877
7878 // FPDiff is the byte offset of the call's argument area from the callee's.
7879 // Stores to callee stack arguments will be placed in FixedStackSlots offset
7880 // by this amount for a tail call. In a sibling call it must be 0 because the
7881 // caller will deallocate the entire stack and the callee still expects its
7882 // arguments to begin at SP+0. Completely unused for non-tail calls.
7883 int FPDiff = 0;
7884
7885 if (IsTailCall && !IsSibCall) {
7886 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
7887
7888 // Since callee will pop argument stack as a tail call, we must keep the
7889 // popped size 16-byte aligned.
7890 NumBytes = alignTo(Value: NumBytes, Align: 16);
7891
7892 // FPDiff will be negative if this tail call requires more space than we
7893 // would automatically have in our incoming argument space. Positive if we
7894 // can actually shrink the stack.
7895 FPDiff = NumReusableBytes - NumBytes;
7896
7897 // Update the required reserved area if this is the tail call requiring the
7898 // most argument stack space.
7899 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
7900 FuncInfo->setTailCallReservedStack(-FPDiff);
7901
7902 // The stack pointer must be 16-byte aligned at all times it's used for a
7903 // memory operation, which in practice means at *all* times and in
7904 // particular across call boundaries. Therefore our own arguments started at
7905 // a 16-byte aligned SP and the delta applied for the tail call should
7906 // satisfy the same constraint.
7907 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
7908 }
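  // For example, if the caller's own incoming argument area is 32 bytes but
  // this tail call needs 48 bytes of stack arguments, FPDiff is -16 and the
  // function must reserve an extra 16 bytes (kept 16-byte aligned) for the
  // outgoing arguments of the tail call.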
7909
7910 // Determine whether we need any streaming mode changes.
7911 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
7912 if (CLI.CB)
7913 CalleeAttrs = SMEAttrs(*CLI.CB);
7914 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: CLI.Callee))
7915 CalleeAttrs = SMEAttrs(ES->getSymbol());
7916
7917 auto DescribeCallsite =
7918 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
7919 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
7920 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: CLI.Callee))
7921 R << ore::NV("Callee", ES->getSymbol());
7922 else if (CLI.CB && CLI.CB->getCalledFunction())
7923 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
7924 else
7925 R << "unknown callee";
7926 R << "'";
7927 return R;
7928 };
7929
7930 bool RequiresLazySave = CallerAttrs.requiresLazySave(Callee: CalleeAttrs);
7931 if (RequiresLazySave) {
7932 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
7933 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, Offset: TPIDR2Obj);
7934 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(FI: TPIDR2Obj,
7935 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
7936 SDValue NumZaSaveSlicesAddr =
7937 DAG.getNode(Opcode: ISD::ADD, DL, VT: TPIDR2ObjAddr.getValueType(), N1: TPIDR2ObjAddr,
7938 N2: DAG.getConstant(Val: 8, DL, VT: TPIDR2ObjAddr.getValueType()));
7939 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7940 DAG.getConstant(1, DL, MVT::i32));
7941 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
7942 MPI, MVT::i16);
7943 Chain = DAG.getNode(
7944 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
7945 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7946 TPIDR2ObjAddr);
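    // With TPIDR2_EL0 now pointing at the TPIDR2 block, any callee that uses
    // ZA is expected (per the SME lazy-save protocol) to dump ZA into the
    // buffer and clear TPIDR2_EL0, so the caller can lazily restore ZA after
    // the call.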
7947 OptimizationRemarkEmitter ORE(&MF.getFunction());
7948 ORE.emit(RemarkBuilder: [&]() {
7949 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
7950 CLI.CB)
7951 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
7952 &MF.getFunction());
7953 return DescribeCallsite(R) << " sets up a lazy save for ZA";
7954 });
7955 }
7956
7957 SDValue PStateSM;
7958 bool RequiresSMChange = CallerAttrs.requiresSMChange(Callee: CalleeAttrs);
7959 if (RequiresSMChange) {
7960 if (CallerAttrs.hasStreamingInterfaceOrBody())
7961 PStateSM = DAG.getConstant(1, DL, MVT::i64);
7962 else if (CallerAttrs.hasNonStreamingInterface())
7963 PStateSM = DAG.getConstant(0, DL, MVT::i64);
7964 else
7965 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7966 OptimizationRemarkEmitter ORE(&MF.getFunction());
7967 ORE.emit(RemarkBuilder: [&]() {
7968 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
7969 CLI.CB)
7970 : OptimizationRemarkAnalysis("sme", "SMETransition",
7971 &MF.getFunction());
7972 DescribeCallsite(R) << " requires a streaming mode transition";
7973 return R;
7974 });
7975 }
7976
7977 SDValue ZTFrameIdx;
7978 MachineFrameInfo &MFI = MF.getFrameInfo();
7979 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(Callee: CalleeAttrs);
7980
7981 // If the caller has ZT0 state which will not be preserved by the callee,
7982 // spill ZT0 before the call.
7983 if (ShouldPreserveZT0) {
7984 unsigned ZTObj = MFI.CreateSpillStackObject(Size: 64, Alignment: Align(16));
7985 ZTFrameIdx = DAG.getFrameIndex(
7986 FI: ZTObj,
7987 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
7988
7989 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
7990 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
7991 }
7992
7993 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
7994 // PSTATE.ZA before the call if there is no lazy-save active.
7995 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(Callee: CalleeAttrs);
7996 assert((!DisableZA || !RequiresLazySave) &&
7997 "Lazy-save should have PSTATE.SM=1 on entry to the function");
7998
7999 if (DisableZA)
8000 Chain = DAG.getNode(
8001 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8002 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8003 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8004
8005 // Adjust the stack pointer for the new arguments...
8006 // These operations are automatically eliminated by the prolog/epilog pass
8007 if (!IsSibCall)
8008 Chain = DAG.getCALLSEQ_START(Chain, InSize: IsTailCall ? 0 : NumBytes, OutSize: 0, DL);
8009
8010 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8011 getPointerTy(DAG.getDataLayout()));
8012
8013 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8014 SmallSet<unsigned, 8> RegsUsed;
8015 SmallVector<SDValue, 8> MemOpChains;
8016 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
8017
8018 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8019 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8020 for (const auto &F : Forwards) {
8021 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: F.VReg, VT: F.VT);
8022 RegsToPass.emplace_back(Args: F.PReg, Args&: Val);
8023 }
8024 }
8025
8026 // Walk the register/memloc assignments, inserting copies/loads.
8027 unsigned ExtraArgLocs = 0;
8028 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8029 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8030 SDValue Arg = OutVals[i];
8031 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8032
8033 // Promote the value if needed.
8034 switch (VA.getLocInfo()) {
8035 default:
8036 llvm_unreachable("Unknown loc info!");
8037 case CCValAssign::Full:
8038 break;
8039 case CCValAssign::SExt:
8040 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8041 break;
8042 case CCValAssign::ZExt:
8043 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8044 break;
8045 case CCValAssign::AExt:
8046 if (Outs[i].ArgVT == MVT::i1) {
8047 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8048 //
8049 // Check if we actually have to do this, because the value may
8050 // already be zero-extended.
8051 //
8052 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8053 // and rely on DAGCombiner to fold this, because the following
8054 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8055 //
8056 // (ext (zext x)) -> (zext x)
8057 //
8058 // This will give us (zext i32), which we cannot remove, so
8059 // try to check this beforehand.
8060 if (!checkZExtBool(Arg, DAG)) {
8061 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8062 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8063 }
8064 }
8065 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8066 break;
8067 case CCValAssign::AExtUpper:
8068 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8069 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8070 Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg,
8071 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
8072 break;
8073 case CCValAssign::BCvt:
8074 Arg = DAG.getBitcast(VT: VA.getLocVT(), V: Arg);
8075 break;
8076 case CCValAssign::Trunc:
8077 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
8078 break;
8079 case CCValAssign::FPExt:
8080 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8081 break;
8082 case CCValAssign::Indirect:
8083 bool isScalable = VA.getValVT().isScalableVT();
8084 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8085 "Indirect arguments should be scalable on most subtargets");
8086
8087 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8088 uint64_t PartSize = StoreSize;
8089 unsigned NumParts = 1;
8090 if (Outs[i].Flags.isInConsecutiveRegs()) {
8091 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
8092 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8093 ++NumParts;
8094 StoreSize *= NumParts;
8095 }
8096
8097 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(Context&: *DAG.getContext());
8098 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8099 MachineFrameInfo &MFI = MF.getFrameInfo();
8100 int FI = MFI.CreateStackObject(Size: StoreSize, Alignment, isSpillSlot: false);
8101 if (isScalable)
8102 MFI.setStackID(ObjectIdx: FI, ID: TargetStackID::ScalableVector);
8103
8104 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8105 SDValue Ptr = DAG.getFrameIndex(
8106 FI, VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
8107 SDValue SpillSlot = Ptr;
8108
8109 // Ensure we generate all stores for each tuple part, whilst updating the
8110 // pointer after each store correctly using vscale.
8111 while (NumParts) {
8112 SDValue Store = DAG.getStore(Chain, dl: DL, Val: OutVals[i], Ptr, PtrInfo: MPI);
8113 MemOpChains.push_back(Elt: Store);
8114
8115 NumParts--;
8116 if (NumParts > 0) {
8117 SDValue BytesIncrement;
8118 if (isScalable) {
8119 BytesIncrement = DAG.getVScale(
8120 DL, VT: Ptr.getValueType(),
8121 MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8122 } else {
8123 BytesIncrement = DAG.getConstant(
8124 Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8125 VT: Ptr.getValueType());
8126 }
8127 SDNodeFlags Flags;
8128 Flags.setNoUnsignedWrap(true);
8129
8130 MPI = MachinePointerInfo(MPI.getAddrSpace());
8131 Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr,
8132 N2: BytesIncrement, Flags);
8133 ExtraArgLocs++;
8134 i++;
8135 }
8136 }
8137
8138 Arg = SpillSlot;
8139 break;
8140 }
8141
8142 if (VA.isRegLoc()) {
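 // An argument with the 'returned' attribute is passed back unchanged by
 // the callee, so the call result can reuse the outgoing X0 copy (see the
 // 'this'-return register mask and LowerCallResult below).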
8143 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8144 Outs[0].VT == MVT::i64) {
8145 assert(VA.getLocVT() == MVT::i64 &&
8146 "unexpected calling convention register assignment");
8147 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8148 "unexpected use of 'returned'");
8149 IsThisReturn = true;
8150 }
8151 if (RegsUsed.count(V: VA.getLocReg())) {
8152 // If this register has already been used then we're trying to pack
8153 // parts of an [N x i32] into an X-register. The extension type will
8154 // take care of putting the two halves in the right place but we have to
8155 // combine them.
8156 SDValue &Bits =
8157 llvm::find_if(Range&: RegsToPass,
8158 P: [=](const std::pair<unsigned, SDValue> &Elt) {
8159 return Elt.first == VA.getLocReg();
8160 })
8161 ->second;
8162 Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg);
8163 // Call site info is used for entry-value tracking of the function's
8164 // parameters. For now we only track the simple cases where a parameter
8165 // is transferred through a whole register.
8166 llvm::erase_if(C&: CSInfo.ArgRegPairs,
8167 P: [&VA](MachineFunction::ArgRegPair ArgReg) {
8168 return ArgReg.Reg == VA.getLocReg();
8169 });
8170 } else {
8171 // Add an extra level of indirection for streaming mode changes by
8172 // using a pseudo copy node that cannot be rematerialised between a
8173 // smstart/smstop and the call by the simple register coalescer.
8174 if (RequiresSMChange && isPassedInFPR(VT: Arg.getValueType()))
8175 Arg = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
8176 VT: Arg.getValueType(), Operand: Arg);
8177 RegsToPass.emplace_back(Args: VA.getLocReg(), Args&: Arg);
8178 RegsUsed.insert(V: VA.getLocReg());
8179 const TargetOptions &Options = DAG.getTarget().Options;
8180 if (Options.EmitCallSiteInfo)
8181 CSInfo.ArgRegPairs.emplace_back(Args: VA.getLocReg(), Args&: i);
8182 }
8183 } else {
8184 assert(VA.isMemLoc());
8185
8186 SDValue DstAddr;
8187 MachinePointerInfo DstInfo;
8188
8189 // FIXME: This works on big-endian for composite byvals, which are the
8190 // common case. It should also work for fundamental types.
8191 uint32_t BEAlign = 0;
8192 unsigned OpSize;
8193 if (VA.getLocInfo() == CCValAssign::Indirect ||
8194 VA.getLocInfo() == CCValAssign::Trunc)
8195 OpSize = VA.getLocVT().getFixedSizeInBits();
8196 else
8197 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8198 : VA.getValVT().getSizeInBits();
8199 OpSize = (OpSize + 7) / 8;
8200 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8201 !Flags.isInConsecutiveRegs()) {
8202 if (OpSize < 8)
8203 BEAlign = 8 - OpSize;
8204 }
8205 unsigned LocMemOffset = VA.getLocMemOffset();
8206 int32_t Offset = LocMemOffset + BEAlign;
8207 SDValue PtrOff = DAG.getIntPtrConstant(Val: Offset, DL);
8208 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: StackPtr, N2: PtrOff);
8209
8210 if (IsTailCall) {
8211 Offset = Offset + FPDiff;
8212 int FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
8213
8214 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
8215 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8216
8217 // Make sure any stack arguments overlapping with where we're storing
8218 // are loaded before this eventual operation. Otherwise they'll be
8219 // clobbered.
8220 Chain = addTokenForArgument(Chain, DAG, MFI&: MF.getFrameInfo(), ClobberedFI: FI);
8221 } else {
8222 SDValue PtrOff = DAG.getIntPtrConstant(Val: Offset, DL);
8223
8224 DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: StackPtr, N2: PtrOff);
8225 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
8226 }
8227
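 // byval arguments are copied onto the stack as a whole aggregate; anything
 // else is stored directly, truncating back to the original small integer
 // type first if necessary.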
8228 if (Outs[i].Flags.isByVal()) {
8229 SDValue SizeNode =
8230 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8231 SDValue Cpy = DAG.getMemcpy(
8232 Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
8233 Alignment: Outs[i].Flags.getNonZeroByValAlign(),
8234 /*isVol = */ false, /*AlwaysInline = */ false,
8235 /*isTailCall = */ false, DstPtrInfo: DstInfo, SrcPtrInfo: MachinePointerInfo());
8236
8237 MemOpChains.push_back(Elt: Cpy);
8238 } else {
8239 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8240 // promoted to a legal register type i32, we should truncate Arg back to
8241 // i1/i8/i16.
8242 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8243 VA.getValVT() == MVT::i16)
8244 Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Arg);
8245
8246 SDValue Store = DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo);
8247 MemOpChains.push_back(Elt: Store);
8248 }
8249 }
8250 }
8251
8252 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8253 SDValue ParamPtr = StackPtr;
8254 if (IsTailCall) {
8255 // Create a dummy object at the top of the stack that can be used to get
8256 // the SP after the epilogue
8257 int FI = MF.getFrameInfo().CreateFixedObject(Size: 1, SPOffset: FPDiff, IsImmutable: true);
8258 ParamPtr = DAG.getFrameIndex(FI, VT: PtrVT);
8259 }
8260
8261 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8262 // describing the argument list. x4 contains the address of the
8263 // first stack parameter. x5 contains the size in bytes of all parameters
8264 // passed on the stack.
8265 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8266 RegsToPass.emplace_back(AArch64::X5,
8267 DAG.getConstant(NumBytes, DL, MVT::i64));
8268 }
8269
8270 if (!MemOpChains.empty())
8271 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8272
8273 SDValue InGlue;
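 // If the callee's streaming-SVE interface differs from the caller's, switch
 // the streaming mode before copying arguments into registers and making the
 // call; it is switched back after the call below.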
8274 if (RequiresSMChange) {
8275 SDValue NewChain = changeStreamingMode(
8276 DAG, DL, Enable: CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8277 Condition: getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8278 Chain = NewChain.getValue(R: 0);
8279 InGlue = NewChain.getValue(R: 1);
8280 }
8281
8282 // Build a sequence of copy-to-reg nodes chained together with token chain
8283 // and flag operands which copy the outgoing args into the appropriate regs.
8284 for (auto &RegToPass : RegsToPass) {
8285 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RegToPass.first,
8286 N: RegToPass.second, Glue: InGlue);
8287 InGlue = Chain.getValue(R: 1);
8288 }
8289
8290 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8291 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8292 // node so that legalize doesn't hack it.
8293 if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
8294 auto GV = G->getGlobal();
8295 unsigned OpFlags =
8296 Subtarget->classifyGlobalFunctionReference(GV, TM: getTargetMachine());
8297 if (OpFlags & AArch64II::MO_GOT) {
8298 Callee = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags);
8299 Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee);
8300 } else {
8301 const GlobalValue *GV = G->getGlobal();
8302 Callee = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags);
8303 }
8304 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: Callee)) {
8305 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8306 Subtarget->isTargetMachO()) ||
8307 MF.getFunction().getParent()->getRtLibUseGOT();
8308 const char *Sym = S->getSymbol();
8309 if (UseGot) {
8310 Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: AArch64II::MO_GOT);
8311 Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee);
8312 } else {
8313 Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: 0);
8314 }
8315 }
8316
8317 // We don't usually want to end the call-sequence here because we would tidy
8318 // the frame up *after* the call. However, in the ABI-changing tail-call case
8319 // we've carefully laid out the parameters so that when sp is reset they'll be
8320 // in the correct location.
8321 if (IsTailCall && !IsSibCall) {
8322 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: InGlue, DL);
8323 InGlue = Chain.getValue(R: 1);
8324 }
8325
8326 std::vector<SDValue> Ops;
8327 Ops.push_back(x: Chain);
8328 Ops.push_back(x: Callee);
8329
8330 if (IsTailCall) {
8331 // Each tail call may have to adjust the stack by a different amount, so
8332 // this information must travel along with the operation for eventual
8333 // consumption by emitEpilogue.
8334 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8335 }
8336
8337 // Add argument registers to the end of the list so that they are known live
8338 // into the call.
8339 for (auto &RegToPass : RegsToPass)
8340 Ops.push_back(x: DAG.getRegister(Reg: RegToPass.first,
8341 VT: RegToPass.second.getValueType()));
8342
8343 // Add a register mask operand representing the call-preserved registers.
8344 const uint32_t *Mask;
8345 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8346 if (IsThisReturn) {
8347 // For 'this' returns, use the X0-preserving mask if applicable
8348 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8349 if (!Mask) {
8350 IsThisReturn = false;
8351 Mask = TRI->getCallPreservedMask(MF, CallConv);
8352 }
8353 } else
8354 Mask = TRI->getCallPreservedMask(MF, CallConv);
8355
8356 if (Subtarget->hasCustomCallingConv())
8357 TRI->UpdateCustomCallPreservedMask(MF, Mask: &Mask);
8358
8359 if (TRI->isAnyArgRegReserved(MF))
8360 TRI->emitReservedArgRegCallError(MF);
8361
8362 assert(Mask && "Missing call preserved mask for calling convention");
8363 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
8364
8365 if (InGlue.getNode())
8366 Ops.push_back(x: InGlue);
8367
8368 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8369
8370 // If we're doing a tail call, use a TC_RETURN here rather than an
8371 // actual call instruction.
8372 if (IsTailCall) {
8373 MF.getFrameInfo().setHasTailCall();
8374 SDValue Ret = DAG.getNode(Opcode: AArch64ISD::TC_RETURN, DL, VTList: NodeTys, Ops);
8375
8376 if (IsCFICall)
8377 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8378
8379 DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CLI.NoMerge);
8380 DAG.addCallSiteInfo(Node: Ret.getNode(), CallInfo: std::move(CSInfo));
8381 return Ret;
8382 }
8383
8384 unsigned CallOpc = AArch64ISD::CALL;
8385 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8386 // be expanded to the call, directly followed by a special marker sequence and
8387 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8388 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CB: CLI.CB)) {
8389 assert(!IsTailCall &&
8390 "tail calls cannot be marked with clang.arc.attachedcall");
8391 CallOpc = AArch64ISD::CALL_RVMARKER;
8392
8393 // Add a target global address for the retainRV/claimRV runtime function
8394 // just before the call target.
8395 Function *ARCFn = *objcarc::getAttachedARCFunction(CB: CLI.CB);
8396 auto GA = DAG.getTargetGlobalAddress(GV: ARCFn, DL, VT: PtrVT);
8397 Ops.insert(position: Ops.begin() + 1, x: GA);
8398 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8399 CallOpc = AArch64ISD::CALL_ARM64EC_TO_X64;
8400 } else if (GuardWithBTI) {
8401 CallOpc = AArch64ISD::CALL_BTI;
8402 }
8403
8404 // Returns a chain and a flag for retval copy to use.
8405 Chain = DAG.getNode(Opcode: CallOpc, DL, VTList: NodeTys, Ops);
8406
8407 if (IsCFICall)
8408 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8409
8410 DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CLI.NoMerge);
8411 InGlue = Chain.getValue(R: 1);
8412 DAG.addCallSiteInfo(Node: Chain.getNode(), CallInfo: std::move(CSInfo));
8413
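 // If the calling convention makes the callee responsible for popping its own
 // stack arguments (e.g. fastcc with guaranteed tail-call optimisation),
 // report the 16-byte-aligned amount to CALLSEQ_END.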
8414 uint64_t CalleePopBytes =
8415 DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt) ? alignTo(Value: NumBytes, Align: 16) : 0;
8416
8417 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: CalleePopBytes, Glue: InGlue, DL);
8418 InGlue = Chain.getValue(R: 1);
8419
8420 // Handle result values, copying them out of physregs into vregs that we
8421 // return.
8422 SDValue Result = LowerCallResult(
8423 Chain, InGlue, CallConv, isVarArg: IsVarArg, RVLocs, DL, DAG, InVals, isThisReturn: IsThisReturn,
8424 ThisVal: IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8425
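 // Thread the glue from the result copies into the streaming-mode change
 // below so that it is ordered after them.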
8426 if (!Ins.empty())
8427 InGlue = Result.getValue(R: Result->getNumValues() - 1);
8428
8429 if (RequiresSMChange) {
8430 assert(PStateSM && "Expected a PStateSM to be set");
8431 Result = changeStreamingMode(
8432 DAG, DL, Enable: !CalleeAttrs.hasStreamingInterface(), Chain: Result, InGlue,
8433 Condition: getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8434 }
8435
8436 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8437 // Unconditionally resume ZA.
8438 Result = DAG.getNode(
8439 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8440 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8441 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8442
8443 if (ShouldPreserveZT0)
8444 Result =
8445 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8446 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8447
8448 if (RequiresLazySave) {
8449 // Conditionally restore the lazy save using a pseudo node.
8450 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
8451 SDValue RegMask = DAG.getRegisterMask(
8452 RegMask: TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8453 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8454 Sym: "__arm_tpidr2_restore", VT: getPointerTy(DL: DAG.getDataLayout()));
8455 SDValue TPIDR2_EL0 = DAG.getNode(
8456 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8457 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8458
8459 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8460 // RESTORE_ZA pseudo.
8461 SDValue Glue;
8462 SDValue TPIDR2Block = DAG.getFrameIndex(
8463 FI, VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
8464 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8465 Result =
8466 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8467 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8468 RestoreRoutine, RegMask, Result.getValue(1)});
8469
8470 // Finally reset the TPIDR2_EL0 register to 0.
8471 Result = DAG.getNode(
8472 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8473 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8474 DAG.getConstant(0, DL, MVT::i64));
8475 }
8476
8477 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8478 for (unsigned I = 0; I < InVals.size(); ++I) {
8479 // The smstart/smstop is chained as part of the call, but when the
8480 // resulting chain is discarded (which happens when the call is not part
8481 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8482 // smstart/smstop is chained to the result value. We can do that by doing
8483 // a vreg -> vreg copy.
8484 Register Reg = MF.getRegInfo().createVirtualRegister(
8485 RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT()));
8486 SDValue X = DAG.getCopyToReg(Chain: Result, dl: DL, Reg, N: InVals[I]);
8487 InVals[I] = DAG.getCopyFromReg(Chain: X, dl: DL, Reg,
8488 VT: InVals[I].getValueType());
8489 }
8490 }
8491
8492 return Result;
8493}
8494
8495bool AArch64TargetLowering::CanLowerReturn(
8496 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8497 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8498 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
8499 SmallVector<CCValAssign, 16> RVLocs;
8500 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8501 return CCInfo.CheckReturn(Outs, Fn: RetCC);
8502}
8503
8504SDValue
8505AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8506 bool isVarArg,
8507 const SmallVectorImpl<ISD::OutputArg> &Outs,
8508 const SmallVectorImpl<SDValue> &OutVals,
8509 const SDLoc &DL, SelectionDAG &DAG) const {
8510 auto &MF = DAG.getMachineFunction();
8511 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8512
8513 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
8514 SmallVector<CCValAssign, 16> RVLocs;
8515 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8516 CCInfo.AnalyzeReturn(Outs, Fn: RetCC);
8517
8518 // Copy the result values into the output registers.
8519 SDValue Glue;
8520 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8521 SmallSet<unsigned, 4> RegsUsed;
8522 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8523 ++i, ++realRVLocIdx) {
8524 CCValAssign &VA = RVLocs[i];
8525 assert(VA.isRegLoc() && "Can only return in registers!");
8526 SDValue Arg = OutVals[realRVLocIdx];
8527
8528 switch (VA.getLocInfo()) {
8529 default:
8530 llvm_unreachable("Unknown loc info!");
8531 case CCValAssign::Full:
8532 if (Outs[i].ArgVT == MVT::i1) {
8533 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8534 // value. This is strictly redundant on Darwin (which uses "zeroext
8535 // i1"), but will be optimised out before ISel.
8536 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8537 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8538 }
8539 break;
8540 case CCValAssign::BCvt:
8541 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
8542 break;
8543 case CCValAssign::AExt:
8544 case CCValAssign::ZExt:
8545 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
8546 break;
8547 case CCValAssign::AExtUpper:
8548 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8549 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
8550 Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg,
8551 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
8552 break;
8553 }
8554
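 // As in LowerCall, a repeated location register means parts of an
 // [N x i32] return are being packed into one X register; OR the pieces
 // together.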
8555 if (RegsUsed.count(V: VA.getLocReg())) {
8556 SDValue &Bits =
8557 llvm::find_if(Range&: RetVals, P: [=](const std::pair<unsigned, SDValue> &Elt) {
8558 return Elt.first == VA.getLocReg();
8559 })->second;
8560 Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg);
8561 } else {
8562 RetVals.emplace_back(Args: VA.getLocReg(), Args&: Arg);
8563 RegsUsed.insert(V: VA.getLocReg());
8564 }
8565 }
8566
8567 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8568
8569 // Emit SMSTOP before returning from a locally streaming function
8570 SMEAttrs FuncAttrs(MF.getFunction());
8571 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8572 if (FuncAttrs.hasStreamingCompatibleInterface()) {
8573 Register Reg = FuncInfo->getPStateSMReg();
8574 assert(Reg.isValid() && "PStateSM Register is invalid");
8575 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8576 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8577 /*Glue*/ InGlue: SDValue(),
8578 Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM);
8579 } else
8580 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8581 /*Glue*/ InGlue: SDValue(), Condition: AArch64SME::Always);
8582 Glue = Chain.getValue(R: 1);
8583 }
8584
8585 SmallVector<SDValue, 4> RetOps(1, Chain);
8586 for (auto &RetVal : RetVals) {
8587 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8588 isPassedInFPR(VT: RetVal.second.getValueType()))
8589 RetVal.second = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
8590 VT: RetVal.second.getValueType(), Operand: RetVal.second);
8591 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetVal.first, N: RetVal.second, Glue);
8592 Glue = Chain.getValue(R: 1);
8593 RetOps.push_back(
8594 Elt: DAG.getRegister(Reg: RetVal.first, VT: RetVal.second.getValueType()));
8595 }
8596
8597 // Windows AArch64 ABIs require that for returning structs by value we copy
8598 // the sret argument into X0 for the return.
8599 // We saved the argument into a virtual register in the entry block,
8600 // so now we copy the value out and into X0.
8601 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8602 SDValue Val = DAG.getCopyFromReg(Chain: RetOps[0], dl: DL, Reg: SRetReg,
8603 VT: getPointerTy(DL: MF.getDataLayout()));
8604
8605 unsigned RetValReg = AArch64::X0;
8606 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8607 RetValReg = AArch64::X8;
8608 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetValReg, N: Val, Glue);
8609 Glue = Chain.getValue(R: 1);
8610
8611 RetOps.push_back(
8612 Elt: DAG.getRegister(Reg: RetValReg, VT: getPointerTy(DL: DAG.getDataLayout())));
8613 }
8614
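 // Make any callee-saved registers that were preserved via copies implicit
 // operands of the return so they remain live until the return.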
8615 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(MF: &MF);
8616 if (I) {
8617 for (; *I; ++I) {
8618 if (AArch64::GPR64RegClass.contains(*I))
8619 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
8620 else if (AArch64::FPR64RegClass.contains(*I))
8621 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::getFloatingPointVT(BitWidth: 64)));
8622 else
8623 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
8624 }
8625 }
8626
8627 RetOps[0] = Chain; // Update chain.
8628
8629 // Add the glue if we have it.
8630 if (Glue.getNode())
8631 RetOps.push_back(Elt: Glue);
8632
8633 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8634 // ARM64EC entry thunks use a special return sequence: instead of a regular
8635 // "ret" instruction, they need to explicitly call the emulator.
8636 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8637 SDValue Arm64ECRetDest =
8638 DAG.getExternalSymbol(Sym: "__os_arm64x_dispatch_ret", VT: PtrVT);
8639 Arm64ECRetDest =
8640 getAddr(N: cast<ExternalSymbolSDNode>(Val&: Arm64ECRetDest), DAG, Flags: 0);
8641 Arm64ECRetDest = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Arm64ECRetDest,
8642 PtrInfo: MachinePointerInfo());
8643 RetOps.insert(I: RetOps.begin() + 1, Elt: Arm64ECRetDest);
8644 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
8645 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
8646 }
8647
8648 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
8649}
8650
8651//===----------------------------------------------------------------------===//
8652// Other Lowering Code
8653//===----------------------------------------------------------------------===//
8654
8655SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8656 SelectionDAG &DAG,
8657 unsigned Flag) const {
8658 return DAG.getTargetGlobalAddress(GV: N->getGlobal(), DL: SDLoc(N), VT: Ty,
8659 offset: N->getOffset(), TargetFlags: Flag);
8660}
8661
8662SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8663 SelectionDAG &DAG,
8664 unsigned Flag) const {
8665 return DAG.getTargetJumpTable(JTI: N->getIndex(), VT: Ty, TargetFlags: Flag);
8666}
8667
8668SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8669 SelectionDAG &DAG,
8670 unsigned Flag) const {
8671 return DAG.getTargetConstantPool(C: N->getConstVal(), VT: Ty, Align: N->getAlign(),
8672 Offset: N->getOffset(), TargetFlags: Flag);
8673}
8674
8675SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8676 SelectionDAG &DAG,
8677 unsigned Flag) const {
8678 return DAG.getTargetBlockAddress(BA: N->getBlockAddress(), VT: Ty, Offset: 0, TargetFlags: Flag);
8679}
8680
8681SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8682 SelectionDAG &DAG,
8683 unsigned Flag) const {
8684 return DAG.getTargetExternalSymbol(Sym: N->getSymbol(), VT: Ty, TargetFlags: Flag);
8685}
8686
8687// (loadGOT sym)
8688template <class NodeTy>
8689SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8690 unsigned Flags) const {
8691 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8692 SDLoc DL(N);
8693 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
8694 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8695 // FIXME: Once remat is capable of dealing with instructions with register
8696 // operands, expand this into two nodes instead of using a wrapper node.
8697 return DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: Ty, Operand: GotAddr);
8698}
8699
8700// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
8701template <class NodeTy>
8702SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
8703 unsigned Flags) const {
8704 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
8705 SDLoc DL(N);
8706 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
8707 const unsigned char MO_NC = AArch64II::MO_NC;
8708 return DAG.getNode(
8709 AArch64ISD::WrapperLarge, DL, Ty,
8710 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
8711 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
8712 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
8713 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
8714}
8715
8716// (addlow (adrp %hi(sym)) %lo(sym))
8717template <class NodeTy>
8718SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
8719 unsigned Flags) const {
8720 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
8721 SDLoc DL(N);
8722 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
8723 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
8724 SDValue Lo = getTargetNode(N, Ty, DAG,
8725 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
8726 SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: Ty, Operand: Hi);
8727 return DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: Ty, N1: ADRP, N2: Lo);
8728}
8729
8730// (adr sym)
8731template <class NodeTy>
8732SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8733 unsigned Flags) const {
8734 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8735 SDLoc DL(N);
8736 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
8737 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8738 return DAG.getNode(Opcode: AArch64ISD::ADR, DL, VT: Ty, Operand: Sym);
8739}
8740
8741SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
8742 SelectionDAG &DAG) const {
8743 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Val&: Op);
8744 const GlobalValue *GV = GN->getGlobal();
8745 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM: getTargetMachine());
8746
8747 if (OpFlags != AArch64II::MO_NO_FLAG)
8748 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
8749 "unexpected offset in global node");
8750
8751 // This also catches the large code model case for Darwin, and tiny code
8752 // model with got relocations.
8753 if ((OpFlags & AArch64II::MO_GOT) != 0) {
8754 return getGOT(N: GN, DAG, Flags: OpFlags);
8755 }
8756
8757 SDValue Result;
8758 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8759 !getTargetMachine().isPositionIndependent()) {
8760 Result = getAddrLarge(N: GN, DAG, Flags: OpFlags);
8761 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8762 Result = getAddrTiny(N: GN, DAG, Flags: OpFlags);
8763 } else {
8764 Result = getAddr(N: GN, DAG, Flags: OpFlags);
8765 }
8766 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8767 SDLoc DL(GN);
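 // dllimport and COFF-stub globals are referenced through a pointer, so an
 // extra load is needed to obtain the global's real address.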
8768 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
8769 Result = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Result,
8770 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()));
8771 return Result;
8772}
8773
8774/// Convert a TLS address reference into the correct sequence of loads
8775/// and calls to compute the variable's address (for Darwin, currently) and
8776/// return an SDValue containing the final node.
8777///
8778/// Darwin only has one TLS scheme which must be capable of dealing with the
8779/// fully general situation, in the worst case. This means:
8780/// + "extern __thread" declaration.
8781/// + Defined in a possibly unknown dynamic library.
8782///
8783/// The general system is that each __thread variable has a [3 x i64] descriptor
8784/// which contains information used by the runtime to calculate the address. The
8785/// only part of this the compiler needs to know about is the first xword, which
8786/// contains a function pointer that must be called with the address of the
8787/// entire descriptor in "x0".
8788///
8789/// Since this descriptor may be in a different unit, in general even the
8790/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8791/// is:
8792/// adrp x0, _var@TLVPPAGE
8793/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
8794/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
8795/// ; the function pointer
8796/// blr x1 ; Uses descriptor address in x0
8797/// ; Address of _var is now in x0.
8798///
8799/// If the address of _var's descriptor *is* known to the linker, then it can
8800/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8801/// a slight efficiency gain.
8802SDValue
8803AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8804 SelectionDAG &DAG) const {
8805 assert(Subtarget->isTargetDarwin() &&
8806 "This function expects a Darwin target");
8807
8808 SDLoc DL(Op);
8809 MVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8810 MVT PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
8811 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: Op)->getGlobal();
8812
8813 SDValue TLVPAddr =
8814 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
8815 SDValue DescAddr = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TLVPAddr);
8816
8817 // The first entry in the descriptor is a function pointer that we must call
8818 // to obtain the address of the variable.
8819 SDValue Chain = DAG.getEntryNode();
8820 SDValue FuncTLVGet = DAG.getLoad(
8821 VT: PtrMemVT, dl: DL, Chain, Ptr: DescAddr,
8822 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()),
8823 Alignment: Align(PtrMemVT.getSizeInBits() / 8),
8824 MMOFlags: MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
8825 Chain = FuncTLVGet.getValue(R: 1);
8826
8827 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8828 FuncTLVGet = DAG.getZExtOrTrunc(Op: FuncTLVGet, DL, VT: PtrVT);
8829
8830 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8831 MFI.setAdjustsStack(true);
8832
8833 // TLS calls preserve all registers except those that absolutely must be
8834 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8835 // silly).
8836 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8837 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8838 if (Subtarget->hasCustomCallingConv())
8839 TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask);
8840
8841 // Finally, we can make the call. This is just a degenerate version of a
8842 // normal AArch64 call node: x0 takes the address of the descriptor, and
8843 // returns the address of the variable in this thread.
8844 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8845 Chain =
8846 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8847 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8848 DAG.getRegisterMask(Mask), Chain.getValue(1));
8849 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8850}
8851
8852/// Convert a thread-local variable reference into a sequence of instructions to
8853/// compute the variable's address for the local exec TLS model of ELF targets.
8854/// The sequence depends on the maximum TLS area size.
8855SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
8856 SDValue ThreadBase,
8857 const SDLoc &DL,
8858 SelectionDAG &DAG) const {
8859 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8860 SDValue TPOff, Addr;
8861
8862 switch (DAG.getTarget().Options.TLSSize) {
8863 default:
8864 llvm_unreachable("Unexpected TLS size");
8865
8866 case 12: {
8867 // mrs x0, TPIDR_EL0
8868 // add x0, x0, :tprel_lo12:a
8869 SDValue Var = DAG.getTargetGlobalAddress(
8870 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
8871 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8872 Var,
8873 DAG.getTargetConstant(0, DL, MVT::i32)),
8874 0);
8875 }
8876
8877 case 24: {
8878 // mrs x0, TPIDR_EL0
8879 // add x0, x0, :tprel_hi12:a
8880 // add x0, x0, :tprel_lo12_nc:a
8881 SDValue HiVar = DAG.getTargetGlobalAddress(
8882 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
8883 SDValue LoVar = DAG.getTargetGlobalAddress(
8884 GV, DL, VT: PtrVT, offset: 0,
8885 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8886 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8887 HiVar,
8888 DAG.getTargetConstant(0, DL, MVT::i32)),
8889 0);
8890 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
8891 LoVar,
8892 DAG.getTargetConstant(0, DL, MVT::i32)),
8893 0);
8894 }
8895
8896 case 32: {
8897 // mrs x1, TPIDR_EL0
8898 // movz x0, #:tprel_g1:a
8899 // movk x0, #:tprel_g0_nc:a
8900 // add x0, x1, x0
8901 SDValue HiVar = DAG.getTargetGlobalAddress(
8902 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1);
8903 SDValue LoVar = DAG.getTargetGlobalAddress(
8904 GV, DL, VT: PtrVT, offset: 0,
8905 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8906 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8907 DAG.getTargetConstant(16, DL, MVT::i32)),
8908 0);
8909 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8910 DAG.getTargetConstant(0, DL, MVT::i32)),
8911 0);
8912 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
8913 }
8914
8915 case 48: {
8916 // mrs x1, TPIDR_EL0
8917 // movz x0, #:tprel_g2:a
8918 // movk x0, #:tprel_g1_nc:a
8919 // movk x0, #:tprel_g0_nc:a
8920 // add x0, x1, x0
8921 SDValue HiVar = DAG.getTargetGlobalAddress(
8922 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G2);
8923 SDValue MiVar = DAG.getTargetGlobalAddress(
8924 GV, DL, VT: PtrVT, offset: 0,
8925 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
8926 SDValue LoVar = DAG.getTargetGlobalAddress(
8927 GV, DL, VT: PtrVT, offset: 0,
8928 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8929 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8930 DAG.getTargetConstant(32, DL, MVT::i32)),
8931 0);
8932 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
8933 DAG.getTargetConstant(16, DL, MVT::i32)),
8934 0);
8935 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8936 DAG.getTargetConstant(0, DL, MVT::i32)),
8937 0);
8938 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
8939 }
8940 }
8941}
8942
8943/// When accessing thread-local variables under either the general-dynamic or
8944/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
8945/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
8946/// is a function pointer to carry out the resolution.
8947///
8948/// The sequence is:
8949/// adrp x0, :tlsdesc:var
8950/// ldr x1, [x0, #:tlsdesc_lo12:var]
8951/// add x0, x0, #:tlsdesc_lo12:var
8952/// .tlsdesccall var
8953/// blr x1
8954/// (TPIDR_EL0 offset now in x0)
8955///
8956/// The above sequence must be produced unscheduled, to enable the linker to
8957/// optimize/relax this sequence.
8958/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
8959/// above sequence, and expanded really late in the compilation flow, to ensure
8960/// the sequence is produced as per above.
8961SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
8962 const SDLoc &DL,
8963 SelectionDAG &DAG) const {
8964 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8965
8966 SDValue Chain = DAG.getEntryNode();
8967 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8968
8969 Chain =
8970 DAG.getNode(Opcode: AArch64ISD::TLSDESC_CALLSEQ, DL, VTList: NodeTys, Ops: {Chain, SymAddr});
8971 SDValue Glue = Chain.getValue(R: 1);
8972
8973 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
8974}
8975
8976SDValue
8977AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
8978 SelectionDAG &DAG) const {
8979 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
8980
8981 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
8982
8983 TLSModel::Model Model = getTargetMachine().getTLSModel(GV: GA->getGlobal());
8984
8985 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
8986 if (Model == TLSModel::LocalDynamic)
8987 Model = TLSModel::GeneralDynamic;
8988 }
8989
8990 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8991 Model != TLSModel::LocalExec)
8992 report_fatal_error(reason: "ELF TLS only supported in small memory model or "
8993 "in local exec TLS model");
8994 // Different choices can be made for the maximum size of the TLS area for a
8995 // module. For the small address model, the default TLS size is 16MiB and the
8996 // maximum TLS size is 4GiB.
8997 // FIXME: add tiny and large code model support for TLS access models other
8998 // than local exec. We currently generate the same code as small for tiny,
8999 // which may be larger than needed.
9000
9001 SDValue TPOff;
9002 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9003 SDLoc DL(Op);
9004 const GlobalValue *GV = GA->getGlobal();
9005
9006 SDValue ThreadBase = DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL, VT: PtrVT);
9007
9008 if (Model == TLSModel::LocalExec) {
9009 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9010 } else if (Model == TLSModel::InitialExec) {
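 // Initial-exec: the variable's offset from the thread pointer is loaded
 // from the GOT.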
9011 TPOff = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
9012 TPOff = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TPOff);
9013 } else if (Model == TLSModel::LocalDynamic) {
9014 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
9015 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
9016 // the beginning of the module's TLS region, followed by a DTPREL offset
9017 // calculation.
9018
9019 // These accesses will need deduplicating if there's more than one.
9020 AArch64FunctionInfo *MFI =
9021 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9022 MFI->incNumLocalDynamicTLSAccesses();
9023
9024 // The call needs a relocation too for linker relaxation. It doesn't make
9025 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9026 // the address.
9027 SDValue SymAddr = DAG.getTargetExternalSymbol(Sym: "_TLS_MODULE_BASE_", VT: PtrVT,
9028 TargetFlags: AArch64II::MO_TLS);
9029
9030 // Now we can calculate the offset from TPIDR_EL0 to this module's
9031 // thread-local area.
9032 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9033
9034 // Now use :dtprel_whatever: operations to calculate this variable's offset
9035 // in its thread-storage area.
9036 SDValue HiVar = DAG.getTargetGlobalAddress(
9037 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9038 SDValue LoVar = DAG.getTargetGlobalAddress(
9039 GV, DL, MVT::i64, 0,
9040 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9041
9042 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9043 DAG.getTargetConstant(0, DL, MVT::i32)),
9044 0);
9045 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9046 DAG.getTargetConstant(0, DL, MVT::i32)),
9047 0);
9048 } else if (Model == TLSModel::GeneralDynamic) {
9049 // The call needs a relocation too for linker relaxation. It doesn't make
9050 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9051 // the address.
9052 SDValue SymAddr =
9053 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
9054
9055 // Finally we can make a call to calculate the offset from tpidr_el0.
9056 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9057 } else
9058 llvm_unreachable("Unsupported ELF TLS access model");
9059
9060 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
9061}
9062
9063SDValue
9064AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9065 SelectionDAG &DAG) const {
9066 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9067
9068 SDValue Chain = DAG.getEntryNode();
9069 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9070 SDLoc DL(Op);
9071
9072 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9073
9074 // Load the ThreadLocalStoragePointer from the TEB
9075 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9076 SDValue TLSArray =
9077 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TEB, N2: DAG.getIntPtrConstant(Val: 0x58, DL));
9078 TLSArray = DAG.getLoad(VT: PtrVT, dl: DL, Chain, Ptr: TLSArray, PtrInfo: MachinePointerInfo());
9079 Chain = TLSArray.getValue(R: 1);
9080
9081 // Load the TLS index from the C runtime.
9082 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9083 // This also does the same as LOADgot, but using a generic i32 load,
9084 // while LOADgot only loads i64.
9085 SDValue TLSIndexHi =
9086 DAG.getTargetExternalSymbol(Sym: "_tls_index", VT: PtrVT, TargetFlags: AArch64II::MO_PAGE);
9087 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9088 Sym: "_tls_index", VT: PtrVT, TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9089 SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: PtrVT, Operand: TLSIndexHi);
9090 SDValue TLSIndex =
9091 DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: ADRP, N2: TLSIndexLo);
9092 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9093 Chain = TLSIndex.getValue(R: 1);
9094
9095 // The pointer to the thread's TLS data area is found by scaling the TLS
9096 // index by the pointer size (8 bytes) and adding it to the TLS array base.
9097 TLSIndex = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: PtrVT, Operand: TLSIndex);
9098 SDValue Slot = DAG.getNode(Opcode: ISD::SHL, DL, VT: PtrVT, N1: TLSIndex,
9099 N2: DAG.getConstant(Val: 3, DL, VT: PtrVT));
9100 SDValue TLS = DAG.getLoad(VT: PtrVT, dl: DL, Chain,
9101 Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TLSArray, N2: Slot),
9102 PtrInfo: MachinePointerInfo());
9103 Chain = TLS.getValue(R: 1);
9104
9105 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
9106 const GlobalValue *GV = GA->getGlobal();
9107 SDValue TGAHi = DAG.getTargetGlobalAddress(
9108 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
9109 SDValue TGALo = DAG.getTargetGlobalAddress(
9110 GV, DL, VT: PtrVT, offset: 0,
9111 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9112
9113 // Add the offset from the start of the .tls section (section base).
9114 SDValue Addr =
9115 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9116 DAG.getTargetConstant(0, DL, MVT::i32)),
9117 0);
9118 Addr = DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: Addr, N2: TGALo);
9119 return Addr;
9120}
9121
9122SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9123 SelectionDAG &DAG) const {
9124 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
9125 if (DAG.getTarget().useEmulatedTLS())
9126 return LowerToTLSEmulatedModel(GA, DAG);
9127
9128 if (Subtarget->isTargetDarwin())
9129 return LowerDarwinGlobalTLSAddress(Op, DAG);
9130 if (Subtarget->isTargetELF())
9131 return LowerELFGlobalTLSAddress(Op, DAG);
9132 if (Subtarget->isTargetWindows())
9133 return LowerWindowsGlobalTLSAddress(Op, DAG);
9134
9135 llvm_unreachable("Unexpected platform trying to use TLS");
9136}
9137
9138// Looks through \param Val to determine the bit that can be used to
9139// check the sign of the value. It returns the unextended value and
9140// the sign bit position.
9141std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9142 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9143 return {Val.getOperand(i: 0),
9144 cast<VTSDNode>(Val: Val.getOperand(i: 1))->getVT().getFixedSizeInBits() -
9145 1};
9146
9147 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9148 return {Val.getOperand(i: 0),
9149 Val.getOperand(i: 0)->getValueType(ResNo: 0).getFixedSizeInBits() - 1};
9150
9151 return {Val, Val.getValueSizeInBits() - 1};
9152}
9153
9154SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9155 SDValue Chain = Op.getOperand(i: 0);
9156 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 1))->get();
9157 SDValue LHS = Op.getOperand(i: 2);
9158 SDValue RHS = Op.getOperand(i: 3);
9159 SDValue Dest = Op.getOperand(i: 4);
9160 SDLoc dl(Op);
9161
9162 MachineFunction &MF = DAG.getMachineFunction();
9163 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9164 // will not be produced, as they are conditional branch instructions that do
9165 // not set flags.
9166 bool ProduceNonFlagSettingCondBr =
9167 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9168
9169 // Handle f128 first, since lowering it will result in comparing the return
9170 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9171 // is expecting to deal with.
9172 if (LHS.getValueType() == MVT::f128) {
9173 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9174
9175 // If softenSetCCOperands returned a scalar, we need to compare the result
9176 // against zero to select between true and false values.
9177 if (!RHS.getNode()) {
9178 RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHS.getValueType());
9179 CC = ISD::SETNE;
9180 }
9181 }
9182
9183 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9184 // instruction.
9185 if (ISD::isOverflowIntrOpRes(Op: LHS) && isOneConstant(V: RHS) &&
9186 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9187 // Only lower legal XALUO ops.
9188 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: LHS->getValueType(ResNo: 0)))
9189 return SDValue();
9190
9191 // The actual operation with overflow check.
9192 AArch64CC::CondCode OFCC;
9193 SDValue Value, Overflow;
9194 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: LHS.getValue(R: 0), DAG);
9195
9196 if (CC == ISD::SETNE)
9197 OFCC = getInvertedCondCode(Code: OFCC);
9198 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9199
9200 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9201 Overflow);
9202 }
9203
9204 if (LHS.getValueType().isInteger()) {
9205 assert((LHS.getValueType() == RHS.getValueType()) &&
9206 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9207
9208 // If the RHS of the comparison is zero, we can potentially fold this
9209 // to a specialized branch.
9210 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS);
9211 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9212 if (CC == ISD::SETEQ) {
9213 // See if we can use a TBZ to fold in an AND as well.
9214 // TBZ has a smaller branch displacement than CBZ. If the offset is
9215 // out of bounds, a late MI-layer pass rewrites branches.
9216 // 403.gcc is an example that hits this case.
9217 if (LHS.getOpcode() == ISD::AND &&
9218 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
9219 isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) {
9220 SDValue Test = LHS.getOperand(i: 0);
9221 uint64_t Mask = LHS.getConstantOperandVal(i: 1);
9222 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9223 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9224 Dest);
9225 }
9226
9227 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9228 } else if (CC == ISD::SETNE) {
9229 // See if we can use a TBZ to fold in an AND as well.
9230 // TBZ has a smaller branch displacement than CBZ. If the offset is
9231 // out of bounds, a late MI-layer pass rewrites branches.
9232 // 403.gcc is an example that hits this case.
9233 if (LHS.getOpcode() == ISD::AND &&
9234 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
9235 isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) {
9236 SDValue Test = LHS.getOperand(i: 0);
9237 uint64_t Mask = LHS.getConstantOperandVal(i: 1);
9238 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9239 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9240 Dest);
9241 }
9242
9243 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9244 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9245 // Don't combine AND since emitComparison converts the AND to an ANDS
9246 // (a.k.a. TST) and the test in the test bit and branch instruction
9247 // becomes redundant. This would also increase register pressure.
9248 uint64_t SignBitPos;
9249 std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS);
9250 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9251 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9252 }
9253 }
9254 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9255 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9256 // Don't combine AND since emitComparison converts the AND to an ANDS
9257 // (a.k.a. TST) and the test in the test bit and branch instruction
9258 // becomes redundant. This would also increase register pressure.
9259 uint64_t SignBitPos;
9260 std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS);
9261 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9262 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9263 }
9264
9265 SDValue CCVal;
9266 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl);
9267 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9268 Cmp);
9269 }
9270
9271 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9272 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9273
9274 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9275 // clean. Some of them require two branches to implement.
9276 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9277 AArch64CC::CondCode CC1, CC2;
9278 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
9279 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9280 SDValue BR1 =
9281 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9282 if (CC2 != AArch64CC::AL) {
9283 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9284 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9285 Cmp);
9286 }
9287
9288 return BR1;
9289}
9290
9291SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9292 SelectionDAG &DAG) const {
9293 if (!Subtarget->hasNEON())
9294 return SDValue();
9295
9296 EVT VT = Op.getValueType();
9297 EVT IntVT = VT.changeTypeToInteger();
9298 SDLoc DL(Op);
9299
9300 SDValue In1 = Op.getOperand(i: 0);
9301 SDValue In2 = Op.getOperand(i: 1);
9302 EVT SrcVT = In2.getValueType();
9303
9304 if (!SrcVT.bitsEq(VT))
9305 In2 = DAG.getFPExtendOrRound(Op: In2, DL, VT);
9306
9307 if (VT.isScalableVector())
9308 IntVT =
9309 getPackedSVEVectorVT(VT: VT.getVectorElementType().changeTypeToInteger());
9310
9311 if (VT.isFixedLengthVector() &&
9312 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
9313 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9314
9315 In1 = convertToScalableVector(DAG, VT: ContainerVT, V: In1);
9316 In2 = convertToScalableVector(DAG, VT: ContainerVT, V: In2);
9317
9318 SDValue Res = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: ContainerVT, N1: In1, N2: In2);
9319 return convertFromScalableVector(DAG, VT, V: Res);
9320 }
9321
9322 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9323 if (VT.isScalableVector())
9324 return getSVESafeBitCast(VT, Op, DAG);
9325
9326 return DAG.getBitcast(VT, V: Op);
9327 };
9328
9329 SDValue VecVal1, VecVal2;
9330 EVT VecVT;
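 // For scalar inputs the value is inserted into the right subregister of an
 // undef vector so the BSP (bitwise select) below operates on a full SIMD
 // register; vector inputs are simply bitcast to the integer vector type.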
9331 auto SetVecVal = [&](int Idx = -1) {
9332 if (!VT.isVector()) {
9333 VecVal1 =
9334 DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In1);
9335 VecVal2 =
9336 DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In2);
9337 } else {
9338 VecVal1 = BitCast(VecVT, In1, DAG);
9339 VecVal2 = BitCast(VecVT, In2, DAG);
9340 }
9341 };
9342 if (VT.isVector()) {
9343 VecVT = IntVT;
9344 SetVecVal();
9345 } else if (VT == MVT::f64) {
9346 VecVT = MVT::v2i64;
9347 SetVecVal(AArch64::dsub);
9348 } else if (VT == MVT::f32) {
9349 VecVT = MVT::v4i32;
9350 SetVecVal(AArch64::ssub);
9351 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9352 VecVT = MVT::v8i16;
9353 SetVecVal(AArch64::hsub);
9354 } else {
9355 llvm_unreachable("Invalid type for copysign!");
9356 }
9357
9358 unsigned BitWidth = In1.getScalarValueSizeInBits();
9359 SDValue SignMaskV = DAG.getConstant(Val: ~APInt::getSignMask(BitWidth), DL, VT: VecVT);
9360
9361 // We want to materialize a mask with every bit but the high bit set, but the
9362 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9363 // 64-bit elements. Instead, materialize all bits set and then negate that.
9364 if (VT == MVT::f64 || VT == MVT::v2f64) {
9365 SignMaskV = DAG.getConstant(Val: APInt::getAllOnes(numBits: BitWidth), DL, VT: VecVT);
9366 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9367 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9368 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9369 }
9370
9371 SDValue BSP =
9372 DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT: VecVT, N1: SignMaskV, N2: VecVal1, N3: VecVal2);
9373 if (VT == MVT::f16 || VT == MVT::bf16)
9374 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9375 if (VT == MVT::f32)
9376 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9377 if (VT == MVT::f64)
9378 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9379
9380 return BitCast(VT, BSP, DAG);
9381}
9382
9383SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9384 SelectionDAG &DAG) const {
9385 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9386 Attribute::NoImplicitFloat))
9387 return SDValue();
9388
9389 if (!Subtarget->hasNEON())
9390 return SDValue();
9391
9392 bool IsParity = Op.getOpcode() == ISD::PARITY;
9393 SDValue Val = Op.getOperand(i: 0);
9394 SDLoc DL(Op);
9395 EVT VT = Op.getValueType();
9396
9397 // For i32, computing parity with a short sequence of EORs is more efficient
9398 // than going through the floating-point/SIMD registers.
9399 if (VT == MVT::i32 && IsParity)
9400 return SDValue();
9401
9402 // If there is no CNT instruction available, GPR popcount can
9403 // be more efficiently lowered to the following sequence that uses
9404 // AdvSIMD registers/instructions as long as the copies to/from
9405 // the AdvSIMD registers are cheap.
9406 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9407 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9408 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9409 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9410 if (VT == MVT::i32 || VT == MVT::i64) {
9411 if (VT == MVT::i32)
9412 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9413 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9414
9415 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9416 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9417 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9418 DAG.getConstant(0, DL, MVT::i64));
9419
9420 if (IsParity)
9421 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9422 DAG.getConstant(1, DL, MVT::i32));
9423
9424 if (VT == MVT::i64)
9425 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9426 return UaddLV;
9427 } else if (VT == MVT::i128) {
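 // For i128, pop-count all sixteen bytes and sum them with UADDLV, mirroring
 // the i32/i64 path above.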
9428 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9429
9430 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9431 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9432 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9433 DAG.getConstant(0, DL, MVT::i64));
9434
9435 if (IsParity)
9436 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9437 DAG.getConstant(1, DL, MVT::i32));
9438
9439 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9440 }
9441
9442 assert(!IsParity && "ISD::PARITY of vector types not supported");
9443
9444 if (VT.isScalableVector() ||
9445 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
9446 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTPOP_MERGE_PASSTHRU);
9447
9448 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9449 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9450 "Unexpected type for custom ctpop lowering");
9451
9452 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9453 Val = DAG.getBitcast(VT: VT8Bit, V: Val);
9454 Val = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: VT8Bit, Operand: Val);
9455
9456 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
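// For example (a sketch of the expected lowering for v4i32; exact registers
// may differ), the v16i8 CTPOP result is widened twice:
//   uaddlp v0.8h, v0.16b   // v16i8 -> v8i16 pairwise sums
//   uaddlp v0.4s, v0.8h    // v8i16 -> v4i32 pairwise sums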
9457 unsigned EltSize = 8;
9458 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9459 while (EltSize != VT.getScalarSizeInBits()) {
9460 EltSize *= 2;
9461 NumElts /= 2;
9462 MVT WidenVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: EltSize), NumElements: NumElts);
9463 Val = DAG.getNode(
9464 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9465 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9466 }
9467
9468 return Val;
9469}
9470
9471SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9472 EVT VT = Op.getValueType();
9473 assert(VT.isScalableVector() ||
9474 useSVEForFixedLengthVectorVT(
9475 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9476
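// There is no dedicated CTTZ instruction for these types, but
// cttz(x) == ctlz(bitreverse(x)), and both BITREVERSE (RBIT) and CTLZ (CLZ)
// have predicated SVE forms, so lower via that identity.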
9477 SDLoc DL(Op);
9478 SDValue RBIT = DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT, Operand: Op.getOperand(i: 0));
9479 return DAG.getNode(Opcode: ISD::CTLZ, DL, VT, Operand: RBIT);
9480}
9481
9482SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9483 SelectionDAG &DAG) const {
9484
9485 EVT VT = Op.getValueType();
9486 SDLoc DL(Op);
9487 unsigned Opcode = Op.getOpcode();
9488 ISD::CondCode CC;
9489 switch (Opcode) {
9490 default:
9491 llvm_unreachable("Wrong instruction");
9492 case ISD::SMAX:
9493 CC = ISD::SETGT;
9494 break;
9495 case ISD::SMIN:
9496 CC = ISD::SETLT;
9497 break;
9498 case ISD::UMAX:
9499 CC = ISD::SETUGT;
9500 break;
9501 case ISD::UMIN:
9502 CC = ISD::SETULT;
9503 break;
9504 }
9505
9506 if (VT.isScalableVector() ||
9507 useSVEForFixedLengthVectorVT(
9508 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9509 switch (Opcode) {
9510 default:
9511 llvm_unreachable("Wrong instruction");
9512 case ISD::SMAX:
9513 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMAX_PRED);
9514 case ISD::SMIN:
9515 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMIN_PRED);
9516 case ISD::UMAX:
9517 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMAX_PRED);
9518 case ISD::UMIN:
9519 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMIN_PRED);
9520 }
9521 }
9522
9523 SDValue Op0 = Op.getOperand(i: 0);
9524 SDValue Op1 = Op.getOperand(i: 1);
9525 SDValue Cond = DAG.getSetCC(DL, VT, LHS: Op0, RHS: Op1, Cond: CC);
9526 return DAG.getSelect(DL, VT, Cond, LHS: Op0, RHS: Op1);
9527}
9528
9529SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9530 SelectionDAG &DAG) const {
9531 EVT VT = Op.getValueType();
9532
9533 if (VT.isScalableVector() ||
9534 useSVEForFixedLengthVectorVT(
9535 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9536 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9537
9538 SDLoc DL(Op);
9539 SDValue REVB;
9540 MVT VST;
9541
9542 switch (VT.getSimpleVT().SimpleTy) {
9543 default:
9544 llvm_unreachable("Invalid type for bitreverse!");
9545
9546 case MVT::v2i32: {
9547 VST = MVT::v8i8;
9548 REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0));
9549
9550 break;
9551 }
9552
9553 case MVT::v4i32: {
9554 VST = MVT::v16i8;
9555 REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0));
9556
9557 break;
9558 }
9559
9560 case MVT::v1i64: {
9561 VST = MVT::v8i8;
9562 REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0));
9563
9564 break;
9565 }
9566
9567 case MVT::v2i64: {
9568 VST = MVT::v16i8;
9569 REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0));
9570
9571 break;
9572 }
9573 }
9574
9575 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT,
9576 Operand: DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT: VST, Operand: REVB));
9577}
9578
9579 // Check whether N forms a continuous comparison sequence (an OR-tree of XORs).
9580static bool
9581isOrXorChain(SDValue N, unsigned &Num,
9582 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9583 if (Num == MaxXors)
9584 return false;
9585
9586 // Skip the one-use zext
9587 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9588 N = N->getOperand(Num: 0);
9589
9590 // The leaf node must be XOR
9591 if (N->getOpcode() == ISD::XOR) {
9592 WorkList.push_back(Elt: std::make_pair(x: N->getOperand(Num: 0), y: N->getOperand(Num: 1)));
9593 Num++;
9594 return true;
9595 }
9596
9597 // All the non-leaf nodes must be OR.
9598 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9599 return false;
9600
9601 if (isOrXorChain(N: N->getOperand(Num: 0), Num, WorkList) &&
9602 isOrXorChain(N: N->getOperand(Num: 1), Num, WorkList))
9603 return true;
9604 return false;
9605}
9606
9607 // Transform chains of ORs and XORs, which usually come from memcmp/bcmp expansion.
9608static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
9609 SDValue LHS = N->getOperand(Num: 0);
9610 SDValue RHS = N->getOperand(Num: 1);
9611 SDLoc DL(N);
9612 EVT VT = N->getValueType(ResNo: 0);
9613 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
9614
9615 // Only handle integer compares.
9616 if (N->getOpcode() != ISD::SETCC)
9617 return SDValue();
9618
9619 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
9620 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9621 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
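// For example, a 16-byte memcmp()==0 is typically expanded to roughly
//   (setcc (or (xor a0, b0), (xor a1, b1)), 0, seteq)
// which we rewrite below as
//   (and (setcc a0, b0, seteq), (setcc a1, b1, seteq))
// so that later combines can turn it into a cmp/ccmp/cset sequence.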
9622 unsigned NumXors = 0;
9623 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(V: RHS) &&
9624 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9625 isOrXorChain(N: LHS, Num&: NumXors, WorkList)) {
9626 SDValue XOR0, XOR1;
9627 std::tie(args&: XOR0, args&: XOR1) = WorkList[0];
9628 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9629 SDValue Cmp = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond);
9630 for (unsigned I = 1; I < WorkList.size(); I++) {
9631 std::tie(args&: XOR0, args&: XOR1) = WorkList[I];
9632 SDValue CmpChain = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond);
9633 Cmp = DAG.getNode(Opcode: LogicOp, DL, VT, N1: Cmp, N2: CmpChain);
9634 }
9635
9636 // Exit early by inverting the condition, which helps reduce indentation.
9637 return Cmp;
9638 }
9639
9640 return SDValue();
9641}
9642
9643SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9644
9645 if (Op.getValueType().isVector())
9646 return LowerVSETCC(Op, DAG);
9647
9648 bool IsStrict = Op->isStrictFPOpcode();
9649 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9650 unsigned OpNo = IsStrict ? 1 : 0;
9651 SDValue Chain;
9652 if (IsStrict)
9653 Chain = Op.getOperand(i: 0);
9654 SDValue LHS = Op.getOperand(i: OpNo + 0);
9655 SDValue RHS = Op.getOperand(i: OpNo + 1);
9656 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: OpNo + 2))->get();
9657 SDLoc dl(Op);
9658
9659 // We chose ZeroOrOneBooleanContents, so use zero and one.
9660 EVT VT = Op.getValueType();
9661 SDValue TVal = DAG.getConstant(Val: 1, DL: dl, VT);
9662 SDValue FVal = DAG.getConstant(Val: 0, DL: dl, VT);
9663
9664 // Handle f128 first, since one possible outcome is a normal integer
9665 // comparison which gets picked up by the next if statement.
9666 if (LHS.getValueType() == MVT::f128) {
9667 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9668 IsSignaling);
9669
9670 // If softenSetCCOperands returned a scalar, use it.
9671 if (!RHS.getNode()) {
9672 assert(LHS.getValueType() == Op.getValueType() &&
9673 "Unexpected setcc expansion!");
9674 return IsStrict ? DAG.getMergeValues(Ops: {LHS, Chain}, dl) : LHS;
9675 }
9676 }
9677
9678 if (LHS.getValueType().isInteger()) {
9679 SDValue CCVal;
9680 SDValue Cmp = getAArch64Cmp(
9681 LHS, RHS, CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), AArch64cc&: CCVal, DAG, dl);
9682
9683 // Note that we inverted the condition above, so we reverse the order of
9684 // the true and false operands here. This will allow the setcc to be
9685 // matched to a single CSINC instruction.
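// For example, (setcc eq, x, y) on i32 becomes roughly:
//   cmp  w_x, w_y
//   cset w0, eq        // alias of csinc w0, wzr, wzr, ne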
9686 SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: FVal, N2: TVal, N3: CCVal, N4: Cmp);
9687 return IsStrict ? DAG.getMergeValues(Ops: {Res, Chain}, dl) : Res;
9688 }
9689
9690 // Now we know we're dealing with FP values.
9691 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
9692 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9693
9694 // We'll need to perform an FCMP followed by a CSEL sequence. Go ahead
9695 // and do the comparison.
9696 SDValue Cmp;
9697 if (IsStrict)
9698 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9699 else
9700 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9701
9702 AArch64CC::CondCode CC1, CC2;
9703 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
9704 SDValue Res;
9705 if (CC2 == AArch64CC::AL) {
9706 changeFPCCToAArch64CC(CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), CondCode&: CC1,
9707 CondCode2&: CC2);
9708 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9709
9710 // Note that we inverted the condition above, so we reverse the order of
9711 // the true and false operands here. This will allow the setcc to be
9712 // matched to a single CSINC instruction.
9713 Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: FVal, N2: TVal, N3: CC1Val, N4: Cmp);
9714 } else {
9715 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
9716 // totally clean. Some of them require two CSELs to implement. As in this
9717 // case, we emit the first CSEL and then emit a second using the output
9718 // of the first as the RHS. We're effectively OR'ing the two CC's together.
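// For example, SETUEQ maps to EQ with a second condition of VS, giving
//   CSEL(TVal, CSEL(TVal, FVal, EQ), VS)
// i.e. TVal if the operands compared equal or unordered, FVal otherwise.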
9719
9720 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9721 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9722 SDValue CS1 =
9723 DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp);
9724
9725 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9726 Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp);
9727 }
9728 return IsStrict ? DAG.getMergeValues(Ops: {Res, Cmp.getValue(R: 1)}, dl) : Res;
9729}
9730
9731SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9732 SelectionDAG &DAG) const {
9733
9734 SDValue LHS = Op.getOperand(i: 0);
9735 SDValue RHS = Op.getOperand(i: 1);
9736 EVT VT = LHS.getValueType();
9737 if (VT != MVT::i32 && VT != MVT::i64)
9738 return SDValue();
9739
9740 SDLoc DL(Op);
9741 SDValue Carry = Op.getOperand(i: 2);
9742 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
9743 SDValue InvCarry = valueToCarryFlag(Value: Carry, DAG, Invert: true);
9744 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9745 LHS, RHS, InvCarry);
9746
9747 EVT OpVT = Op.getValueType();
9748 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OpVT);
9749 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OpVT);
9750
9751 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: Op.getOperand(i: 3))->get();
9752 ISD::CondCode CondInv = ISD::getSetCCInverse(Operation: Cond, Type: VT);
9753 SDValue CCVal =
9754 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9755 // Inputs are swapped because the condition is inverted. This will allow
9756 // matching with a single CSINC instruction.
9757 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OpVT, N1: FVal, N2: TVal, N3: CCVal,
9758 N4: Cmp.getValue(R: 1));
9759}
9760
9761SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9762 SDValue RHS, SDValue TVal,
9763 SDValue FVal, const SDLoc &dl,
9764 SelectionDAG &DAG) const {
9765 // Handle f128 first, because it will result in a comparison of some RTLIB
9766 // call result against zero.
9767 if (LHS.getValueType() == MVT::f128) {
9768 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9769
9770 // If softenSetCCOperands returned a scalar, we need to compare the result
9771 // against zero to select between true and false values.
9772 if (!RHS.getNode()) {
9773 RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHS.getValueType());
9774 CC = ISD::SETNE;
9775 }
9776 }
9777
9778 // Also handle f16, for which we need to do a f32 comparison.
9779 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
9780 LHS.getValueType() == MVT::bf16) {
9781 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9782 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9783 }
9784
9785 // Next, handle integers.
9786 if (LHS.getValueType().isInteger()) {
9787 assert((LHS.getValueType() == RHS.getValueType()) &&
9788 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9789
9790 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
9791 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
9792 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS);
9793 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9794 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
9795 // supported types.
9796 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9797 CTVal->isOne() && CFVal->isAllOnes() &&
9798 LHS.getValueType() == TVal.getValueType()) {
9799 EVT VT = LHS.getValueType();
9800 SDValue Shift =
9801 DAG.getNode(Opcode: ISD::SRA, DL: dl, VT, N1: LHS,
9802 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT));
9803 return DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Shift, N2: DAG.getConstant(Val: 1, DL: dl, VT));
9804 }
9805
9806 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9807 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9808 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
9809 // Both require fewer instructions than a compare and conditional select.
9810 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9811 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9812 LHS.getValueType() == RHS.getValueType()) {
9813 EVT VT = LHS.getValueType();
9814 SDValue Shift =
9815 DAG.getNode(Opcode: ISD::SRA, DL: dl, VT, N1: LHS,
9816 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT));
9817
9818 if (CC == ISD::SETGT)
9819 Shift = DAG.getNOT(DL: dl, Val: Shift, VT);
9820
9821 return DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: LHS, N2: Shift);
9822 }
9823
9824 unsigned Opcode = AArch64ISD::CSEL;
9825
9826 // If both the TVal and the FVal are constants, see if we can swap them in
9827 // order to form a CSINV or CSINC out of them.
9828 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9829 std::swap(a&: TVal, b&: FVal);
9830 std::swap(a&: CTVal, b&: CFVal);
9831 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
9832 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9833 std::swap(a&: TVal, b&: FVal);
9834 std::swap(a&: CTVal, b&: CFVal);
9835 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
9836 } else if (TVal.getOpcode() == ISD::XOR) {
9837 // If TVal is a NOT we want to swap TVal and FVal so that we can match
9838 // with a CSINV rather than a CSEL.
9839 if (isAllOnesConstant(V: TVal.getOperand(i: 1))) {
9840 std::swap(a&: TVal, b&: FVal);
9841 std::swap(a&: CTVal, b&: CFVal);
9842 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
9843 }
9844 } else if (TVal.getOpcode() == ISD::SUB) {
9845 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9846 // that we can match with a CSNEG rather than a CSEL.
9847 if (isNullConstant(V: TVal.getOperand(i: 0))) {
9848 std::swap(a&: TVal, b&: FVal);
9849 std::swap(a&: CTVal, b&: CFVal);
9850 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
9851 }
9852 } else if (CTVal && CFVal) {
9853 const int64_t TrueVal = CTVal->getSExtValue();
9854 const int64_t FalseVal = CFVal->getSExtValue();
9855 bool Swap = false;
9856
9857 // If both TVal and FVal are constants, see if FVal is the
9858 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9859 // instead of a CSEL in that case.
9860 if (TrueVal == ~FalseVal) {
9861 Opcode = AArch64ISD::CSINV;
9862 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9863 TrueVal == -FalseVal) {
9864 Opcode = AArch64ISD::CSNEG;
9865 } else if (TVal.getValueType() == MVT::i32) {
9866 // If our operands are only 32-bit wide, make sure we use 32-bit
9867 // arithmetic when checking whether we can use CSINC. This ensures that
9868 // the addition in the check will wrap around properly in case there is
9869 // an overflow (which would not be the case if we do the check with
9870 // 64-bit arithmetic).
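// For example, with TVal == INT32_MIN and FVal == INT32_MAX, FalseVal32 + 1
// wraps around to TrueVal32 and a CSINC can be used, whereas the same check
// on the sign-extended 64-bit values would miss the case.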
9871 const uint32_t TrueVal32 = CTVal->getZExtValue();
9872 const uint32_t FalseVal32 = CFVal->getZExtValue();
9873
9874 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9875 Opcode = AArch64ISD::CSINC;
9876
9877 if (TrueVal32 > FalseVal32) {
9878 Swap = true;
9879 }
9880 }
9881 } else {
9882 // 64-bit check whether we can use CSINC.
9883 const uint64_t TrueVal64 = TrueVal;
9884 const uint64_t FalseVal64 = FalseVal;
9885
9886 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
9887 Opcode = AArch64ISD::CSINC;
9888
9889 if (TrueVal > FalseVal) {
9890 Swap = true;
9891 }
9892 }
9893 }
9894
9895 // Swap TVal and FVal if necessary.
9896 if (Swap) {
9897 std::swap(a&: TVal, b&: FVal);
9898 std::swap(a&: CTVal, b&: CFVal);
9899 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
9900 }
9901
9902 if (Opcode != AArch64ISD::CSEL) {
9903 // Drop FVal since we can get its value by simply inverting/negating
9904 // TVal.
9905 FVal = TVal;
9906 }
9907 }
9908
9909 // Avoid materializing a constant when possible by reusing a known value in
9910 // a register. However, don't perform this optimization if the known value
9911 // is one, zero or negative one in the case of a CSEL. We can always
9912 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
9913 // FVal, respectively.
9914 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(Val&: RHS);
9915 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
9916 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
9917 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9918 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
9919 // "a != C ? x : a" to avoid materializing C.
9920 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
9921 TVal = LHS;
9922 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
9923 FVal = LHS;
9924 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
9925 assert(CTVal && CFVal && "Expected constant operands for CSNEG.");
9926 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
9927 // avoid materializing C.
9928 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9929 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
9930 Opcode = AArch64ISD::CSINV;
9931 TVal = LHS;
9932 FVal = DAG.getConstant(Val: 0, DL: dl, VT: FVal.getValueType());
9933 }
9934 }
9935
9936 SDValue CCVal;
9937 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl);
9938 EVT VT = TVal.getValueType();
9939 return DAG.getNode(Opcode, DL: dl, VT, N1: TVal, N2: FVal, N3: CCVal, N4: Cmp);
9940 }
9941
9942 // Now we know we're dealing with FP values.
9943 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
9944 LHS.getValueType() == MVT::f64);
9945 assert(LHS.getValueType() == RHS.getValueType());
9946 EVT VT = TVal.getValueType();
9947 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9948
9949 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9950 // clean. Some of them require two CSELs to implement.
9951 AArch64CC::CondCode CC1, CC2;
9952 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
9953
9954 if (DAG.getTarget().Options.UnsafeFPMath) {
9955 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
9956 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
9957 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(Val&: RHS);
9958 if (RHSVal && RHSVal->isZero()) {
9959 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(Val&: FVal);
9960 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(Val&: TVal);
9961
9962 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
9963 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
9964 TVal = LHS;
9965 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
9966 CFVal && CFVal->isZero() &&
9967 FVal.getValueType() == LHS.getValueType())
9968 FVal = LHS;
9969 }
9970 }
9971
9972 // Emit first, and possibly only, CSEL.
9973 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9974 SDValue CS1 = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp);
9975
9976 // If we need a second CSEL, emit it, using the output of the first as the
9977 // RHS. We're effectively OR'ing the two CC's together.
9978 if (CC2 != AArch64CC::AL) {
9979 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9980 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp);
9981 }
9982
9983 // Otherwise, return the output of the first CSEL.
9984 return CS1;
9985}
9986
9987SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
9988 SelectionDAG &DAG) const {
9989 EVT Ty = Op.getValueType();
9990 auto Idx = Op.getConstantOperandAPInt(i: 2);
9991 int64_t IdxVal = Idx.getSExtValue();
9992 assert(Ty.isScalableVector() &&
9993 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
9994
9995 // We can use the splice instruction for certain index values where we are
9996 // able to efficiently generate the correct predicate. The index will be
9997 // inverted and used directly as the input to the ptrue instruction, i.e.
9998 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
9999 // splice predicate. However, we can only do this if we can guarantee that
10000 // there are enough elements in the vector, hence we check the index <= min
10001 // number of elements.
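// For example, for a splice of the last two elements (IdxVal == -2) of an
// nxv4i32 pair, the expected sequence is roughly:
//   ptrue  p0.s, vl2
//   rev    p0.s, p0.s
//   splice z0.s, p0, z0.s, z1.s
// (a sketch; the element suffix depends on the element type).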
10002 std::optional<unsigned> PredPattern;
10003 if (Ty.isScalableVector() && IdxVal < 0 &&
10004 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10005 std::nullopt) {
10006 SDLoc DL(Op);
10007
10008 // Create a predicate where all but the last -IdxVal elements are false.
10009 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10010 SDValue Pred = getPTrue(DAG, DL, VT: PredVT, Pattern: *PredPattern);
10011 Pred = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: PredVT, Operand: Pred);
10012
10013 // Now splice the two inputs together using the predicate.
10014 return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: Ty, N1: Pred, N2: Op.getOperand(i: 0),
10015 N3: Op.getOperand(i: 1));
10016 }
10017
10018 // This will select to an EXT instruction, which has a maximum immediate
10019 // value of 255, hence 2048 bits is the maximum value we can lower.
10020 if (IdxVal >= 0 &&
10021 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
10022 return Op;
10023
10024 return SDValue();
10025}
10026
10027SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10028 SelectionDAG &DAG) const {
10029 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get();
10030 SDValue LHS = Op.getOperand(i: 0);
10031 SDValue RHS = Op.getOperand(i: 1);
10032 SDValue TVal = Op.getOperand(i: 2);
10033 SDValue FVal = Op.getOperand(i: 3);
10034 SDLoc DL(Op);
10035 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, dl: DL, DAG);
10036}
10037
10038SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10039 SelectionDAG &DAG) const {
10040 SDValue CCVal = Op->getOperand(Num: 0);
10041 SDValue TVal = Op->getOperand(Num: 1);
10042 SDValue FVal = Op->getOperand(Num: 2);
10043 SDLoc DL(Op);
10044
10045 EVT Ty = Op.getValueType();
10046 if (Ty == MVT::aarch64svcount) {
10047 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10048 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10049 SDValue Sel =
10050 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10051 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Ty, Operand: Sel);
10052 }
10053
10054 if (Ty.isScalableVector()) {
10055 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10056 SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: CCVal);
10057 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal);
10058 }
10059
10060 if (useSVEForFixedLengthVectorVT(VT: Ty, OverrideNEON: !Subtarget->isNeonAvailable())) {
10061 // FIXME: Ideally this would be the same as above using i1 types, however
10062 // for the moment we can't deal with fixed i1 vector types properly, so
10063 // instead extend the predicate to a result type sized integer vector.
10064 MVT SplatValVT = MVT::getIntegerVT(BitWidth: Ty.getScalarSizeInBits());
10065 MVT PredVT = MVT::getVectorVT(VT: SplatValVT, EC: Ty.getVectorElementCount());
10066 SDValue SplatVal = DAG.getSExtOrTrunc(Op: CCVal, DL, VT: SplatValVT);
10067 SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: SplatVal);
10068 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal);
10069 }
10070
10071 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10072 // instruction.
10073 if (ISD::isOverflowIntrOpRes(Op: CCVal)) {
10074 // Only lower legal XALUO ops.
10075 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: CCVal->getValueType(ResNo: 0)))
10076 return SDValue();
10077
10078 AArch64CC::CondCode OFCC;
10079 SDValue Value, Overflow;
10080 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: CCVal.getValue(R: 0), DAG);
10081 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10082
10083 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: Op.getValueType(), N1: TVal, N2: FVal,
10084 N3: CCVal, N4: Overflow);
10085 }
10086
10087 // Lower it the same way as we would lower a SELECT_CC node.
10088 ISD::CondCode CC;
10089 SDValue LHS, RHS;
10090 if (CCVal.getOpcode() == ISD::SETCC) {
10091 LHS = CCVal.getOperand(i: 0);
10092 RHS = CCVal.getOperand(i: 1);
10093 CC = cast<CondCodeSDNode>(Val: CCVal.getOperand(i: 2))->get();
10094 } else {
10095 LHS = CCVal;
10096 RHS = DAG.getConstant(Val: 0, DL, VT: CCVal.getValueType());
10097 CC = ISD::SETNE;
10098 }
10099
10100 // If we are lowering an f16/bf16 and do not have full fp16 support, convert
10101 // to an f32 in order to use FCSELSrrr.
10102 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10103 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10104 DAG.getUNDEF(MVT::f32), TVal);
10105 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10106 DAG.getUNDEF(MVT::f32), FVal);
10107 }
10108
10109 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, dl: DL, DAG);
10110
10111 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10112 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10113 }
10114
10115 return Res;
10116}
10117
10118SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10119 SelectionDAG &DAG) const {
10120 // Jump table entries are PC-relative offsets. No additional tweaking is
10121 // necessary here; just get the address of the jump table.
10122 JumpTableSDNode *JT = cast<JumpTableSDNode>(Val&: Op);
10123
10124 CodeModel::Model CM = getTargetMachine().getCodeModel();
10125 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10126 !Subtarget->isTargetMachO())
10127 return getAddrLarge(N: JT, DAG);
10128 if (CM == CodeModel::Tiny)
10129 return getAddrTiny(N: JT, DAG);
10130 return getAddr(N: JT, DAG);
10131}
10132
10133SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10134 SelectionDAG &DAG) const {
10135 // Jump table entries are PC-relative offsets. No additional tweaking is
10136 // necessary here; just get the address of the jump table.
10137 SDLoc DL(Op);
10138 SDValue JT = Op.getOperand(i: 1);
10139 SDValue Entry = Op.getOperand(i: 2);
10140 int JTI = cast<JumpTableSDNode>(Val: JT.getNode())->getIndex();
10141
10142 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10143 AFI->setJumpTableEntryInfo(Idx: JTI, Size: 4, PCRelSym: nullptr);
10144
10145 SDNode *Dest =
10146 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10147 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10148 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Chain: Op.getOperand(i: 0), DL);
10149 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10150}
10151
10152SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10153 SelectionDAG &DAG) const {
10154 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Val&: Op);
10155 CodeModel::Model CM = getTargetMachine().getCodeModel();
10156 if (CM == CodeModel::Large) {
10157 // Use the GOT for the large code model on MachO targets (e.g. iOS).
10158 if (Subtarget->isTargetMachO()) {
10159 return getGOT(N: CP, DAG);
10160 }
10161 if (!getTargetMachine().isPositionIndependent())
10162 return getAddrLarge(N: CP, DAG);
10163 } else if (CM == CodeModel::Tiny) {
10164 return getAddrTiny(N: CP, DAG);
10165 }
10166 return getAddr(N: CP, DAG);
10167}
10168
10169SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10170 SelectionDAG &DAG) const {
10171 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Val&: Op);
10172 CodeModel::Model CM = getTargetMachine().getCodeModel();
10173 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10174 if (!getTargetMachine().isPositionIndependent())
10175 return getAddrLarge(N: BA, DAG);
10176 } else if (CM == CodeModel::Tiny) {
10177 return getAddrTiny(N: BA, DAG);
10178 }
10179 return getAddr(N: BA, DAG);
10180}
10181
10182SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10183 SelectionDAG &DAG) const {
10184 AArch64FunctionInfo *FuncInfo =
10185 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10186
10187 SDLoc DL(Op);
10188 SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(),
10189 VT: getPointerTy(DL: DAG.getDataLayout()));
10190 FR = DAG.getZExtOrTrunc(Op: FR, DL, VT: getPointerMemTy(DL: DAG.getDataLayout()));
10191 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
10192 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1),
10193 PtrInfo: MachinePointerInfo(SV));
10194}
10195
10196SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10197 SelectionDAG &DAG) const {
10198 MachineFunction &MF = DAG.getMachineFunction();
10199 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10200
10201 SDLoc DL(Op);
10202 SDValue FR;
10203 if (Subtarget->isWindowsArm64EC()) {
10204 // With the Arm64EC ABI, we compute the address of the varargs save area
10205 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10206 // but calls from an entry thunk can pass in a different address.
10207 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10208 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10209 uint64_t StackOffset;
10210 if (FuncInfo->getVarArgsGPRSize() > 0)
10211 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10212 else
10213 StackOffset = FuncInfo->getVarArgsStackOffset();
10214 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10215 DAG.getConstant(StackOffset, DL, MVT::i64));
10216 } else {
10217 FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRSize() > 0
10218 ? FuncInfo->getVarArgsGPRIndex()
10219 : FuncInfo->getVarArgsStackIndex(),
10220 VT: getPointerTy(DL: DAG.getDataLayout()));
10221 }
10222 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
10223 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1),
10224 PtrInfo: MachinePointerInfo(SV));
10225}
10226
10227SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10228 SelectionDAG &DAG) const {
10229 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10230 // Standard, section B.3.
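// As a reminder, that layout is roughly equivalent to:
//   struct va_list {
//     void *__stack;   // next stack argument              (offset 0)
//     void *__gr_top;  // end of the GP register save area (offset 8, 4 on ILP32)
//     void *__vr_top;  // end of the FP register save area (offset 16, 8 on ILP32)
//     int   __gr_offs; // negative offset from __gr_top    (offset 24, 12 on ILP32)
//     int   __vr_offs; // negative offset from __vr_top    (offset 28, 16 on ILP32)
//   };
// The stores built below fill in these five fields in that order.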
10231 MachineFunction &MF = DAG.getMachineFunction();
10232 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10233 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10234 auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
10235 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
10236 SDLoc DL(Op);
10237
10238 SDValue Chain = Op.getOperand(i: 0);
10239 SDValue VAList = Op.getOperand(i: 1);
10240 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
10241 SmallVector<SDValue, 4> MemOps;
10242
10243 // void *__stack at offset 0
10244 unsigned Offset = 0;
10245 SDValue Stack = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(), VT: PtrVT);
10246 Stack = DAG.getZExtOrTrunc(Op: Stack, DL, VT: PtrMemVT);
10247 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: Stack, Ptr: VAList,
10248 PtrInfo: MachinePointerInfo(SV), Alignment: Align(PtrSize)));
10249
10250 // void *__gr_top at offset 8 (4 on ILP32)
10251 Offset += PtrSize;
10252 int GPRSize = FuncInfo->getVarArgsGPRSize();
10253 if (GPRSize > 0) {
10254 SDValue GRTop, GRTopAddr;
10255
10256 GRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10257 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
10258
10259 GRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRIndex(), VT: PtrVT);
10260 GRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: GRTop,
10261 N2: DAG.getConstant(Val: GPRSize, DL, VT: PtrVT));
10262 GRTop = DAG.getZExtOrTrunc(Op: GRTop, DL, VT: PtrMemVT);
10263
10264 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: GRTop, Ptr: GRTopAddr,
10265 PtrInfo: MachinePointerInfo(SV, Offset),
10266 Alignment: Align(PtrSize)));
10267 }
10268
10269 // void *__vr_top at offset 16 (8 on ILP32)
10270 Offset += PtrSize;
10271 int FPRSize = FuncInfo->getVarArgsFPRSize();
10272 if (FPRSize > 0) {
10273 SDValue VRTop, VRTopAddr;
10274 VRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10275 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
10276
10277 VRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFPRIndex(), VT: PtrVT);
10278 VRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VRTop,
10279 N2: DAG.getConstant(Val: FPRSize, DL, VT: PtrVT));
10280 VRTop = DAG.getZExtOrTrunc(Op: VRTop, DL, VT: PtrMemVT);
10281
10282 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: VRTop, Ptr: VRTopAddr,
10283 PtrInfo: MachinePointerInfo(SV, Offset),
10284 Alignment: Align(PtrSize)));
10285 }
10286
10287 // int __gr_offs at offset 24 (12 on ILP32)
10288 Offset += PtrSize;
10289 SDValue GROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10290 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
10291 MemOps.push_back(
10292 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10293 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10294
10295 // int __vr_offs at offset 28 (16 on ILP32)
10296 Offset += 4;
10297 SDValue VROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10298 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
10299 MemOps.push_back(
10300 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10301 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10302
10303 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10304}
10305
10306SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10307 SelectionDAG &DAG) const {
10308 MachineFunction &MF = DAG.getMachineFunction();
10309
10310 if (Subtarget->isCallingConvWin64(CC: MF.getFunction().getCallingConv()))
10311 return LowerWin64_VASTART(Op, DAG);
10312 else if (Subtarget->isTargetDarwin())
10313 return LowerDarwin_VASTART(Op, DAG);
10314 else
10315 return LowerAAPCS_VASTART(Op, DAG);
10316}
10317
10318SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10319 SelectionDAG &DAG) const {
10320 // The AAPCS va_list has three pointers and two ints (= 32 bytes, 20 on
10321 // ILP32); Darwin and Windows use a single pointer.
10322 SDLoc DL(Op);
10323 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10324 unsigned VaListSize =
10325 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10326 ? PtrSize
10327 : Subtarget->isTargetILP32() ? 20 : 32;
10328 const Value *DestSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 3))->getValue();
10329 const Value *SrcSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 4))->getValue();
10330
10331 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10332 DAG.getConstant(VaListSize, DL, MVT::i32),
10333 Align(PtrSize), false, false, false,
10334 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10335}
10336
10337SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10338 assert(Subtarget->isTargetDarwin() &&
10339 "automatic va_arg instruction only works on Darwin");
10340
10341 const Value *V = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
10342 EVT VT = Op.getValueType();
10343 SDLoc DL(Op);
10344 SDValue Chain = Op.getOperand(i: 0);
10345 SDValue Addr = Op.getOperand(i: 1);
10346 MaybeAlign Align(Op.getConstantOperandVal(i: 3));
10347 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10348 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
10349 auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
10350 SDValue VAList =
10351 DAG.getLoad(VT: PtrMemVT, dl: DL, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo(V));
10352 Chain = VAList.getValue(R: 1);
10353 VAList = DAG.getZExtOrTrunc(Op: VAList, DL, VT: PtrVT);
10354
10355 if (VT.isScalableVector())
10356 report_fatal_error(reason: "Passing SVE types to variadic functions is "
10357 "currently not supported");
10358
10359 if (Align && *Align > MinSlotSize) {
10360 VAList = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10361 N2: DAG.getConstant(Val: Align->value() - 1, DL, VT: PtrVT));
10362 VAList = DAG.getNode(Opcode: ISD::AND, DL, VT: PtrVT, N1: VAList,
10363 N2: DAG.getConstant(Val: -(int64_t)Align->value(), DL, VT: PtrVT));
10364 }
10365
10366 Type *ArgTy = VT.getTypeForEVT(Context&: *DAG.getContext());
10367 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(Ty: ArgTy);
10368
10369 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10370 // up to 64 bits. At the very least, we have to increase the striding of the
10371 // vaargs list to match this, and for FP values we need to introduce
10372 // FP_ROUND nodes as well.
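// For example, an f32 va_arg occupies an 8-byte slot holding an f64 value, so
// below we load the slot as an f64 and FP_ROUND the result back down to f32.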
10373 if (VT.isInteger() && !VT.isVector())
10374 ArgSize = std::max(a: ArgSize, b: MinSlotSize);
10375 bool NeedFPTrunc = false;
10376 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10377 ArgSize = 8;
10378 NeedFPTrunc = true;
10379 }
10380
10381 // Increment the pointer, VAList, to the next vaarg
10382 SDValue VANext = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10383 N2: DAG.getConstant(Val: ArgSize, DL, VT: PtrVT));
10384 VANext = DAG.getZExtOrTrunc(Op: VANext, DL, VT: PtrMemVT);
10385
10386 // Store the incremented VAList to the legalized pointer
10387 SDValue APStore =
10388 DAG.getStore(Chain, dl: DL, Val: VANext, Ptr: Addr, PtrInfo: MachinePointerInfo(V));
10389
10390 // Load the actual argument out of the pointer VAList
10391 if (NeedFPTrunc) {
10392 // Load the value as an f64.
10393 SDValue WideFP =
10394 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10395 // Round the value down to an f32.
10396 SDValue NarrowFP =
10397 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: WideFP.getValue(R: 0),
10398 N2: DAG.getIntPtrConstant(Val: 1, DL, /*isTarget=*/true));
10399 SDValue Ops[] = { NarrowFP, WideFP.getValue(R: 1) };
10400 // Merge the rounded value with the chain output of the load.
10401 return DAG.getMergeValues(Ops, dl: DL);
10402 }
10403
10404 return DAG.getLoad(VT, dl: DL, Chain: APStore, Ptr: VAList, PtrInfo: MachinePointerInfo());
10405}
10406
10407SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10408 SelectionDAG &DAG) const {
10409 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10410 MFI.setFrameAddressIsTaken(true);
10411
10412 EVT VT = Op.getValueType();
10413 SDLoc DL(Op);
10414 unsigned Depth = Op.getConstantOperandVal(i: 0);
10415 SDValue FrameAddr =
10416 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10417 while (Depth--)
10418 FrameAddr = DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr: FrameAddr,
10419 PtrInfo: MachinePointerInfo());
10420
10421 if (Subtarget->isTargetILP32())
10422 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10423 DAG.getValueType(VT));
10424
10425 return FrameAddr;
10426}
10427
10428SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10429 SelectionDAG &DAG) const {
10430 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10431
10432 EVT VT = getPointerTy(DL: DAG.getDataLayout());
10433 SDLoc DL(Op);
10434 int FI = MFI.CreateFixedObject(Size: 4, SPOffset: 0, IsImmutable: false);
10435 return DAG.getFrameIndex(FI, VT);
10436}
10437
10438#define GET_REGISTER_MATCHER
10439#include "AArch64GenAsmMatcher.inc"
10440
10441// FIXME? Maybe this could be a TableGen attribute on some registers and
10442// this table could be generated automatically from RegInfo.
10443Register AArch64TargetLowering::
10444getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10445 Register Reg = MatchRegisterName(RegName);
10446 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10447 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10448 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10449 if (!Subtarget->isXRegisterReserved(i: DwarfRegNum) &&
10450 !MRI->isReservedReg(MF, Reg))
10451 Reg = 0;
10452 }
10453 if (Reg)
10454 return Reg;
10455 report_fatal_error(reason: Twine("Invalid register name \""
10456 + StringRef(RegName) + "\"."));
10457}
10458
10459SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10460 SelectionDAG &DAG) const {
10461 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10462
10463 EVT VT = Op.getValueType();
10464 SDLoc DL(Op);
10465
10466 SDValue FrameAddr =
10467 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10468 SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
10469
10470 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset);
10471}
10472
10473SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10474 SelectionDAG &DAG) const {
10475 MachineFunction &MF = DAG.getMachineFunction();
10476 MachineFrameInfo &MFI = MF.getFrameInfo();
10477 MFI.setReturnAddressIsTaken(true);
10478
10479 EVT VT = Op.getValueType();
10480 SDLoc DL(Op);
10481 unsigned Depth = Op.getConstantOperandVal(i: 0);
10482 SDValue ReturnAddress;
10483 if (Depth) {
10484 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10485 SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
10486 ReturnAddress = DAG.getLoad(
10487 VT, dl: DL, Chain: DAG.getEntryNode(),
10488 Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset), PtrInfo: MachinePointerInfo());
10489 } else {
10490 // Return LR, which contains the return address. Mark it an implicit
10491 // live-in.
10492 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10493 ReturnAddress = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
10494 }
10495
10496 // The XPACLRI instruction assembles to a hint-space instruction before
10497 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture. On
10498 // Armv8.3-A and onwards, XPACI is available, so use that instead.
10500 SDNode *St;
10501 if (Subtarget->hasPAuth()) {
10502 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10503 } else {
10504 // XPACLRI operates on LR therefore we must move the operand accordingly.
10505 SDValue Chain =
10506 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10507 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10508 }
10509 return SDValue(St, 0);
10510}
10511
10512 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
10513 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
10514SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10515 SelectionDAG &DAG) const {
10516 SDValue Lo, Hi;
10517 expandShiftParts(N: Op.getNode(), Lo, Hi, DAG);
10518 return DAG.getMergeValues(Ops: {Lo, Hi}, dl: SDLoc(Op));
10519}
10520
10521bool AArch64TargetLowering::isOffsetFoldingLegal(
10522 const GlobalAddressSDNode *GA) const {
10523 // Offsets are folded in the DAG combine rather than here so that we can
10524 // intelligently choose an offset based on the uses.
10525 return false;
10526}
10527
10528bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
10529 bool OptForSize) const {
10530 bool IsLegal = false;
10531 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
10532 // and for the 16-bit case when the target has full fp16 support.
10533 // We encode bf16 bit patterns as if they were fp16. This results in very
10534 // strange looking assembly but should populate the register with appropriate
10535 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
10536 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
10537 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
10538 // FIXME: We should be able to handle f128 as well with a clever lowering.
10539 const APInt ImmInt = Imm.bitcastToAPInt();
10540 if (VT == MVT::f64)
10541 IsLegal = AArch64_AM::getFP64Imm(Imm: ImmInt) != -1 || Imm.isPosZero();
10542 else if (VT == MVT::f32)
10543 IsLegal = AArch64_AM::getFP32Imm(Imm: ImmInt) != -1 || Imm.isPosZero();
10544 else if (VT == MVT::f16 || VT == MVT::bf16)
10545 IsLegal =
10546 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(Imm: ImmInt) != -1) ||
10547 Imm.isPosZero();
10548
10549 // If we cannot materialize the value in the fmov immediate field, check if
10550 // it can be encoded as the immediate operand of a logical instruction.
10551 // The immediate value will be created with either MOVZ, MOVN, or ORR.
10552 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
10553 // generate that fmov.
10554 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
10555 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
10556 // however the mov+fmov sequence is always better because of the reduced
10557 // cache pressure. The timings are still the same if you consider
10558 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
10559 // movw+movk is fused). So we limit the expansion to at most 2 instructions.
10560 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
10561 AArch64_IMM::expandMOVImm(Imm: ImmInt.getZExtValue(), BitSize: VT.getSizeInBits(), Insn);
10562 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10563 IsLegal = Insn.size() <= Limit;
10564 }
10565
10566 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10567 << " imm value: "; Imm.dump(););
10568 return IsLegal;
10569}
10570
10571//===----------------------------------------------------------------------===//
10572// AArch64 Optimization Hooks
10573//===----------------------------------------------------------------------===//
10574
10575static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10576 SDValue Operand, SelectionDAG &DAG,
10577 int &ExtraSteps) {
10578 EVT VT = Operand.getValueType();
10579 if ((ST->hasNEON() &&
10580 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10581 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10582 VT == MVT::v4f32)) ||
10583 (ST->hasSVE() &&
10584 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10585 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
10586 // For the reciprocal estimates, convergence is quadratic, so the number
10587 // of digits is doubled after each iteration. In ARMv8, the accuracy of
10588 // the initial estimate is 2^-8. Thus the number of extra steps to refine
10589 // the result for float (23 mantissa bits) is 2 and for double (52
10590 // mantissa bits) is 3.
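// For example, for double: Log2_64_Ceil(53) - Log2_64_Ceil(8) == 6 - 3 == 3
// extra steps, and for float: Log2_64_Ceil(24) - 3 == 2, matching the counts
// above.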
10591 constexpr unsigned AccurateBits = 8;
10592 unsigned DesiredBits =
10593 APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT));
10594 ExtraSteps = DesiredBits <= AccurateBits
10595 ? 0
10596 : Log2_64_Ceil(Value: DesiredBits) - Log2_64_Ceil(Value: AccurateBits);
10597 }
10598
10599 return DAG.getNode(Opcode, DL: SDLoc(Operand), VT, Operand);
10600 }
10601
10602 return SDValue();
10603}
10604
10605SDValue
10606AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10607 const DenormalMode &Mode) const {
10608 SDLoc DL(Op);
10609 EVT VT = Op.getValueType();
10610 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT);
10611 SDValue FPZero = DAG.getConstantFP(Val: 0.0, DL, VT);
10612 return DAG.getSetCC(DL, VT: CCVT, LHS: Op, RHS: FPZero, Cond: ISD::SETEQ);
10613}
10614
10615SDValue
10616AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10617 SelectionDAG &DAG) const {
10618 return Op;
10619}
10620
10621SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10622 SelectionDAG &DAG, int Enabled,
10623 int &ExtraSteps,
10624 bool &UseOneConst,
10625 bool Reciprocal) const {
10626 if (Enabled == ReciprocalEstimate::Enabled ||
10627 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10628 if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRSQRTE, Operand,
10629 DAG, ExtraSteps)) {
10630 SDLoc DL(Operand);
10631 EVT VT = Operand.getValueType();
10632
10633 SDNodeFlags Flags;
10634 Flags.setAllowReassociation(true);
10635
10636 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10637 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
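// FRSQRTS(a, b) computes (3 - a * b) / 2, so each step below forms
// Step = FRSQRTS(X, E * E) == 0.5 * (3 - X * E^2) and then refines the
// estimate as E = E * Step.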
10638 for (int i = ExtraSteps; i > 0; --i) {
10639 SDValue Step = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Estimate,
10640 Flags);
10641 Step = DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL, VT, N1: Operand, N2: Step, Flags);
10642 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags);
10643 }
10644 if (!Reciprocal)
10645 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Operand, N2: Estimate, Flags);
10646
10647 ExtraSteps = 0;
10648 return Estimate;
10649 }
10650
10651 return SDValue();
10652}
10653
10654SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10655 SelectionDAG &DAG, int Enabled,
10656 int &ExtraSteps) const {
10657 if (Enabled == ReciprocalEstimate::Enabled)
10658 if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRECPE, Operand,
10659 DAG, ExtraSteps)) {
10660 SDLoc DL(Operand);
10661 EVT VT = Operand.getValueType();
10662
10663 SDNodeFlags Flags;
10664 Flags.setAllowReassociation(true);
10665
10666 // Newton reciprocal iteration: E * (2 - X * E)
10667 // AArch64 reciprocal iteration instruction: (2 - M * N)
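// FRECPS(a, b) computes 2 - a * b, so each step below forms
// Step = FRECPS(X, E) == 2 - X * E and refines the estimate as E = E * Step.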
10668 for (int i = ExtraSteps; i > 0; --i) {
10669 SDValue Step = DAG.getNode(Opcode: AArch64ISD::FRECPS, DL, VT, N1: Operand,
10670 N2: Estimate, Flags);
10671 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags);
10672 }
10673
10674 ExtraSteps = 0;
10675 return Estimate;
10676 }
10677
10678 return SDValue();
10679}
10680
10681//===----------------------------------------------------------------------===//
10682// AArch64 Inline Assembly Support
10683//===----------------------------------------------------------------------===//
10684
10685// Table of Constraints
10686// TODO: This is the current set of constraints supported by ARM for the
10687 // compiler; not all of them may make sense.
10688//
10689// r - A general register
10690// w - An FP/SIMD register of some size in the range v0-v31
10691// x - An FP/SIMD register of some size in the range v0-v15
10692// I - Constant that can be used with an ADD instruction
10693// J - Constant that can be used with a SUB instruction
10694// K - Constant that can be used with a 32-bit logical instruction
10695// L - Constant that can be used with a 64-bit logical instruction
10696// M - Constant that can be used as a 32-bit MOV immediate
10697// N - Constant that can be used as a 64-bit MOV immediate
10698// Q - A memory reference with base register and no offset
10699// S - A symbolic address
10700// Y - Floating point constant zero
10701// Z - Integer constant zero
10702//
10703// Note that general register operands will be output using their 64-bit x
10704// register name, whatever the size of the variable, unless the asm operand
10705// is prefixed by the %w modifier. Floating-point and SIMD register operands
10706// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10707// %q modifier.
10708const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
10709 // At this point, we have to lower this constraint to something else, so we
10710 // lower it to an "r" or "w". However, by doing this we will force the result
10711 // to be in a register, while the X constraint is much more permissive.
10712 //
10713 // Although we are correct (we are free to emit anything, without
10714 // constraints), we might break use cases that would expect us to be more
10715 // efficient and emit something else.
10716 if (!Subtarget->hasFPARMv8())
10717 return "r";
10718
10719 if (ConstraintVT.isFloatingPoint())
10720 return "w";
10721
10722 if (ConstraintVT.isVector() &&
10723 (ConstraintVT.getSizeInBits() == 64 ||
10724 ConstraintVT.getSizeInBits() == 128))
10725 return "w";
10726
10727 return "r";
10728}
10729
10730enum class PredicateConstraint { Uph, Upl, Upa };
10731
10732static std::optional<PredicateConstraint>
10733parsePredicateConstraint(StringRef Constraint) {
10734 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
10735 .Case(S: "Uph", Value: PredicateConstraint::Uph)
10736 .Case(S: "Upl", Value: PredicateConstraint::Upl)
10737 .Case(S: "Upa", Value: PredicateConstraint::Upa)
10738 .Default(Value: std::nullopt);
10739}
10740
10741static const TargetRegisterClass *
10742getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
10743 if (VT != MVT::aarch64svcount &&
10744 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10745 return nullptr;
10746
10747 switch (Constraint) {
10748 case PredicateConstraint::Uph:
10749 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10750 : &AArch64::PPR_p8to15RegClass;
10751 case PredicateConstraint::Upl:
10752 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10753 : &AArch64::PPR_3bRegClass;
10754 case PredicateConstraint::Upa:
10755 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10756 : &AArch64::PPRRegClass;
10757 }
10758
10759 llvm_unreachable("Missing PredicateConstraint!");
10760}
10761
10762enum class ReducedGprConstraint { Uci, Ucj };
10763
10764static std::optional<ReducedGprConstraint>
10765parseReducedGprConstraint(StringRef Constraint) {
10766 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
10767 .Case(S: "Uci", Value: ReducedGprConstraint::Uci)
10768 .Case(S: "Ucj", Value: ReducedGprConstraint::Ucj)
10769 .Default(Value: std::nullopt);
10770}
10771
10772static const TargetRegisterClass *
10773getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
10774 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10775 return nullptr;
10776
10777 switch (Constraint) {
10778 case ReducedGprConstraint::Uci:
10779 return &AArch64::MatrixIndexGPR32_8_11RegClass;
10780 case ReducedGprConstraint::Ucj:
10781 return &AArch64::MatrixIndexGPR32_12_15RegClass;
10782 }
10783
10784 llvm_unreachable("Missing ReducedGprConstraint!");
10785}
10786
10787 // The set of cc codes supported is from
10788// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
10789static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
10790 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
10791 .Case(S: "{@cchi}", Value: AArch64CC::HI)
10792 .Case(S: "{@cccs}", Value: AArch64CC::HS)
10793 .Case(S: "{@cclo}", Value: AArch64CC::LO)
10794 .Case(S: "{@ccls}", Value: AArch64CC::LS)
10795 .Case(S: "{@cccc}", Value: AArch64CC::LO)
10796 .Case(S: "{@cceq}", Value: AArch64CC::EQ)
10797 .Case(S: "{@ccgt}", Value: AArch64CC::GT)
10798 .Case(S: "{@ccge}", Value: AArch64CC::GE)
10799 .Case(S: "{@cclt}", Value: AArch64CC::LT)
10800 .Case(S: "{@ccle}", Value: AArch64CC::LE)
10801 .Case(S: "{@cchs}", Value: AArch64CC::HS)
10802 .Case(S: "{@ccne}", Value: AArch64CC::NE)
10803 .Case(S: "{@ccvc}", Value: AArch64CC::VC)
10804 .Case(S: "{@ccpl}", Value: AArch64CC::PL)
10805 .Case(S: "{@ccvs}", Value: AArch64CC::VS)
10806 .Case(S: "{@ccmi}", Value: AArch64CC::MI)
10807 .Default(Value: AArch64CC::Invalid);
10808 return Cond;
10809}
10810
10811/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10812/// WZR, invert(<cond>)'.
10813static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
10814 SelectionDAG &DAG) {
10815 return DAG.getNode(
10816 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10817 DAG.getConstant(0, DL, MVT::i32),
10818 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10819}
10820
10821// Lower @cc flag output via getSETCC.
10822SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10823 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10824 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10825 AArch64CC::CondCode Cond = parseConstraintCode(Constraint: OpInfo.ConstraintCode);
10826 if (Cond == AArch64CC::Invalid)
10827 return SDValue();
10828 // The output variable should be a scalar integer.
10829 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10830 OpInfo.ConstraintVT.getSizeInBits() < 8)
10831 report_fatal_error(reason: "Flag output operand is of invalid type");
10832
10833 // Get NZCV register. Only update chain when copyfrom is glued.
10834 if (Glue.getNode()) {
10835 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10836 Chain = Glue.getValue(R: 1);
10837 } else
10838 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10839 // Extract CC code.
10840 SDValue CC = getSETCC(CC: Cond, NZCV: Glue, DL, DAG);
10841
10842 SDValue Result;
10843
10844 // Truncate or ZERO_EXTEND based on value types.
10845 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10846 Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: OpInfo.ConstraintVT, Operand: CC);
10847 else
10848 Result = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: OpInfo.ConstraintVT, Operand: CC);
10849
10850 return Result;
10851}
10852
10853/// getConstraintType - Given a constraint letter, return the type of
10854/// constraint it is for this target.
10855AArch64TargetLowering::ConstraintType
10856AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
10857 if (Constraint.size() == 1) {
10858 switch (Constraint[0]) {
10859 default:
10860 break;
10861 case 'x':
10862 case 'w':
10863 case 'y':
10864 return C_RegisterClass;
10865 // An address with a single base register. Due to the way we
10866 // currently handle addresses it is the same as 'r'.
10867 case 'Q':
10868 return C_Memory;
10869 case 'I':
10870 case 'J':
10871 case 'K':
10872 case 'L':
10873 case 'M':
10874 case 'N':
10875 case 'Y':
10876 case 'Z':
10877 return C_Immediate;
10878 case 'z':
10879 case 'S': // A symbol or label reference with a constant offset
10880 return C_Other;
10881 }
10882 } else if (parsePredicateConstraint(Constraint))
10883 return C_RegisterClass;
10884 else if (parseReducedGprConstraint(Constraint))
10885 return C_RegisterClass;
10886 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
10887 return C_Other;
10888 return TargetLowering::getConstraintType(Constraint);
10889}
10890
10891/// Examine constraint type and operand type and determine a weight value.
10892/// This object must already have been set up with the operand type
10893/// and the current alternative constraint selected.
10894TargetLowering::ConstraintWeight
10895AArch64TargetLowering::getSingleConstraintMatchWeight(
10896 AsmOperandInfo &info, const char *constraint) const {
10897 ConstraintWeight weight = CW_Invalid;
10898 Value *CallOperandVal = info.CallOperandVal;
10899 // If we don't have a value, we can't do a match,
10900 // but allow it at the lowest weight.
10901 if (!CallOperandVal)
10902 return CW_Default;
10903 Type *type = CallOperandVal->getType();
10904 // Look at the constraint type.
10905 switch (*constraint) {
10906 default:
10907 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
10908 break;
10909 case 'x':
10910 case 'w':
10911 case 'y':
10912 if (type->isFloatingPointTy() || type->isVectorTy())
10913 weight = CW_Register;
10914 break;
10915 case 'z':
10916 weight = CW_Constant;
10917 break;
10918 case 'U':
10919 if (parsePredicateConstraint(Constraint: constraint) ||
10920 parseReducedGprConstraint(Constraint: constraint))
10921 weight = CW_Register;
10922 break;
10923 }
10924 return weight;
10925}
10926
10927std::pair<unsigned, const TargetRegisterClass *>
10928AArch64TargetLowering::getRegForInlineAsmConstraint(
10929 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
10930 if (Constraint.size() == 1) {
10931 switch (Constraint[0]) {
10932 case 'r':
10933 if (VT.isScalableVector())
10934 return std::make_pair(x: 0U, y: nullptr);
10935 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
10936 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
10937 if (VT.getFixedSizeInBits() == 64)
10938 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
10939 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
10940 case 'w': {
10941 if (!Subtarget->hasFPARMv8())
10942 break;
10943 if (VT.isScalableVector()) {
10944 if (VT.getVectorElementType() != MVT::i1)
10945 return std::make_pair(0U, &AArch64::ZPRRegClass);
10946 return std::make_pair(x: 0U, y: nullptr);
10947 }
10948 uint64_t VTSize = VT.getFixedSizeInBits();
10949 if (VTSize == 16)
10950 return std::make_pair(0U, &AArch64::FPR16RegClass);
10951 if (VTSize == 32)
10952 return std::make_pair(0U, &AArch64::FPR32RegClass);
10953 if (VTSize == 64)
10954 return std::make_pair(0U, &AArch64::FPR64RegClass);
10955 if (VTSize == 128)
10956 return std::make_pair(0U, &AArch64::FPR128RegClass);
10957 break;
10958 }
10959 // The instructions that this constraint is designed for can
10960 // only take 128-bit registers so just use that regclass.
10961 case 'x':
10962 if (!Subtarget->hasFPARMv8())
10963 break;
10964 if (VT.isScalableVector())
10965 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
10966 if (VT.getSizeInBits() == 128)
10967 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
10968 break;
10969 case 'y':
10970 if (!Subtarget->hasFPARMv8())
10971 break;
10972 if (VT.isScalableVector())
10973 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
10974 break;
10975 }
10976 } else {
10977 if (const auto PC = parsePredicateConstraint(Constraint))
10978 if (const auto *RegClass = getPredicateRegisterClass(Constraint: *PC, VT))
10979 return std::make_pair(x: 0U, y&: RegClass);
10980
10981 if (const auto RGC = parseReducedGprConstraint(Constraint))
10982 if (const auto *RegClass = getReducedGprRegisterClass(Constraint: *RGC, VT))
10983 return std::make_pair(x: 0U, y&: RegClass);
10984 }
10985 if (StringRef("{cc}").equals_insensitive(Constraint) ||
10986 parseConstraintCode(Constraint) != AArch64CC::Invalid)
10987 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
10988
10989 if (Constraint == "{za}") {
10990 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
10991 }
10992
10993 if (Constraint == "{zt0}") {
10994 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
10995 }
10996
10997 // Use the default implementation in TargetLowering to convert the register
10998 // constraint into a member of a register class.
10999 std::pair<unsigned, const TargetRegisterClass *> Res;
11000 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11001
11002 // Not found as a standard register?
11003 if (!Res.second) {
11004 unsigned Size = Constraint.size();
11005 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11006 tolower(c: Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11007 int RegNo;
11008 bool Failed = Constraint.slice(Start: 2, End: Size - 1).getAsInteger(Radix: 10, Result&: RegNo);
11009 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11010 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11011 // By default we'll emit v0-v31 for this unless there's a modifier where
11012 // we'll emit the correct register as well.
11013 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11014 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11015 Res.second = &AArch64::FPR64RegClass;
11016 } else {
11017 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11018 Res.second = &AArch64::FPR128RegClass;
11019 }
11020 }
11021 }
11022 }
11023
11024 if (Res.second && !Subtarget->hasFPARMv8() &&
11025 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11026 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11027 return std::make_pair(x: 0U, y: nullptr);
11028
11029 return Res;
11030}
11031
11032EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
11033 llvm::Type *Ty,
11034 bool AllowUnknown) const {
11035 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11036 return EVT(MVT::i64x8);
11037
11038 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11039}
11040
11041/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11042/// vector. If it is invalid, don't add anything to Ops.
11043void AArch64TargetLowering::LowerAsmOperandForConstraint(
11044 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11045 SelectionDAG &DAG) const {
11046 SDValue Result;
11047
11048 // Currently only support length 1 constraints.
11049 if (Constraint.size() != 1)
11050 return;
11051
11052 char ConstraintLetter = Constraint[0];
11053 switch (ConstraintLetter) {
11054 default:
11055 break;
11056
  // This set of constraints deals with valid constants for various
  // instructions. Validate and return a target constant for them if we can.
11059 case 'z': {
11060 // 'z' maps to xzr or wzr so it needs an input of 0.
11061 if (!isNullConstant(V: Op))
11062 return;
11063
11064 if (Op.getValueType() == MVT::i64)
11065 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11066 else
11067 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11068 break;
11069 }
11070 case 'S':
11071 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11072 // supported for PIC while "s" isn't, making "s" less useful. We implement
11073 // "S" but not "s".
11074 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint: "s", Ops, DAG);
11075 break;
11076
11077 case 'I':
11078 case 'J':
11079 case 'K':
11080 case 'L':
11081 case 'M':
11082 case 'N':
11083 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op);
11084 if (!C)
11085 return;
11086
11087 // Grab the value and do some validation.
11088 uint64_t CVal = C->getZExtValue();
11089 switch (ConstraintLetter) {
11090 // The I constraint applies only to simple ADD or SUB immediate operands:
11091 // i.e. 0 to 4095 with optional shift by 12
11092 // The J constraint applies only to ADD or SUB immediates that would be
11093 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11094 // instruction [or vice versa], in other words -1 to -4095 with optional
11095 // left shift by 12.
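    // For example, 4095 and (4095 << 12) both satisfy 'I', while -1 and -4095
    // (each optionally shifted left by 12) satisfy 'J'.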
11096 case 'I':
11097 if (isUInt<12>(x: CVal) || isShiftedUInt<12, 12>(x: CVal))
11098 break;
11099 return;
11100 case 'J': {
11101 uint64_t NVal = -C->getSExtValue();
11102 if (isUInt<12>(x: NVal) || isShiftedUInt<12, 12>(x: NVal)) {
11103 CVal = C->getSExtValue();
11104 break;
11105 }
11106 return;
11107 }
11108 // The K and L constraints apply *only* to logical immediates, including
11109 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11110 // been removed and MOV should be used). So these constraints have to
11111 // distinguish between bit patterns that are valid 32-bit or 64-bit
11112 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11113 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11114 // versa.
11115 case 'K':
11116 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32))
11117 break;
11118 return;
11119 case 'L':
11120 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64))
11121 break;
11122 return;
11123 // The M and N constraints are a superset of K and L respectively, for use
11124 // with the MOV (immediate) alias. As well as the logical immediates they
11125 // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11127 // (M) or 64-bit 0x1234000000000000 (N) etc.
11128 // As a note some of this code is liberally stolen from the asm parser.
11129 case 'M': {
11130 if (!isUInt<32>(x: CVal))
11131 return;
11132 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32))
11133 break;
11134 if ((CVal & 0xFFFF) == CVal)
11135 break;
11136 if ((CVal & 0xFFFF0000ULL) == CVal)
11137 break;
11138 uint64_t NCVal = ~(uint32_t)CVal;
11139 if ((NCVal & 0xFFFFULL) == NCVal)
11140 break;
11141 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11142 break;
11143 return;
11144 }
11145 case 'N': {
11146 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64))
11147 break;
11148 if ((CVal & 0xFFFFULL) == CVal)
11149 break;
11150 if ((CVal & 0xFFFF0000ULL) == CVal)
11151 break;
11152 if ((CVal & 0xFFFF00000000ULL) == CVal)
11153 break;
11154 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11155 break;
11156 uint64_t NCVal = ~CVal;
11157 if ((NCVal & 0xFFFFULL) == NCVal)
11158 break;
11159 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11160 break;
11161 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11162 break;
11163 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11164 break;
11165 return;
11166 }
11167 default:
11168 return;
11169 }
11170
11171 // All assembler immediates are 64-bit integers.
11172 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11173 break;
11174 }
11175
11176 if (Result.getNode()) {
11177 Ops.push_back(x: Result);
11178 return;
11179 }
11180
11181 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11182}
11183
11184//===----------------------------------------------------------------------===//
11185// AArch64 Advanced SIMD Support
11186//===----------------------------------------------------------------------===//
11187
11188/// WidenVector - Given a value in the V64 register class, produce the
11189/// equivalent value in the V128 register class.
11190static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
11191 EVT VT = V64Reg.getValueType();
11192 unsigned NarrowSize = VT.getVectorNumElements();
11193 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11194 MVT WideTy = MVT::getVectorVT(VT: EltTy, NumElements: 2 * NarrowSize);
11195 SDLoc DL(V64Reg);
11196
11197 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11198 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11199}
11200
11201/// getExtFactor - Determine the adjustment factor for the position when
11202/// generating an "extract from vector registers" instruction.
11203static unsigned getExtFactor(SDValue &V) {
11204 EVT EltType = V.getValueType().getVectorElementType();
11205 return EltType.getSizeInBits() / 8;
11206}
11207
11208// Check if a vector is built from one vector via extracted elements of
11209// another together with an AND mask, ensuring that all elements fit
11210// within range. This can be reconstructed using AND and NEON's TBL1.
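// A rough sketch of the per-element pattern being matched:
//   Op[i] = extract_vector_elt(SourceVec,
//             and(extract_vector_elt(MaskSourceVec, i), AndMask[i]))
// (the AND is optional, and an ANY_EXTEND may sit in between); this is
// rebuilt below as tbl1(SourceVec, and(MaskSourceVec, build_vector(AndMask))).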
11211SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
11212 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11213 SDLoc dl(Op);
11214 EVT VT = Op.getValueType();
11215 assert(!VT.isScalableVector() &&
11216 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11217
11218 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11219 // directly to TBL1.
11220 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11221 return SDValue();
11222
11223 unsigned NumElts = VT.getVectorNumElements();
11224 assert((NumElts == 8 || NumElts == 16) &&
11225 "Need to have exactly 8 or 16 elements in vector.");
11226
11227 SDValue SourceVec;
11228 SDValue MaskSourceVec;
11229 SmallVector<SDValue, 16> AndMaskConstants;
11230
11231 for (unsigned i = 0; i < NumElts; ++i) {
11232 SDValue V = Op.getOperand(i);
11233 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11234 return SDValue();
11235
11236 SDValue OperandSourceVec = V.getOperand(i: 0);
11237 if (!SourceVec)
11238 SourceVec = OperandSourceVec;
11239 else if (SourceVec != OperandSourceVec)
11240 return SDValue();
11241
11242 // This only looks at shuffles with elements that are
11243 // a) truncated by a constant AND mask extracted from a mask vector, or
11244 // b) extracted directly from a mask vector.
11245 SDValue MaskSource = V.getOperand(i: 1);
11246 if (MaskSource.getOpcode() == ISD::AND) {
11247 if (!isa<ConstantSDNode>(Val: MaskSource.getOperand(i: 1)))
11248 return SDValue();
11249
11250 AndMaskConstants.push_back(Elt: MaskSource.getOperand(i: 1));
11251 MaskSource = MaskSource->getOperand(Num: 0);
11252 } else if (!AndMaskConstants.empty()) {
11253 // Either all or no operands should have an AND mask.
11254 return SDValue();
11255 }
11256
11257 // An ANY_EXTEND may be inserted between the AND and the source vector
11258 // extraction. We don't care about that, so we can just skip it.
11259 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11260 MaskSource = MaskSource.getOperand(i: 0);
11261
11262 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11263 return SDValue();
11264
11265 SDValue MaskIdx = MaskSource.getOperand(i: 1);
11266 if (!isa<ConstantSDNode>(Val: MaskIdx) ||
11267 !cast<ConstantSDNode>(Val&: MaskIdx)->getConstantIntValue()->equalsInt(V: i))
11268 return SDValue();
11269
11270 // We only apply this if all elements come from the same vector with the
11271 // same vector type.
11272 if (!MaskSourceVec) {
11273 MaskSourceVec = MaskSource->getOperand(Num: 0);
11274 if (MaskSourceVec.getValueType() != VT)
11275 return SDValue();
11276 } else if (MaskSourceVec != MaskSource->getOperand(Num: 0)) {
11277 return SDValue();
11278 }
11279 }
11280
11281 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11282 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11283 // insert, we know that the index in the mask must be smaller than the number
11284 // of elements in the source, or we would have an out-of-bounds access.
11285 if (NumElts == 8)
11286 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11287 DAG.getUNDEF(VT));
11288
11289 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11290 if (!AndMaskConstants.empty())
11291 MaskSourceVec = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: MaskSourceVec,
11292 N2: DAG.getBuildVector(VT, DL: dl, Ops: AndMaskConstants));
11293
11294 return DAG.getNode(
11295 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11296 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11297 MaskSourceVec);
11298}
11299
11300// Gather data to see if the operation can be modelled as a
11301// shuffle in combination with VEXTs.
11302SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
11303 SelectionDAG &DAG) const {
11304 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11305 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11306 SDLoc dl(Op);
11307 EVT VT = Op.getValueType();
11308 assert(!VT.isScalableVector() &&
11309 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11310 unsigned NumElts = VT.getVectorNumElements();
11311
11312 struct ShuffleSourceInfo {
11313 SDValue Vec;
11314 unsigned MinElt;
11315 unsigned MaxElt;
11316
11317 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11318 // be compatible with the shuffle we intend to construct. As a result
11319 // ShuffleVec will be some sliding window into the original Vec.
11320 SDValue ShuffleVec;
11321
    // Code should guarantee that element i in Vec starts at element
    // "WindowBase + i * WindowScale" in ShuffleVec.
11324 int WindowBase;
11325 int WindowScale;
11326
11327 ShuffleSourceInfo(SDValue Vec)
11328 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11329 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11330
11331 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11332 };
11333
11334 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11335 // node.
11336 SmallVector<ShuffleSourceInfo, 2> Sources;
11337 for (unsigned i = 0; i < NumElts; ++i) {
11338 SDValue V = Op.getOperand(i);
11339 if (V.isUndef())
11340 continue;
11341 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11342 !isa<ConstantSDNode>(Val: V.getOperand(i: 1)) ||
11343 V.getOperand(i: 0).getValueType().isScalableVector()) {
11344 LLVM_DEBUG(
11345 dbgs() << "Reshuffle failed: "
11346 "a shuffle can only come from building a vector from "
11347 "various elements of other fixed-width vectors, provided "
11348 "their indices are constant\n");
11349 return SDValue();
11350 }
11351
11352 // Add this element source to the list if it's not already there.
11353 SDValue SourceVec = V.getOperand(i: 0);
11354 auto Source = find(Range&: Sources, Val: SourceVec);
11355 if (Source == Sources.end())
11356 Source = Sources.insert(I: Sources.end(), Elt: ShuffleSourceInfo(SourceVec));
11357
11358 // Update the minimum and maximum lane number seen.
11359 unsigned EltNo = V.getConstantOperandVal(i: 1);
11360 Source->MinElt = std::min(a: Source->MinElt, b: EltNo);
11361 Source->MaxElt = std::max(a: Source->MaxElt, b: EltNo);
11362 }
11363
11364 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11365 // better than moving to/from gpr registers for larger vectors.
11366 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11367 // Construct a mask for the tbl. We may need to adjust the index for types
11368 // larger than i8.
11369 SmallVector<unsigned, 16> Mask;
11370 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11371 for (unsigned I = 0; I < NumElts; ++I) {
11372 SDValue V = Op.getOperand(i: I);
11373 if (V.isUndef()) {
11374 for (unsigned OF = 0; OF < OutputFactor; OF++)
11375 Mask.push_back(Elt: -1);
11376 continue;
11377 }
11378 // Set the Mask lanes adjusted for the size of the input and output
11379 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11380 // output element, adjusted in their positions per input and output types.
11381 unsigned Lane = V.getConstantOperandVal(i: 1);
11382 for (unsigned S = 0; S < Sources.size(); S++) {
11383 if (V.getOperand(i: 0) == Sources[S].Vec) {
11384 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11385 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11386 for (unsigned OF = 0; OF < OutputFactor; OF++)
11387 Mask.push_back(Elt: InputBase + OF);
11388 break;
11389 }
11390 }
11391 }
11392
11393 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11394 // v16i8, and the TBLMask
11395 SmallVector<SDValue, 16> TBLOperands;
11396 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11397 ? Intrinsic::aarch64_neon_tbl3
11398 : Intrinsic::aarch64_neon_tbl4,
11399 dl, MVT::i32));
11400 for (unsigned i = 0; i < Sources.size(); i++) {
11401 SDValue Src = Sources[i].Vec;
11402 EVT SrcVT = Src.getValueType();
11403 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11404 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11405 "Expected a legally typed vector");
11406 if (SrcVT.is64BitVector())
11407 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11408 DAG.getUNDEF(MVT::v8i8));
11409 TBLOperands.push_back(Elt: Src);
11410 }
11411
11412 SmallVector<SDValue, 16> TBLMask;
11413 for (unsigned i = 0; i < Mask.size(); i++)
11414 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11415 assert((Mask.size() == 8 || Mask.size() == 16) &&
11416 "Expected a v8i8 or v16i8 Mask");
11417 TBLOperands.push_back(
11418 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11419
11420 SDValue Shuffle =
11421 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11422 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11423 return DAG.getBitcast(VT, V: Shuffle);
11424 }
11425
11426 if (Sources.size() > 2) {
11427 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11428 << "sensible when at most two source vectors are "
11429 << "involved\n");
11430 return SDValue();
11431 }
11432
11433 // Find out the smallest element size among result and two sources, and use
11434 // it as element size to build the shuffle_vector.
11435 EVT SmallestEltTy = VT.getVectorElementType();
11436 for (auto &Source : Sources) {
11437 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11438 if (SrcEltTy.bitsLT(VT: SmallestEltTy)) {
11439 SmallestEltTy = SrcEltTy;
11440 }
11441 }
11442 unsigned ResMultiplier =
11443 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11444 uint64_t VTSize = VT.getFixedSizeInBits();
11445 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11446 EVT ShuffleVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SmallestEltTy, NumElements: NumElts);
11447
11448 // If the source vector is too wide or too narrow, we may nevertheless be able
11449 // to construct a compatible shuffle either by concatenating it with UNDEF or
11450 // extracting a suitable range of elements.
11451 for (auto &Src : Sources) {
11452 EVT SrcVT = Src.ShuffleVec.getValueType();
11453
11454 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11455 if (SrcVTSize == TypeSize::getFixed(ExactSize: VTSize))
11456 continue;
11457
11458 // This stage of the search produces a source with the same element type as
11459 // the original, but with a total width matching the BUILD_VECTOR output.
11460 EVT EltVT = SrcVT.getVectorElementType();
11461 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11462 EVT DestVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumSrcElts);
11463
11464 if (SrcVTSize.getFixedValue() < VTSize) {
11465 assert(2 * SrcVTSize == VTSize);
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle we just concatenate it with UNDEF to reach the full width.
11468 Src.ShuffleVec =
11469 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: DestVT, N1: Src.ShuffleVec,
11470 N2: DAG.getUNDEF(VT: Src.ShuffleVec.getValueType()));
11471 continue;
11472 }
11473
11474 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11475 LLVM_DEBUG(
11476 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11477 return SDValue();
11478 }
11479
11480 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11481 LLVM_DEBUG(
11482 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11483 return SDValue();
11484 }
11485
11486 if (Src.MinElt >= NumSrcElts) {
11487 // The extraction can just take the second half
11488 Src.ShuffleVec =
11489 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11490 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11491 Src.WindowBase = -NumSrcElts;
11492 } else if (Src.MaxElt < NumSrcElts) {
11493 // The extraction can just take the first half
11494 Src.ShuffleVec =
11495 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11496 DAG.getConstant(0, dl, MVT::i64));
11497 } else {
11498 // An actual VEXT is needed
11499 SDValue VEXTSrc1 =
11500 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11501 DAG.getConstant(0, dl, MVT::i64));
11502 SDValue VEXTSrc2 =
11503 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11504 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11505 unsigned Imm = Src.MinElt * getExtFactor(V&: VEXTSrc1);
11506
11507 if (!SrcVT.is64BitVector()) {
11508 LLVM_DEBUG(
11509 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
11510 "for SVE vectors.");
11511 return SDValue();
11512 }
11513
11514 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
11515 VEXTSrc2,
11516 DAG.getConstant(Imm, dl, MVT::i32));
11517 Src.WindowBase = -Src.MinElt;
11518 }
11519 }
11520
11521 // Another possible incompatibility occurs from the vector element types. We
11522 // can fix this by bitcasting the source vectors to the same type we intend
11523 // for the shuffle.
11524 for (auto &Src : Sources) {
11525 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
11526 if (SrcEltTy == SmallestEltTy)
11527 continue;
11528 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
11529 if (DAG.getDataLayout().isBigEndian()) {
11530 Src.ShuffleVec =
11531 DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT: ShuffleVT, Operand: Src.ShuffleVec);
11532 } else {
11533 Src.ShuffleVec = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ShuffleVT, Operand: Src.ShuffleVec);
11534 }
11535 Src.WindowScale =
11536 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11537 Src.WindowBase *= Src.WindowScale;
11538 }
11539
11540 // Final check before we try to actually produce a shuffle.
11541 LLVM_DEBUG(for (auto Src
11542 : Sources)
11543 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
11544
  // The stars all align; our next step is to produce the mask for the shuffle.
11546 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
11547 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
11548 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
11549 SDValue Entry = Op.getOperand(i);
11550 if (Entry.isUndef())
11551 continue;
11552
11553 auto Src = find(Range&: Sources, Val: Entry.getOperand(i: 0));
11554 int EltNo = cast<ConstantSDNode>(Val: Entry.getOperand(i: 1))->getSExtValue();
11555
    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an
    // implicit trunc. So only the low std::min(SrcBits, DestBits) bits
    // actually get defined in this segment.
11559 EVT OrigEltTy = Entry.getOperand(i: 0).getValueType().getVectorElementType();
11560 int BitsDefined = std::min(a: OrigEltTy.getScalarSizeInBits(),
11561 b: VT.getScalarSizeInBits());
11562 int LanesDefined = BitsDefined / BitsPerShuffleLane;
11563
11564 // This source is expected to fill ResMultiplier lanes of the final shuffle,
11565 // starting at the appropriate offset.
11566 int *LaneMask = &Mask[i * ResMultiplier];
11567
11568 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
11569 ExtractBase += NumElts * (Src - Sources.begin());
11570 for (int j = 0; j < LanesDefined; ++j)
11571 LaneMask[j] = ExtractBase + j;
11572 }
11573
11574 // Final check before we try to produce nonsense...
11575 if (!isShuffleMaskLegal(M: Mask, VT: ShuffleVT)) {
11576 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
11577 return SDValue();
11578 }
11579
11580 SDValue ShuffleOps[] = { DAG.getUNDEF(VT: ShuffleVT), DAG.getUNDEF(VT: ShuffleVT) };
11581 for (unsigned i = 0; i < Sources.size(); ++i)
11582 ShuffleOps[i] = Sources[i].ShuffleVec;
11583
11584 SDValue Shuffle = DAG.getVectorShuffle(VT: ShuffleVT, dl, N1: ShuffleOps[0],
11585 N2: ShuffleOps[1], Mask);
11586 SDValue V;
11587 if (DAG.getDataLayout().isBigEndian()) {
11588 V = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Shuffle);
11589 } else {
11590 V = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Shuffle);
11591 }
11592
11593 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
11594 dbgs() << "Reshuffle, creating node: "; V.dump(););
11595
11596 return V;
11597}
11598
// Check if an EXT instruction can handle the shuffle mask when the vector
// sources of the shuffle are the same.
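// For example, for <8 x i8> the mask <6, 7, 0, 1, 2, 3, 4, 5> is accepted
// with Imm == 6: the indices increase by one and wrap around the end of the
// single source vector.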
11601static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11602 unsigned NumElts = VT.getVectorNumElements();
11603
11604 // Assume that the first shuffle index is not UNDEF. Fail if it is.
11605 if (M[0] < 0)
11606 return false;
11607
11608 Imm = M[0];
11609
11610 // If this is a VEXT shuffle, the immediate value is the index of the first
11611 // element. The other shuffle indices must be the successive elements after
11612 // the first one.
11613 unsigned ExpectedElt = Imm;
11614 for (unsigned i = 1; i < NumElts; ++i) {
11615 // Increment the expected index. If it wraps around, just follow it
11616 // back to index zero and keep going.
11617 ++ExpectedElt;
11618 if (ExpectedElt == NumElts)
11619 ExpectedElt = 0;
11620
11621 if (M[i] < 0)
11622 continue; // ignore UNDEF indices
11623 if (ExpectedElt != static_cast<unsigned>(M[i]))
11624 return false;
11625 }
11626
11627 return true;
11628}
11629
11630// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11631// v4i32s. This is really a truncate, which we can construct out of (legal)
11632// concats and truncate nodes.
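// For example, a v16i8 build_vector taking lanes 0-3 of four v4i32 values A,
// B, C and D in that order becomes, roughly,
//   concat(trunc(concat(trunc(A), trunc(B))), trunc(concat(trunc(C), trunc(D))))
// where the inner truncates go v4i32 -> v4i16 and the outer ones v8i16 -> v8i8.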
11633static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
11634 if (V.getValueType() != MVT::v16i8)
11635 return SDValue();
11636 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
11637
11638 for (unsigned X = 0; X < 4; X++) {
11639 // Check the first item in each group is an extract from lane 0 of a v4i32
11640 // or v4i16.
11641 SDValue BaseExt = V.getOperand(i: X * 4);
11642 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11643 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
11644 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
11645 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
11646 BaseExt.getConstantOperandVal(1) != 0)
11647 return SDValue();
11648 SDValue Base = BaseExt.getOperand(i: 0);
11649 // And check the other items are extracts from the same vector.
11650 for (unsigned Y = 1; Y < 4; Y++) {
11651 SDValue Ext = V.getOperand(i: X * 4 + Y);
11652 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11653 Ext.getOperand(i: 0) != Base ||
11654 !isa<ConstantSDNode>(Val: Ext.getOperand(i: 1)) ||
11655 Ext.getConstantOperandVal(i: 1) != Y)
11656 return SDValue();
11657 }
11658 }
11659
  // Turn the buildvector into a series of truncates and concats, which will
  // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
  // concatenated in pairs to produce two v8i16s. These are both truncated to
  // v8i8 and concatenated together.
11664 SDLoc DL(V);
11665 SDValue Trunc[4] = {
11666 V.getOperand(i: 0).getOperand(i: 0), V.getOperand(i: 4).getOperand(i: 0),
11667 V.getOperand(i: 8).getOperand(i: 0), V.getOperand(i: 12).getOperand(i: 0)};
11668 for (SDValue &V : Trunc)
11669 if (V.getValueType() == MVT::v4i32)
11670 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
11671 SDValue Concat0 =
11672 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
11673 SDValue Concat1 =
11674 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
11675 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
11676 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
11677 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
11678}
11679
/// Check if a vector shuffle corresponds to a DUP instruction with a larger
/// element width than the vector lane type. If that is the case the function
/// returns true and writes the value of the DUP instruction lane operand into
/// DupLaneOp.
11684static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11685 unsigned &DupLaneOp) {
11686 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11687 "Only possible block sizes for wide DUP are: 16, 32, 64");
11688
11689 if (BlockSize <= VT.getScalarSizeInBits())
11690 return false;
11691 if (BlockSize % VT.getScalarSizeInBits() != 0)
11692 return false;
11693 if (VT.getSizeInBits() % BlockSize != 0)
11694 return false;
11695
11696 size_t SingleVecNumElements = VT.getVectorNumElements();
11697 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11698 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11699
11700 // We are looking for masks like
11701 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
11702 // might be replaced by 'undefined'. BlockIndices will eventually contain
11703 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
11704 // for the above examples)
11705 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11706 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11707 for (size_t I = 0; I < NumEltsPerBlock; I++) {
11708 int Elt = M[BlockIndex * NumEltsPerBlock + I];
11709 if (Elt < 0)
11710 continue;
11711 // For now we don't support shuffles that use the second operand
11712 if ((unsigned)Elt >= SingleVecNumElements)
11713 return false;
11714 if (BlockElts[I] < 0)
11715 BlockElts[I] = Elt;
11716 else if (BlockElts[I] != Elt)
11717 return false;
11718 }
11719
11720 // We found a candidate block (possibly with some undefs). It must be a
11721 // sequence of consecutive integers starting with a value divisible by
11722 // NumEltsPerBlock with some values possibly replaced by undef-s.
11723
11724 // Find first non-undef element
11725 auto FirstRealEltIter = find_if(Range&: BlockElts, P: [](int Elt) { return Elt >= 0; });
11726 assert(FirstRealEltIter != BlockElts.end() &&
11727 "Shuffle with all-undefs must have been caught by previous cases, "
11728 "e.g. isSplat()");
11729 if (FirstRealEltIter == BlockElts.end()) {
11730 DupLaneOp = 0;
11731 return true;
11732 }
11733
11734 // Index of FirstRealElt in BlockElts
11735 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11736
11737 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11738 return false;
11739 // BlockElts[0] must have the following value if it isn't undef:
11740 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11741
11742 // Check the first element
11743 if (Elt0 % NumEltsPerBlock != 0)
11744 return false;
11745 // Check that the sequence indeed consists of consecutive integers (modulo
11746 // undefs)
11747 for (size_t I = 0; I < NumEltsPerBlock; I++)
11748 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11749 return false;
11750
11751 DupLaneOp = Elt0 / NumEltsPerBlock;
11752 return true;
11753}
11754
// Check if an EXT instruction can handle the shuffle mask when the vector
// sources of the shuffle are different.
11757static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11758 unsigned &Imm) {
11759 // Look for the first non-undef element.
11760 const int *FirstRealElt = find_if(Range&: M, P: [](int Elt) { return Elt >= 0; });
11761
  // Benefit from APInt to handle overflow when calculating the expected element.
11763 unsigned NumElts = VT.getVectorNumElements();
11764 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11765 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11766 // The following shuffle indices must be the successive elements after the
11767 // first real element.
11768 bool FoundWrongElt = std::any_of(first: FirstRealElt + 1, last: M.end(), pred: [&](int Elt) {
11769 return Elt != ExpectedElt++ && Elt != -1;
11770 });
11771 if (FoundWrongElt)
11772 return false;
11773
11774 // The index of an EXT is the first element if it is not UNDEF.
11775 // Watch out for the beginning UNDEFs. The EXT index should be the expected
11776 // value of the first element. E.g.
11777 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11778 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11779 // ExpectedElt is the last mask index plus 1.
11780 Imm = ExpectedElt.getZExtValue();
11781
  // There are two different cases that require reversing the input vectors.
11783 // For example, for vector <4 x i32> we have the following cases,
11784 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
11785 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
11786 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
11787 // to reverse two input vectors.
11788 if (Imm < NumElts)
11789 ReverseEXT = true;
11790 else
11791 Imm -= NumElts;
11792
11793 return true;
11794}
11795
11796/// isREVMask - Check if a vector shuffle corresponds to a REV
11797/// instruction with the specified blocksize. (The order of the elements
11798/// within each block of the vector is reversed.)
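/// For example, for v8i8 and BlockSize == 32 the mask
/// <3, 2, 1, 0, 7, 6, 5, 4> reverses the bytes within each 32-bit block and
/// therefore corresponds to REV32.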
11799static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
11800 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
11801 BlockSize == 128) &&
11802 "Only possible block sizes for REV are: 16, 32, 64, 128");
11803
11804 unsigned EltSz = VT.getScalarSizeInBits();
11805 unsigned NumElts = VT.getVectorNumElements();
11806 unsigned BlockElts = M[0] + 1;
11807 // If the first shuffle index is UNDEF, be optimistic.
11808 if (M[0] < 0)
11809 BlockElts = BlockSize / EltSz;
11810
11811 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
11812 return false;
11813
11814 for (unsigned i = 0; i < NumElts; ++i) {
11815 if (M[i] < 0)
11816 continue; // ignore UNDEF indices
11817 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
11818 return false;
11819 }
11820
11821 return true;
11822}
11823
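/// isTRNMask - Check whether a shuffle mask is a transpose mask. For example,
/// for v4i32 the mask <0, 4, 2, 6> interleaves the even lanes of the two
/// sources (WhichResult == 0, i.e. TRN1) and <1, 5, 3, 7> the odd lanes
/// (WhichResult == 1, i.e. TRN2).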
11824static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11825 unsigned NumElts = VT.getVectorNumElements();
11826 if (NumElts % 2 != 0)
11827 return false;
11828 WhichResult = (M[0] == 0 ? 0 : 1);
11829 for (unsigned i = 0; i < NumElts; i += 2) {
11830 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11831 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
11832 return false;
11833 }
11834 return true;
11835}
11836
11837/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11838/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11839/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11840static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11841 unsigned NumElts = VT.getVectorNumElements();
11842 if (NumElts % 2 != 0)
11843 return false;
11844 WhichResult = (M[0] == 0 ? 0 : 1);
11845 unsigned Idx = WhichResult * NumElts / 2;
11846 for (unsigned i = 0; i != NumElts; i += 2) {
11847 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11848 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11849 return false;
11850 Idx += 1;
11851 }
11852
11853 return true;
11854}
11855
11856/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11857/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
11859static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11860 unsigned Half = VT.getVectorNumElements() / 2;
11861 WhichResult = (M[0] == 0 ? 0 : 1);
11862 for (unsigned j = 0; j != 2; ++j) {
11863 unsigned Idx = WhichResult;
11864 for (unsigned i = 0; i != Half; ++i) {
11865 int MIdx = M[i + j * Half];
11866 if (MIdx >= 0 && (unsigned)MIdx != Idx)
11867 return false;
11868 Idx += 2;
11869 }
11870 }
11871
11872 return true;
11873}
11874
11875/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11876/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11877/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11878static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11879 unsigned NumElts = VT.getVectorNumElements();
11880 if (NumElts % 2 != 0)
11881 return false;
11882 WhichResult = (M[0] == 0 ? 0 : 1);
11883 for (unsigned i = 0; i < NumElts; i += 2) {
11884 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11885 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11886 return false;
11887 }
11888 return true;
11889}
11890
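/// isINSMask - Check whether a shuffle is, up to one lane, the identity of one
/// of its sources. For example, with 4 input elements the mask <0, 1, 6, 3>
/// matches the LHS everywhere except lane 2, so DstIsLeft == true and
/// Anomaly == 2; the shuffle can then be done as a single lane insert (INS)
/// from the RHS.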
11891static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11892 bool &DstIsLeft, int &Anomaly) {
11893 if (M.size() != static_cast<size_t>(NumInputElements))
11894 return false;
11895
11896 int NumLHSMatch = 0, NumRHSMatch = 0;
11897 int LastLHSMismatch = -1, LastRHSMismatch = -1;
11898
11899 for (int i = 0; i < NumInputElements; ++i) {
11900 if (M[i] == -1) {
11901 ++NumLHSMatch;
11902 ++NumRHSMatch;
11903 continue;
11904 }
11905
11906 if (M[i] == i)
11907 ++NumLHSMatch;
11908 else
11909 LastLHSMismatch = i;
11910
11911 if (M[i] == i + NumInputElements)
11912 ++NumRHSMatch;
11913 else
11914 LastRHSMismatch = i;
11915 }
11916
11917 if (NumLHSMatch == NumInputElements - 1) {
11918 DstIsLeft = true;
11919 Anomaly = LastLHSMismatch;
11920 return true;
11921 } else if (NumRHSMatch == NumInputElements - 1) {
11922 DstIsLeft = false;
11923 Anomaly = LastRHSMismatch;
11924 return true;
11925 }
11926
11927 return false;
11928}
11929
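/// isConcatMask - Check whether a shuffle producing a 128-bit vector amounts
/// to concatenating (the low halves of) its two sources. For example, for
/// v4i32 with SplitLHS == true the mask <0, 1, 4, 5> selects the low half of
/// each 128-bit input, i.e. a CONCAT of the two low 64-bit halves.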
11930static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
11931 if (VT.getSizeInBits() != 128)
11932 return false;
11933
11934 unsigned NumElts = VT.getVectorNumElements();
11935
11936 for (int I = 0, E = NumElts / 2; I != E; I++) {
11937 if (Mask[I] != I)
11938 return false;
11939 }
11940
11941 int Offset = NumElts / 2;
11942 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
11943 if (Mask[I] != I + SplitLHS * Offset)
11944 return false;
11945 }
11946
11947 return true;
11948}
11949
11950static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
11951 SDLoc DL(Op);
11952 EVT VT = Op.getValueType();
11953 SDValue V0 = Op.getOperand(i: 0);
11954 SDValue V1 = Op.getOperand(i: 1);
11955 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask();
11956
11957 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
11958 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
11959 return SDValue();
11960
11961 bool SplitV0 = V0.getValueSizeInBits() == 128;
11962
11963 if (!isConcatMask(Mask, VT, SplitLHS: SplitV0))
11964 return SDValue();
11965
11966 EVT CastVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
11967 if (SplitV0) {
11968 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
11969 DAG.getConstant(0, DL, MVT::i64));
11970 }
11971 if (V1.getValueSizeInBits() == 128) {
11972 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
11973 DAG.getConstant(0, DL, MVT::i64));
11974 }
11975 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: V0, N2: V1);
11976}
11977
11978/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
11979/// the specified operations to build the shuffle. ID is the perfect-shuffle
/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect
/// shuffle table entry and LHS/RHS are the immediate inputs for this stage of
/// the shuffle.
11983static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
11984 SDValue V2, unsigned PFEntry, SDValue LHS,
11985 SDValue RHS, SelectionDAG &DAG,
11986 const SDLoc &dl) {
11987 unsigned OpNum = (PFEntry >> 26) & 0x0F;
11988 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
11989 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
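  // For reference: a PFEntry packs the operation into bits [29:26] and two
  // 13-bit operand IDs into bits [25:13] and [12:0]; each ID is itself a
  // base-9 encoding of four lane indices, with 8 standing for undef (see
  // getPFIDLane below).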
11990
11991 enum {
11992 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
11993 OP_VREV,
11994 OP_VDUP0,
11995 OP_VDUP1,
11996 OP_VDUP2,
11997 OP_VDUP3,
11998 OP_VEXT1,
11999 OP_VEXT2,
12000 OP_VEXT3,
12001 OP_VUZPL, // VUZP, left result
12002 OP_VUZPR, // VUZP, right result
12003 OP_VZIPL, // VZIP, left result
12004 OP_VZIPR, // VZIP, right result
12005 OP_VTRNL, // VTRN, left result
12006 OP_VTRNR, // VTRN, right result
12007 OP_MOVLANE // Move lane. RHSID is the lane to move into
12008 };
12009
12010 if (OpNum == OP_COPY) {
12011 if (LHSID == (1 * 9 + 2) * 9 + 3)
12012 return LHS;
12013 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12014 return RHS;
12015 }
12016
12017 if (OpNum == OP_MOVLANE) {
12018 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12019 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12020 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12021 Elt = 3 - Elt;
12022 while (Elt > 0) {
12023 ID /= 9;
12024 Elt--;
12025 }
12026 return (ID % 9 == 8) ? -1 : ID % 9;
12027 };
12028
12029 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12030 // get the lane to move from the PFID, which is always from the
12031 // original vectors (V1 or V2).
12032 SDValue OpLHS = GeneratePerfectShuffle(
12033 ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12034 EVT VT = OpLHS.getValueType();
12035 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12036 unsigned ExtLane = 0;
12037 SDValue Input;
12038
12039 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12040 // convert into a higher type.
12041 if (RHSID & 0x4) {
12042 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12043 if (MaskElt == -1)
12044 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12045 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12046 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12047 Input = MaskElt < 2 ? V1 : V2;
12048 if (VT.getScalarSizeInBits() == 16) {
12049 Input = DAG.getBitcast(MVT::v2f32, Input);
12050 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12051 } else {
12052 assert(VT.getScalarSizeInBits() == 32 &&
12053 "Expected 16 or 32 bit shuffle elemements");
12054 Input = DAG.getBitcast(MVT::v2f64, Input);
12055 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12056 }
12057 } else {
12058 int MaskElt = getPFIDLane(ID, RHSID);
12059 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12060 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12061 Input = MaskElt < 4 ? V1 : V2;
12062 // Be careful about creating illegal types. Use f16 instead of i16.
12063 if (VT == MVT::v4i16) {
12064 Input = DAG.getBitcast(MVT::v4f16, Input);
12065 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12066 }
12067 }
12068 SDValue Ext = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl,
12069 VT: Input.getValueType().getVectorElementType(),
12070 N1: Input, N2: DAG.getVectorIdxConstant(Val: ExtLane, DL: dl));
12071 SDValue Ins =
12072 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT: Input.getValueType(), N1: OpLHS,
12073 N2: Ext, N3: DAG.getVectorIdxConstant(Val: RHSID & 0x3, DL: dl));
12074 return DAG.getBitcast(VT, V: Ins);
12075 }
12076
12077 SDValue OpLHS, OpRHS;
12078 OpLHS = GeneratePerfectShuffle(ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS,
12079 RHS, DAG, dl);
12080 OpRHS = GeneratePerfectShuffle(ID: RHSID, V1, V2, PFEntry: PerfectShuffleTable[RHSID], LHS,
12081 RHS, DAG, dl);
12082 EVT VT = OpLHS.getValueType();
12083
12084 switch (OpNum) {
12085 default:
12086 llvm_unreachable("Unknown shuffle opcode!");
12087 case OP_VREV:
12088 // VREV divides the vector in half and swaps within the half.
12089 if (VT.getVectorElementType() == MVT::i32 ||
12090 VT.getVectorElementType() == MVT::f32)
12091 return DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT, Operand: OpLHS);
12092 // vrev <4 x i16> -> REV32
12093 if (VT.getVectorElementType() == MVT::i16 ||
12094 VT.getVectorElementType() == MVT::f16 ||
12095 VT.getVectorElementType() == MVT::bf16)
12096 return DAG.getNode(Opcode: AArch64ISD::REV32, DL: dl, VT, Operand: OpLHS);
12097 // vrev <4 x i8> -> REV16
12098 assert(VT.getVectorElementType() == MVT::i8);
12099 return DAG.getNode(Opcode: AArch64ISD::REV16, DL: dl, VT, Operand: OpLHS);
12100 case OP_VDUP0:
12101 case OP_VDUP1:
12102 case OP_VDUP2:
12103 case OP_VDUP3: {
12104 EVT EltTy = VT.getVectorElementType();
12105 unsigned Opcode;
12106 if (EltTy == MVT::i8)
12107 Opcode = AArch64ISD::DUPLANE8;
12108 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12109 Opcode = AArch64ISD::DUPLANE16;
12110 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12111 Opcode = AArch64ISD::DUPLANE32;
12112 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12113 Opcode = AArch64ISD::DUPLANE64;
12114 else
12115 llvm_unreachable("Invalid vector element type?");
12116
12117 if (VT.getSizeInBits() == 64)
12118 OpLHS = WidenVector(V64Reg: OpLHS, DAG);
12119 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12120 return DAG.getNode(Opcode, DL: dl, VT, N1: OpLHS, N2: Lane);
12121 }
12122 case OP_VEXT1:
12123 case OP_VEXT2:
12124 case OP_VEXT3: {
12125 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(V&: OpLHS);
12126 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12127 DAG.getConstant(Imm, dl, MVT::i32));
12128 }
12129 case OP_VUZPL:
12130 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12131 case OP_VUZPR:
12132 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12133 case OP_VZIPL:
12134 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12135 case OP_VZIPR:
12136 return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12137 case OP_VTRNL:
12138 return DAG.getNode(Opcode: AArch64ISD::TRN1, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12139 case OP_VTRNR:
12140 return DAG.getNode(Opcode: AArch64ISD::TRN2, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12141 }
12142}
12143
12144static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12145 SelectionDAG &DAG) {
12146 // Check to see if we can use the TBL instruction.
12147 SDValue V1 = Op.getOperand(i: 0);
12148 SDValue V2 = Op.getOperand(i: 1);
12149 SDLoc DL(Op);
12150
12151 EVT EltVT = Op.getValueType().getVectorElementType();
12152 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12153
12154 bool Swap = false;
12155 if (V1.isUndef() || isZerosVector(N: V1.getNode())) {
12156 std::swap(a&: V1, b&: V2);
12157 Swap = true;
12158 }
12159
12160 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12161 // out of range values with 0s. We do need to make sure that any out-of-range
12162 // values are really out-of-range for a v16i8 vector.
12163 bool IsUndefOrZero = V2.isUndef() || isZerosVector(N: V2.getNode());
12164 MVT IndexVT = MVT::v8i8;
12165 unsigned IndexLen = 8;
12166 if (Op.getValueSizeInBits() == 128) {
12167 IndexVT = MVT::v16i8;
12168 IndexLen = 16;
12169 }
12170
12171 SmallVector<SDValue, 8> TBLMask;
12172 for (int Val : ShuffleMask) {
12173 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12174 unsigned Offset = Byte + Val * BytesPerElt;
12175 if (Swap)
12176 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12177 if (IsUndefOrZero && Offset >= IndexLen)
12178 Offset = 255;
12179 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12180 }
12181 }
12182
12183 SDValue V1Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V1);
12184 SDValue V2Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V2);
12185
12186 SDValue Shuffle;
12187 if (IsUndefOrZero) {
12188 if (IndexLen == 8)
12189 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12190 Shuffle = DAG.getNode(
12191 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12192 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12193 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12194 } else {
12195 if (IndexLen == 8) {
12196 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12197 Shuffle = DAG.getNode(
12198 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12199 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12200 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12201 } else {
12202 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12203 // cannot currently represent the register constraints on the input
12204 // table registers.
12205 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12206 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12207 // IndexLen));
12208 Shuffle = DAG.getNode(
12209 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12210 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12211 V2Cst,
12212 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12213 }
12214 }
12215 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle);
12216}
12217
12218static unsigned getDUPLANEOp(EVT EltType) {
12219 if (EltType == MVT::i8)
12220 return AArch64ISD::DUPLANE8;
12221 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12222 return AArch64ISD::DUPLANE16;
12223 if (EltType == MVT::i32 || EltType == MVT::f32)
12224 return AArch64ISD::DUPLANE32;
12225 if (EltType == MVT::i64 || EltType == MVT::f64)
12226 return AArch64ISD::DUPLANE64;
12227
12228 llvm_unreachable("Invalid vector element type?");
12229}
12230
12231static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12232 unsigned Opcode, SelectionDAG &DAG) {
12233 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12234 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12235 // Match: dup (bitcast (extract_subv X, C)), LaneC
12236 if (BitCast.getOpcode() != ISD::BITCAST ||
12237 BitCast.getOperand(i: 0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12238 return false;
12239
12240 // The extract index must align in the destination type. That may not
12241 // happen if the bitcast is from narrow to wide type.
12242 SDValue Extract = BitCast.getOperand(i: 0);
12243 unsigned ExtIdx = Extract.getConstantOperandVal(i: 1);
12244 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12245 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12246 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12247 if (ExtIdxInBits % CastedEltBitWidth != 0)
12248 return false;
12249
12250 // Can't handle cases where vector size is not 128-bit
12251 if (!Extract.getOperand(i: 0).getValueType().is128BitVector())
12252 return false;
12253
12254 // Update the lane value by offsetting with the scaled extract index.
12255 LaneC += ExtIdxInBits / CastedEltBitWidth;
12256
12257 // Determine the casted vector type of the wide vector input.
12258 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12259 // Examples:
12260 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12261 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12262 unsigned SrcVecNumElts =
12263 Extract.getOperand(i: 0).getValueSizeInBits() / CastedEltBitWidth;
12264 CastVT = MVT::getVectorVT(VT: BitCast.getSimpleValueType().getScalarType(),
12265 NumElements: SrcVecNumElts);
12266 return true;
12267 };
12268 MVT CastVT;
12269 if (getScaledOffsetDup(V, Lane, CastVT)) {
12270 V = DAG.getBitcast(VT: CastVT, V: V.getOperand(i: 0).getOperand(i: 0));
12271 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12272 V.getOperand(i: 0).getValueType().is128BitVector()) {
12273 // The lane is incremented by the index of the extract.
12274 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12275 Lane += V.getConstantOperandVal(i: 1);
12276 V = V.getOperand(i: 0);
12277 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12278 // The lane is decremented if we are splatting from the 2nd operand.
12279 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12280 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12281 Lane -= Idx * VT.getVectorNumElements() / 2;
12282 V = WidenVector(V64Reg: V.getOperand(i: Idx), DAG);
12283 } else if (VT.getSizeInBits() == 64) {
12284 // Widen the operand to 128-bit register with undef.
12285 V = WidenVector(V64Reg: V, DAG);
12286 }
12287 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12288}
12289
12290 // Return true if we can derive a new, half-length shuffle mask from M: every
12291 // two adjacent mask values must be consecutive (or undef), with the first of
12292 // each pair being an even number.
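// e.g. for v4i32, M = <6,7,0,1> yields NewMask = <3,0>.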
12293static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12294 SmallVectorImpl<int> &NewMask) {
12295 unsigned NumElts = VT.getVectorNumElements();
12296 if (NumElts % 2 != 0)
12297 return false;
12298
12299 NewMask.clear();
12300 for (unsigned i = 0; i < NumElts; i += 2) {
12301 int M0 = M[i];
12302 int M1 = M[i + 1];
12303
12304 // If both elements are undef, new mask is undef too.
12305 if (M0 == -1 && M1 == -1) {
12306 NewMask.push_back(Elt: -1);
12307 continue;
12308 }
12309
12310 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12311 NewMask.push_back(Elt: M1 / 2);
12312 continue;
12313 }
12314
12315 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12316 NewMask.push_back(Elt: M0 / 2);
12317 continue;
12318 }
12319
12320 NewMask.clear();
12321 return false;
12322 }
12323
12324 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12325 return true;
12326}
12327
12328// Try to widen element type to get a new mask value for a better permutation
12329 // sequence, so that we can use NEON shuffle instructions, such as ZIP1/2,
12330// UZP1/2, TRN1/2, REV, INS, etc.
12331// For example:
12332// shufflevector <4 x i32> %a, <4 x i32> %b,
12333// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12334// is equivalent to:
12335// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12336// Finally, we can get:
12337// mov v0.d[0], v1.d[1]
12338static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12339 SDLoc DL(Op);
12340 EVT VT = Op.getValueType();
12341 EVT ScalarVT = VT.getVectorElementType();
12342 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12343 SDValue V0 = Op.getOperand(i: 0);
12344 SDValue V1 = Op.getOperand(i: 1);
12345 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask();
12346
12347   // When combining adjacent elements, e.g. two i16's -> i32 or two i32's -> i64,
12348   // we need to make sure the wider element type is legal. Thus, ElementSize
12349   // should not be larger than 32 bits, and the i1 type is also excluded.
12350 if (ElementSize > 32 || ElementSize == 1)
12351 return SDValue();
12352
12353 SmallVector<int, 8> NewMask;
12354 if (isWideTypeMask(M: Mask, VT, NewMask)) {
12355 MVT NewEltVT = VT.isFloatingPoint()
12356 ? MVT::getFloatingPointVT(BitWidth: ElementSize * 2)
12357 : MVT::getIntegerVT(BitWidth: ElementSize * 2);
12358 MVT NewVT = MVT::getVectorVT(VT: NewEltVT, NumElements: VT.getVectorNumElements() / 2);
12359 if (DAG.getTargetLoweringInfo().isTypeLegal(VT: NewVT)) {
12360 V0 = DAG.getBitcast(VT: NewVT, V: V0);
12361 V1 = DAG.getBitcast(VT: NewVT, V: V1);
12362 return DAG.getBitcast(VT,
12363 V: DAG.getVectorShuffle(VT: NewVT, dl: DL, N1: V0, N2: V1, Mask: NewMask));
12364 }
12365 }
12366
12367 return SDValue();
12368}
12369
12370// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
12371static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12372 ArrayRef<int> ShuffleMask,
12373 SelectionDAG &DAG) {
12374 SDValue Tbl1 = Op->getOperand(Num: 0);
12375 SDValue Tbl2 = Op->getOperand(Num: 1);
12376 SDLoc dl(Op);
12377 SDValue Tbl2ID =
12378 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12379
12380 EVT VT = Op.getValueType();
12381 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12382 Tbl1->getOperand(Num: 0) != Tbl2ID ||
12383 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12384 Tbl2->getOperand(Num: 0) != Tbl2ID)
12385 return SDValue();
12386
12387 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12388 Tbl2->getValueType(0) != MVT::v16i8)
12389 return SDValue();
12390
12391 SDValue Mask1 = Tbl1->getOperand(Num: 3);
12392 SDValue Mask2 = Tbl2->getOperand(Num: 3);
12393 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
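  // Each tbl2 indexes its own two 16-byte table registers as bytes 0..31. In
  // the combined tbl4 the second pair of table registers occupies bytes 32..63,
  // so entries taken from Tbl2's mask are rebased by adding 32.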
12394 for (unsigned I = 0; I < 16; I++) {
12395 if (ShuffleMask[I] < 16)
12396 TBLMaskParts[I] = Mask1->getOperand(Num: ShuffleMask[I]);
12397 else {
12398 auto *C =
12399 dyn_cast<ConstantSDNode>(Val: Mask2->getOperand(Num: ShuffleMask[I] - 16));
12400 if (!C)
12401 return SDValue();
12402 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12403 }
12404 }
12405
12406 SDValue TBLMask = DAG.getBuildVector(VT, DL: dl, Ops: TBLMaskParts);
12407 SDValue ID =
12408 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12409
12410 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12411 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12412 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12413}
12414
12415// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12416// but we don't have an appropriate instruction,
12417// so custom-lower it as ZIP1-with-zeros.
12418SDValue
12419AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12420 SelectionDAG &DAG) const {
12421 SDLoc dl(Op);
12422 EVT VT = Op.getValueType();
12423 SDValue SrcOp = Op.getOperand(i: 0);
12424 EVT SrcVT = SrcOp.getValueType();
12425 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12426 "Unexpected extension factor.");
12427 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12428 // FIXME: support multi-step zipping?
12429 if (Scale != 2)
12430 return SDValue();
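  // A single ZIP1 with a zero vector interleaves each source element with a
  // zero element; viewed through the bitcast below this is (at least for the
  // little-endian case) a 2x zero-extension of the low half, e.g.
  //   (v4i32 zero_extend_vector_inreg (v8i16 X)) --> bitcast v4i32 (zip1 X, zeroes)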
12431 SDValue Zeros = DAG.getConstant(Val: 0, DL: dl, VT: SrcVT);
12432 return DAG.getBitcast(VT,
12433 V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT: SrcVT, N1: SrcOp, N2: Zeros));
12434}
12435
12436SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12437 SelectionDAG &DAG) const {
12438 SDLoc dl(Op);
12439 EVT VT = Op.getValueType();
12440
12441 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
12442
12443 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
12444 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12445
12446 // Convert shuffles that are directly supported on NEON to target-specific
12447 // DAG nodes, instead of keeping them as shuffles and matching them again
12448 // during code selection. This is more efficient and avoids the possibility
12449 // of inconsistencies between legalization and selection.
12450 ArrayRef<int> ShuffleMask = SVN->getMask();
12451
12452 SDValue V1 = Op.getOperand(i: 0);
12453 SDValue V2 = Op.getOperand(i: 1);
12454
12455 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12456 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12457 "Unexpected VECTOR_SHUFFLE mask size!");
12458
12459 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12460 return Res;
12461
12462 if (SVN->isSplat()) {
12463 int Lane = SVN->getSplatIndex();
12464     // If this is an undef splat, generate it via "just" vdup, if possible.
12465 if (Lane == -1)
12466 Lane = 0;
12467
12468 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12469 return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT: V1.getValueType(),
12470 Operand: V1.getOperand(i: 0));
12471 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12472 // constant. If so, we can just reference the lane's definition directly.
12473 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12474 !isa<ConstantSDNode>(Val: V1.getOperand(i: Lane)))
12475 return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: V1.getOperand(i: Lane));
12476
12477 // Otherwise, duplicate from the lane of the input vector.
12478 unsigned Opcode = getDUPLANEOp(EltType: V1.getValueType().getVectorElementType());
12479 return constructDup(V: V1, Lane, dl, VT, Opcode, DAG);
12480 }
12481
12482 // Check if the mask matches a DUP for a wider element
12483 for (unsigned LaneSize : {64U, 32U, 16U}) {
12484 unsigned Lane = 0;
12485 if (isWideDUPMask(M: ShuffleMask, VT, BlockSize: LaneSize, DupLaneOp&: Lane)) {
12486 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12487 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12488 : AArch64ISD::DUPLANE16;
12489 // Cast V1 to an integer vector with required lane size
12490 MVT NewEltTy = MVT::getIntegerVT(BitWidth: LaneSize);
12491 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12492 MVT NewVecTy = MVT::getVectorVT(VT: NewEltTy, NumElements: NewEltCount);
12493 V1 = DAG.getBitcast(VT: NewVecTy, V: V1);
12494       // Construct the DUP instruction
12495 V1 = constructDup(V: V1, Lane, dl, VT: NewVecTy, Opcode, DAG);
12496 // Cast back to the original type
12497 return DAG.getBitcast(VT, V: V1);
12498 }
12499 }
12500
12501 if (isREVMask(M: ShuffleMask, VT, BlockSize: 64))
12502 return DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12503 if (isREVMask(M: ShuffleMask, VT, BlockSize: 32))
12504 return DAG.getNode(Opcode: AArch64ISD::REV32, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12505 if (isREVMask(M: ShuffleMask, VT, BlockSize: 16))
12506 return DAG.getNode(Opcode: AArch64ISD::REV16, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12507
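  // A whole-vector reversal of 8-bit or 16-bit elements can be done as REV64
  // (reverse within each 64-bit half) followed by EXT #8 to swap the two halves.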
12508 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
12509 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
12510 ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size())) {
12511 SDValue Rev = DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT, Operand: V1);
12512 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12513 DAG.getConstant(8, dl, MVT::i32));
12514 }
12515
12516 bool ReverseEXT = false;
12517 unsigned Imm;
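  // Note that isEXTMask reports the rotation in elements, while the EXT
  // immediate below is a byte count, hence the scaling by getExtFactor (the
  // element size in bytes).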
12518 if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm)) {
12519 if (ReverseEXT)
12520 std::swap(a&: V1, b&: V2);
12521 Imm *= getExtFactor(V&: V1);
12522 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12523 DAG.getConstant(Imm, dl, MVT::i32));
12524 } else if (V2->isUndef() && isSingletonEXTMask(M: ShuffleMask, VT, Imm)) {
12525 Imm *= getExtFactor(V&: V1);
12526 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12527 DAG.getConstant(Imm, dl, MVT::i32));
12528 }
12529
12530 unsigned WhichResult;
12531 if (isZIPMask(M: ShuffleMask, VT, WhichResultOut&: WhichResult)) {
12532 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12533 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12534 }
12535 if (isUZPMask(M: ShuffleMask, VT, WhichResultOut&: WhichResult)) {
12536 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12537 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12538 }
12539 if (isTRNMask(M: ShuffleMask, VT, WhichResult)) {
12540 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12541 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12542 }
12543
12544 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
12545 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12546 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1);
12547 }
12548 if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
12549 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12550 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1);
12551 }
12552 if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
12553 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12554 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1);
12555 }
12556
12557 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
12558 return Concat;
12559
12560 bool DstIsLeft;
12561 int Anomaly;
12562 int NumInputElements = V1.getValueType().getVectorNumElements();
12563 if (isINSMask(M: ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12564 SDValue DstVec = DstIsLeft ? V1 : V2;
12565 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12566
12567 SDValue SrcVec = V1;
12568 int SrcLane = ShuffleMask[Anomaly];
12569 if (SrcLane >= NumInputElements) {
12570 SrcVec = V2;
12571 SrcLane -= VT.getVectorNumElements();
12572 }
12573 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12574
12575 EVT ScalarVT = VT.getVectorElementType();
12576
12577 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12578 ScalarVT = MVT::i32;
12579
12580 return DAG.getNode(
12581 Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: DstVec,
12582 N2: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: ScalarVT, N1: SrcVec, N2: SrcLaneV),
12583 N3: DstLaneV);
12584 }
12585
12586 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12587 return NewSD;
12588
12589 // If the shuffle is not directly supported and it has 4 elements, use
12590 // the PerfectShuffle-generated table to synthesize it from other shuffles.
12591 unsigned NumElts = VT.getVectorNumElements();
12592 if (NumElts == 4) {
12593 unsigned PFIndexes[4];
12594 for (unsigned i = 0; i != 4; ++i) {
12595 if (ShuffleMask[i] < 0)
12596 PFIndexes[i] = 8;
12597 else
12598 PFIndexes[i] = ShuffleMask[i];
12599 }
12600
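    // Each PFIndex is a base-9 digit: 0-7 select a source lane and 8 marks an
    // undef lane, matching the encoding used by PerfectShuffleTable.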
12601 // Compute the index in the perfect shuffle table.
12602 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12603 PFIndexes[2] * 9 + PFIndexes[3];
12604 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12605 return GeneratePerfectShuffle(ID: PFTableIndex, V1, V2, PFEntry, LHS: V1, RHS: V2, DAG,
12606 dl);
12607 }
12608
12609 return GenerateTBL(Op, ShuffleMask, DAG);
12610}
12611
12612SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
12613 SelectionDAG &DAG) const {
12614 EVT VT = Op.getValueType();
12615
12616 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
12617 return LowerToScalableOp(Op, DAG);
12618
12619 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
12620 "Unexpected vector type!");
12621
12622 // We can handle the constant cases during isel.
12623 if (isa<ConstantSDNode>(Val: Op.getOperand(i: 0)))
12624 return Op;
12625
12626 // There isn't a natural way to handle the general i1 case, so we use some
12627 // trickery with whilelo.
12628 SDLoc DL(Op);
12629 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
12630 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
12631 DAG.getValueType(MVT::i1));
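  // Sign-extending the i1 yields 0 or all-ones. whilelo is an unsigned compare,
  // so whilelo(0, 0) produces an all-false predicate while whilelo(0, -1)
  // (i.e. against UINT64_MAX) produces an all-true predicate.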
12632 SDValue ID =
12633 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
12634 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12635 if (VT == MVT::nxv1i1)
12636 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
12637 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
12638 Zero, SplatVal),
12639 Zero);
12640 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: ID, N2: Zero, N3: SplatVal);
12641}
12642
12643SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12644 SelectionDAG &DAG) const {
12645 SDLoc DL(Op);
12646
12647 EVT VT = Op.getValueType();
12648 if (!isTypeLegal(VT) || !VT.isScalableVector())
12649 return SDValue();
12650
12651 // Current lowering only supports the SVE-ACLE types.
12652 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
12653 return SDValue();
12654
12655   // The DUPQ operation is independent of element type so normalise to i64s.
12656 SDValue Idx128 = Op.getOperand(i: 2);
12657
12658 // DUPQ can be used when idx is in range.
12659 auto *CIdx = dyn_cast<ConstantSDNode>(Val&: Idx128);
12660 if (CIdx && (CIdx->getZExtValue() <= 3)) {
12661 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12662 return DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT, N1: Op.getOperand(i: 1), N2: CI);
12663 }
12664
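  // Otherwise the index is not a constant in range for DUPLANE128, so fall back
  // to an explicit TBL following the ACLE recipe below.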
12665 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12666
12667 // The ACLE says this must produce the same result as:
12668 // svtbl(data, svadd_x(svptrue_b64(),
12669 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12670 // index * 2))
12671 SDValue One = DAG.getConstant(1, DL, MVT::i64);
12672 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12673
12674 // create the vector 0,1,0,1,...
12675 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12676 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12677
12678 // create the vector idx64,idx64+1,idx64,idx64+1,...
12679 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12680 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12681 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
12682
12683 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12684 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12685 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: TBL);
12686}
12687
12688
12689static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12690 APInt &UndefBits) {
12691 EVT VT = BVN->getValueType(ResNo: 0);
12692 APInt SplatBits, SplatUndef;
12693 unsigned SplatBitSize;
12694 bool HasAnyUndefs;
12695 if (BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12696 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12697
12698 for (unsigned i = 0; i < NumSplats; ++i) {
12699 CnstBits <<= SplatBitSize;
12700 UndefBits <<= SplatBitSize;
12701 CnstBits |= SplatBits.zextOrTrunc(width: VT.getSizeInBits());
12702 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(width: VT.getSizeInBits());
12703 }
12704
12705 return true;
12706 }
12707
12708 return false;
12709}
12710
12711// Try 64-bit splatted SIMD immediate.
12712static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12713 const APInt &Bits) {
12714 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12715 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12716 EVT VT = Op.getValueType();
12717 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
12718
12719 if (AArch64_AM::isAdvSIMDModImmType10(Imm: Value)) {
12720 Value = AArch64_AM::encodeAdvSIMDModImmType10(Imm: Value);
12721
12722 SDLoc dl(Op);
12723 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12724 DAG.getConstant(Value, dl, MVT::i32));
12725 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12726 }
12727 }
12728
12729 return SDValue();
12730}
12731
12732// Try 32-bit splatted SIMD immediate.
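// When LHS is provided, the immediate is combined with *LHS through a
// two-operand node (e.g. ORRi/BICi) rather than materialized on its own.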
12733static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12734 const APInt &Bits,
12735 const SDValue *LHS = nullptr) {
12736 EVT VT = Op.getValueType();
12737 if (VT.isFixedLengthVector() &&
12738 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12739 return SDValue();
12740
12741 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12742 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12743 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12744 bool isAdvSIMDModImm = false;
12745 uint64_t Shift;
12746
12747 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Imm: Value))) {
12748 Value = AArch64_AM::encodeAdvSIMDModImmType1(Imm: Value);
12749 Shift = 0;
12750 }
12751 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Imm: Value))) {
12752 Value = AArch64_AM::encodeAdvSIMDModImmType2(Imm: Value);
12753 Shift = 8;
12754 }
12755 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Imm: Value))) {
12756 Value = AArch64_AM::encodeAdvSIMDModImmType3(Imm: Value);
12757 Shift = 16;
12758 }
12759 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Imm: Value))) {
12760 Value = AArch64_AM::encodeAdvSIMDModImmType4(Imm: Value);
12761 Shift = 24;
12762 }
12763
12764 if (isAdvSIMDModImm) {
12765 SDLoc dl(Op);
12766 SDValue Mov;
12767
12768 if (LHS)
12769 Mov = DAG.getNode(NewOp, dl, MovTy,
12770 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12771 DAG.getConstant(Value, dl, MVT::i32),
12772 DAG.getConstant(Shift, dl, MVT::i32));
12773 else
12774 Mov = DAG.getNode(NewOp, dl, MovTy,
12775 DAG.getConstant(Value, dl, MVT::i32),
12776 DAG.getConstant(Shift, dl, MVT::i32));
12777
12778 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12779 }
12780 }
12781
12782 return SDValue();
12783}
12784
12785// Try 16-bit splatted SIMD immediate.
12786static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12787 const APInt &Bits,
12788 const SDValue *LHS = nullptr) {
12789 EVT VT = Op.getValueType();
12790 if (VT.isFixedLengthVector() &&
12791 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12792 return SDValue();
12793
12794 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12795 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12796 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
12797 bool isAdvSIMDModImm = false;
12798 uint64_t Shift;
12799
12800 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Imm: Value))) {
12801 Value = AArch64_AM::encodeAdvSIMDModImmType5(Imm: Value);
12802 Shift = 0;
12803 }
12804 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Imm: Value))) {
12805 Value = AArch64_AM::encodeAdvSIMDModImmType6(Imm: Value);
12806 Shift = 8;
12807 }
12808
12809 if (isAdvSIMDModImm) {
12810 SDLoc dl(Op);
12811 SDValue Mov;
12812
12813 if (LHS)
12814 Mov = DAG.getNode(NewOp, dl, MovTy,
12815 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12816 DAG.getConstant(Value, dl, MVT::i32),
12817 DAG.getConstant(Shift, dl, MVT::i32));
12818 else
12819 Mov = DAG.getNode(NewOp, dl, MovTy,
12820 DAG.getConstant(Value, dl, MVT::i32),
12821 DAG.getConstant(Shift, dl, MVT::i32));
12822
12823 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12824 }
12825 }
12826
12827 return SDValue();
12828}
12829
12830// Try 32-bit splatted SIMD immediate with shifted ones.
12831static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
12832 SelectionDAG &DAG, const APInt &Bits) {
12833 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12834 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12835 EVT VT = Op.getValueType();
12836 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12837 bool isAdvSIMDModImm = false;
12838 uint64_t Shift;
12839
12840 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Imm: Value))) {
12841 Value = AArch64_AM::encodeAdvSIMDModImmType7(Imm: Value);
12842 Shift = 264;
12843 }
12844 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Imm: Value))) {
12845 Value = AArch64_AM::encodeAdvSIMDModImmType8(Imm: Value);
12846 Shift = 272;
12847 }
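    // Shift values of 264 and 272 are shifter immediates that should correspond
    // to "MSL #8" and "MSL #16" (MSL shift type encoded as 4, i.e.
    // (4 << 6) | amount).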
12848
12849 if (isAdvSIMDModImm) {
12850 SDLoc dl(Op);
12851 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12852 DAG.getConstant(Value, dl, MVT::i32),
12853 DAG.getConstant(Shift, dl, MVT::i32));
12854 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12855 }
12856 }
12857
12858 return SDValue();
12859}
12860
12861// Try 8-bit splatted SIMD immediate.
12862static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12863 const APInt &Bits) {
12864 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12865 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12866 EVT VT = Op.getValueType();
12867 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
12868
12869 if (AArch64_AM::isAdvSIMDModImmType9(Imm: Value)) {
12870 Value = AArch64_AM::encodeAdvSIMDModImmType9(Imm: Value);
12871
12872 SDLoc dl(Op);
12873 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12874 DAG.getConstant(Value, dl, MVT::i32));
12875 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12876 }
12877 }
12878
12879 return SDValue();
12880}
12881
12882// Try FP splatted SIMD immediate.
12883static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12884 const APInt &Bits) {
12885 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12886 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12887 EVT VT = Op.getValueType();
12888 bool isWide = (VT.getSizeInBits() == 128);
12889 MVT MovTy;
12890 bool isAdvSIMDModImm = false;
12891
12892 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Imm: Value))) {
12893 Value = AArch64_AM::encodeAdvSIMDModImmType11(Imm: Value);
12894 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
12895 }
12896 else if (isWide &&
12897 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Imm: Value))) {
12898 Value = AArch64_AM::encodeAdvSIMDModImmType12(Imm: Value);
12899 MovTy = MVT::v2f64;
12900 }
12901
12902 if (isAdvSIMDModImm) {
12903 SDLoc dl(Op);
12904 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12905 DAG.getConstant(Value, dl, MVT::i32));
12906 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12907 }
12908 }
12909
12910 return SDValue();
12911}
12912
12913 // Specialized code to quickly find if PotentialBVec is a BuildVector that
12914 // consists of only the same constant int value, returned in the reference
12915 // arg ConstVal.
12916static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
12917 uint64_t &ConstVal) {
12918 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(Val: PotentialBVec);
12919 if (!Bvec)
12920 return false;
12921 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: 0));
12922 if (!FirstElt)
12923 return false;
12924 EVT VT = Bvec->getValueType(ResNo: 0);
12925 unsigned NumElts = VT.getVectorNumElements();
12926 for (unsigned i = 1; i < NumElts; ++i)
12927 if (dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: i)) != FirstElt)
12928 return false;
12929 ConstVal = FirstElt->getZExtValue();
12930 return true;
12931}
12932
12933static bool isAllInactivePredicate(SDValue N) {
12934 // Look through cast.
12935 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
12936 N = N.getOperand(i: 0);
12937
12938 return ISD::isConstantSplatVectorAllZeros(N: N.getNode());
12939}
12940
12941static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
12942 unsigned NumElts = N.getValueType().getVectorMinNumElements();
12943
12944 // Look through cast.
12945 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
12946 N = N.getOperand(i: 0);
12947 // When reinterpreting from a type with fewer elements the "new" elements
12948 // are not active, so bail if they're likely to be used.
12949 if (N.getValueType().getVectorMinNumElements() < NumElts)
12950 return false;
12951 }
12952
12953 if (ISD::isConstantSplatVectorAllOnes(N: N.getNode()))
12954 return true;
12955
12956 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
12957 // or smaller than the implicit element type represented by N.
12958 // NOTE: A larger element count implies a smaller element type.
12959 if (N.getOpcode() == AArch64ISD::PTRUE &&
12960 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
12961 return N.getValueType().getVectorMinNumElements() >= NumElts;
12962
12963 // If we're compiling for a specific vector-length, we can check if the
12964 // pattern's VL equals that of the scalable vector at runtime.
12965 if (N.getOpcode() == AArch64ISD::PTRUE) {
12966 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
12967 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
12968 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
12969 if (MaxSVESize && MinSVESize == MaxSVESize) {
12970 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
12971 unsigned PatNumElts =
12972 getNumElementsFromSVEPredPattern(Pattern: N.getConstantOperandVal(i: 0));
12973 return PatNumElts == (NumElts * VScale);
12974 }
12975 }
12976
12977 return false;
12978}
12979
12980// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
12981// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
12982 // BUILD_VECTOR with constant element C1, C2 is a constant, and:
12983// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
12984// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
12985// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
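// For example, with v8i16 elements:
//   (or (and X, <0x00ff,...>), (AArch64ISD::VSHL Y, 8)) --> (AArch64ISD::VSLI X, Y, 8)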
12986static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
12987 EVT VT = N->getValueType(ResNo: 0);
12988
12989 if (!VT.isVector())
12990 return SDValue();
12991
12992 SDLoc DL(N);
12993
12994 SDValue And;
12995 SDValue Shift;
12996
12997 SDValue FirstOp = N->getOperand(Num: 0);
12998 unsigned FirstOpc = FirstOp.getOpcode();
12999 SDValue SecondOp = N->getOperand(Num: 1);
13000 unsigned SecondOpc = SecondOp.getOpcode();
13001
13002 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13003 // a BICi in order to use an immediate instead of a register.
13004 // Is the other operand an shl or lshr? This will have been turned into:
13005 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13006 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13007 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13008 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13009 SecondOpc == AArch64ISD::SHL_PRED ||
13010 SecondOpc == AArch64ISD::SRL_PRED)) {
13011 And = FirstOp;
13012 Shift = SecondOp;
13013
13014 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13015 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13016 FirstOpc == AArch64ISD::SHL_PRED ||
13017 FirstOpc == AArch64ISD::SRL_PRED)) {
13018 And = SecondOp;
13019 Shift = FirstOp;
13020 } else
13021 return SDValue();
13022
13023 bool IsAnd = And.getOpcode() == ISD::AND;
13024 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13025 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13026 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13027 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13028
13029 // Is the shift amount constant and are all lanes active?
13030 uint64_t C2;
13031 if (ShiftHasPredOp) {
13032 if (!isAllActivePredicate(DAG, N: Shift.getOperand(i: 0)))
13033 return SDValue();
13034 APInt C;
13035 if (!ISD::isConstantSplatVector(N: Shift.getOperand(i: 2).getNode(), SplatValue&: C))
13036 return SDValue();
13037 C2 = C.getZExtValue();
13038 } else if (ConstantSDNode *C2node =
13039 dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
13040 C2 = C2node->getZExtValue();
13041 else
13042 return SDValue();
13043
13044 APInt C1AsAPInt;
13045 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13046 if (IsAnd) {
13047 // Is the and mask vector all constant?
13048 if (!ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: C1AsAPInt))
13049 return SDValue();
13050 } else {
13051 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13052 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 1));
13053 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 2));
13054 assert(C1nodeImm && C1nodeShift);
13055 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13056 C1AsAPInt = C1AsAPInt.zextOrTrunc(width: ElemSizeInBits);
13057 }
13058
13059 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13060 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13061 // how much one can shift elements of a particular size?
13062 if (C2 > ElemSizeInBits)
13063 return SDValue();
13064
13065 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(numBits: ElemSizeInBits, hiBitsSet: C2)
13066 : APInt::getLowBitsSet(numBits: ElemSizeInBits, loBitsSet: C2);
13067 if (C1AsAPInt != RequiredC1)
13068 return SDValue();
13069
13070 SDValue X = And.getOperand(i: 0);
13071 SDValue Y = ShiftHasPredOp ? Shift.getOperand(i: 1) : Shift.getOperand(i: 0);
13072 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13073 : Shift.getOperand(1);
13074
13075 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13076 SDValue ResultSLI = DAG.getNode(Opcode: Inst, DL, VT, N1: X, N2: Y, N3: Imm);
13077
13078 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13079 LLVM_DEBUG(N->dump(&DAG));
13080 LLVM_DEBUG(dbgs() << "into: \n");
13081 LLVM_DEBUG(ResultSLI->dump(&DAG));
13082
13083 ++NumShiftInserts;
13084 return ResultSLI;
13085}
13086
13087SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13088 SelectionDAG &DAG) const {
13089 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
13090 OverrideNEON: !Subtarget->isNeonAvailable()))
13091 return LowerToScalableOp(Op, DAG);
13092
13093 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13094 if (SDValue Res = tryLowerToSLI(N: Op.getNode(), DAG))
13095 return Res;
13096
13097 EVT VT = Op.getValueType();
13098 if (VT.isScalableVector())
13099 return Op;
13100
13101 SDValue LHS = Op.getOperand(i: 0);
13102 BuildVectorSDNode *BVN =
13103 dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 1).getNode());
13104 if (!BVN) {
13105 // OR commutes, so try swapping the operands.
13106 LHS = Op.getOperand(i: 1);
13107 BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 0).getNode());
13108 }
13109 if (!BVN)
13110 return Op;
13111
13112 APInt DefBits(VT.getSizeInBits(), 0);
13113 APInt UndefBits(VT.getSizeInBits(), 0);
13114 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
13115 SDValue NewOp;
13116
13117 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG,
13118 Bits: DefBits, LHS: &LHS)) ||
13119 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG,
13120 Bits: DefBits, LHS: &LHS)))
13121 return NewOp;
13122
13123 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG,
13124 Bits: UndefBits, LHS: &LHS)) ||
13125 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG,
13126 Bits: UndefBits, LHS: &LHS)))
13127 return NewOp;
13128 }
13129
13130 // We can always fall back to a non-immediate OR.
13131 return Op;
13132}
13133
13134// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13135// be truncated to fit element width.
13136static SDValue NormalizeBuildVector(SDValue Op,
13137 SelectionDAG &DAG) {
13138 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13139 SDLoc dl(Op);
13140 EVT VT = Op.getValueType();
13141   EVT EltTy = VT.getVectorElementType();
13142
13143 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13144 return Op;
13145
13146 SmallVector<SDValue, 16> Ops;
13147 for (SDValue Lane : Op->ops()) {
13148 // For integer vectors, type legalization would have promoted the
13149 // operands already. Otherwise, if Op is a floating-point splat
13150 // (with operands cast to integers), then the only possibilities
13151 // are constants and UNDEFs.
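    // e.g. an i8 lane holding the i32 constant 0x1ff is rebuilt as the i32
    // constant 0xff, so only the bits that fit the element type survive.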
13152 if (auto *CstLane = dyn_cast<ConstantSDNode>(Val&: Lane)) {
13153 APInt LowBits(EltTy.getSizeInBits(),
13154 CstLane->getZExtValue());
13155 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13156 } else if (Lane.getNode()->isUndef()) {
13157 Lane = DAG.getUNDEF(MVT::i32);
13158 } else {
13159 assert(Lane.getValueType() == MVT::i32 &&
13160 "Unexpected BUILD_VECTOR operand type");
13161 }
13162 Ops.push_back(Elt: Lane);
13163 }
13164 return DAG.getBuildVector(VT, DL: dl, Ops);
13165}
13166
13167static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13168 const AArch64Subtarget *ST) {
13169 EVT VT = Op.getValueType();
13170 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13171 "Expected a legal NEON vector");
13172
13173 APInt DefBits(VT.getSizeInBits(), 0);
13174 APInt UndefBits(VT.getSizeInBits(), 0);
13175 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode());
13176 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
13177 auto TryMOVIWithBits = [&](APInt DefBits) {
13178 SDValue NewOp;
13179 if ((NewOp =
13180 tryAdvSIMDModImm64(NewOp: AArch64ISD::MOVIedit, Op, DAG, Bits: DefBits)) ||
13181 (NewOp =
13182 tryAdvSIMDModImm32(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) ||
13183 (NewOp =
13184 tryAdvSIMDModImm321s(NewOp: AArch64ISD::MOVImsl, Op, DAG, Bits: DefBits)) ||
13185 (NewOp =
13186 tryAdvSIMDModImm16(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) ||
13187 (NewOp = tryAdvSIMDModImm8(NewOp: AArch64ISD::MOVI, Op, DAG, Bits: DefBits)) ||
13188 (NewOp = tryAdvSIMDModImmFP(NewOp: AArch64ISD::FMOV, Op, DAG, Bits: DefBits)))
13189 return NewOp;
13190
13191 APInt NotDefBits = ~DefBits;
13192 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::MVNIshift, Op, DAG,
13193 Bits: NotDefBits)) ||
13194 (NewOp = tryAdvSIMDModImm321s(NewOp: AArch64ISD::MVNImsl, Op, DAG,
13195 Bits: NotDefBits)) ||
13196 (NewOp =
13197 tryAdvSIMDModImm16(NewOp: AArch64ISD::MVNIshift, Op, DAG, Bits: NotDefBits)))
13198 return NewOp;
13199 return SDValue();
13200 };
13201 if (SDValue R = TryMOVIWithBits(DefBits))
13202 return R;
13203 if (SDValue R = TryMOVIWithBits(UndefBits))
13204 return R;
13205
13206 // See if a fneg of the constant can be materialized with a MOVI, etc
13207 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13208 // FNegate each sub-element of the constant
13209 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13210 APInt Neg = APInt::getHighBitsSet(numBits: FVT.getSizeInBits(), hiBitsSet: 1)
13211 .zext(width: VT.getSizeInBits());
13212 APInt NegBits(VT.getSizeInBits(), 0);
13213 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13214 for (unsigned i = 0; i < NumElts; i++)
13215 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13216 NegBits = DefBits ^ NegBits;
13217
13218 // Try to create the new constants with MOVI, and if so generate a fneg
13219 // for it.
13220 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13221 SDLoc DL(Op);
13222 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(VT: FVT, NumElements: NumElts);
13223 return DAG.getNode(
13224 Opcode: AArch64ISD::NVCAST, DL, VT,
13225 Operand: DAG.getNode(Opcode: ISD::FNEG, DL, VT: VFVT,
13226 Operand: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: VFVT, Operand: NewOp)));
13227 }
13228 return SDValue();
13229 };
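    // e.g. a v2f64 splat of -0.0 (sign bits only) XORs to an all-zero NegBits,
    // which MOVI materializes trivially; the FNEG then restores the original.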
13230 SDValue R;
13231 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13232 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13233 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13234 return R;
13235 }
13236
13237 return SDValue();
13238}
13239
13240SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13241 SelectionDAG &DAG) const {
13242 EVT VT = Op.getValueType();
13243
13244 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
13245 if (auto SeqInfo = cast<BuildVectorSDNode>(Val&: Op)->isConstantSequence()) {
13246 SDLoc DL(Op);
13247 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13248 SDValue Start = DAG.getConstant(Val: SeqInfo->first, DL, VT: ContainerVT);
13249 SDValue Steps = DAG.getStepVector(DL, ResVT: ContainerVT, StepVal: SeqInfo->second);
13250 SDValue Seq = DAG.getNode(Opcode: ISD::ADD, DL, VT: ContainerVT, N1: Start, N2: Steps);
13251 return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Seq);
13252 }
13253
13254 // Revert to common legalisation for all other variants.
13255 return SDValue();
13256 }
13257
13258 // Try to build a simple constant vector.
13259 Op = NormalizeBuildVector(Op, DAG);
13260   // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
13261   // abort.
13262 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13263 return SDValue();
13264
13265 // Certain vector constants, used to express things like logical NOT and
13266 // arithmetic NEG, are passed through unmodified. This allows special
13267 // patterns for these operations to match, which will lower these constants
13268 // to whatever is proven necessary.
13269 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode());
13270 if (BVN->isConstant()) {
13271 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13272 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13273 APInt Val(BitSize,
13274 Const->getAPIntValue().zextOrTrunc(width: BitSize).getZExtValue());
13275 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13276 return Op;
13277 }
13278 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13279 if (Const->isZero() && !Const->isNegative())
13280 return Op;
13281 }
13282
13283 if (SDValue V = ConstantBuildVector(Op, DAG, ST: Subtarget))
13284 return V;
13285
13286 // Scan through the operands to find some interesting properties we can
13287 // exploit:
13288 // 1) If only one value is used, we can use a DUP, or
13289 // 2) if only the low element is not undef, we can just insert that, or
13290 // 3) if only one constant value is used (w/ some non-constant lanes),
13291 // we can splat the constant value into the whole vector then fill
13292 // in the non-constant lanes.
13293 // 4) FIXME: If different constant values are used, but we can intelligently
13294 // select the values we'll be overwriting for the non-constant
13295 // lanes such that we can directly materialize the vector
13296 // some other way (MOVI, e.g.), we can be sneaky.
13297 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13298 SDLoc dl(Op);
13299 unsigned NumElts = VT.getVectorNumElements();
13300 bool isOnlyLowElement = true;
13301 bool usesOnlyOneValue = true;
13302 bool usesOnlyOneConstantValue = true;
13303 bool isConstant = true;
13304 bool AllLanesExtractElt = true;
13305 unsigned NumConstantLanes = 0;
13306 unsigned NumDifferentLanes = 0;
13307 unsigned NumUndefLanes = 0;
13308 SDValue Value;
13309 SDValue ConstantValue;
13310 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13311 unsigned ConsecutiveValCount = 0;
13312 SDValue PrevVal;
13313 for (unsigned i = 0; i < NumElts; ++i) {
13314 SDValue V = Op.getOperand(i);
13315 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13316 AllLanesExtractElt = false;
13317 if (V.isUndef()) {
13318 ++NumUndefLanes;
13319 continue;
13320 }
13321 if (i > 0)
13322 isOnlyLowElement = false;
13323 if (!isIntOrFPConstant(V))
13324 isConstant = false;
13325
13326 if (isIntOrFPConstant(V)) {
13327 ++NumConstantLanes;
13328 if (!ConstantValue.getNode())
13329 ConstantValue = V;
13330 else if (ConstantValue != V)
13331 usesOnlyOneConstantValue = false;
13332 }
13333
13334 if (!Value.getNode())
13335 Value = V;
13336 else if (V != Value) {
13337 usesOnlyOneValue = false;
13338 ++NumDifferentLanes;
13339 }
13340
13341 if (PrevVal != V) {
13342 ConsecutiveValCount = 0;
13343 PrevVal = V;
13344 }
13345
13346     // Keep the different values and their last consecutive counts. For example,
13347 //
13348 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13349 // t24, t24, t24, t24, t24, t24, t24, t24
13350 // t23 = consecutive count 8
13351 // t24 = consecutive count 8
13352 // ------------------------------------------------------------------
13353 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13354 // t24, t24, t24, t24, t24, t24, t24, t24
13355 // t23 = consecutive count 5
13356 // t24 = consecutive count 9
13357 DifferentValueMap[V] = ++ConsecutiveValCount;
13358 }
13359
13360 if (!Value.getNode()) {
13361 LLVM_DEBUG(
13362 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13363 return DAG.getUNDEF(VT);
13364 }
13365
13366 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13367 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13368 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13369 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(V: Value))) {
13370 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13371 "SCALAR_TO_VECTOR node\n");
13372 return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT, Operand: Value);
13373 }
13374
13375 if (AllLanesExtractElt) {
13376 SDNode *Vector = nullptr;
13377 bool Even = false;
13378 bool Odd = false;
13379 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13380 // the Odd pattern <1,3,5,...>.
13381 for (unsigned i = 0; i < NumElts; ++i) {
13382 SDValue V = Op.getOperand(i);
13383 const SDNode *N = V.getNode();
13384 if (!isa<ConstantSDNode>(Val: N->getOperand(Num: 1))) {
13385 Even = false;
13386 Odd = false;
13387 break;
13388 }
13389 SDValue N0 = N->getOperand(Num: 0);
13390
13391 // All elements are extracted from the same vector.
13392 if (!Vector) {
13393 Vector = N0.getNode();
13394 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13395 // BUILD_VECTOR.
13396 if (VT.getVectorElementType() !=
13397 N0.getValueType().getVectorElementType())
13398 break;
13399 } else if (Vector != N0.getNode()) {
13400 Odd = false;
13401 Even = false;
13402 break;
13403 }
13404
13405 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13406 // indices <1,3,5,...>.
13407 uint64_t Val = N->getConstantOperandVal(Num: 1);
13408 if (Val == 2 * i) {
13409 Even = true;
13410 continue;
13411 }
13412 if (Val - 1 == 2 * i) {
13413 Odd = true;
13414 continue;
13415 }
13416
13417 // Something does not match: abort.
13418 Odd = false;
13419 Even = false;
13420 break;
13421 }
13422 if (Even || Odd) {
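      // Split the common source vector into its two VT-sized halves; UZP1/UZP2
      // then select the even or odd lanes across the pair.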
13423 SDValue LHS =
13424 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13425 DAG.getConstant(0, dl, MVT::i64));
13426 SDValue RHS =
13427 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13428 DAG.getConstant(NumElts, dl, MVT::i64));
13429
13430 if (Even && !Odd)
13431 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS,
13432 N2: RHS);
13433 if (Odd && !Even)
13434 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS,
13435 N2: RHS);
13436 }
13437 }
13438
13439 // Use DUP for non-constant splats. For f32 constant splats, reduce to
13440 // i32 and try again.
13441 if (usesOnlyOneValue) {
13442 if (!isConstant) {
13443 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13444 Value.getValueType() != VT) {
13445 LLVM_DEBUG(
13446 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13447 return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: Value);
13448 }
13449
13450 // This is actually a DUPLANExx operation, which keeps everything vectory.
13451
13452 SDValue Lane = Value.getOperand(i: 1);
13453 Value = Value.getOperand(i: 0);
13454 if (Value.getValueSizeInBits() == 64) {
13455 LLVM_DEBUG(
13456 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13457 "widening it\n");
13458 Value = WidenVector(V64Reg: Value, DAG);
13459 }
13460
13461 unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType());
13462 return DAG.getNode(Opcode, DL: dl, VT, N1: Value, N2: Lane);
13463 }
13464
13465 if (VT.getVectorElementType().isFloatingPoint()) {
13466 SmallVector<SDValue, 8> Ops;
13467 EVT EltTy = VT.getVectorElementType();
13468 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13469 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13470 LLVM_DEBUG(
13471 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13472 "BITCASTS, and try again\n");
13473 MVT NewType = MVT::getIntegerVT(BitWidth: EltTy.getSizeInBits());
13474 for (unsigned i = 0; i < NumElts; ++i)
13475 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: NewType, Operand: Op.getOperand(i)));
13476 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: NewType, NumElements: NumElts);
13477 SDValue Val = DAG.getBuildVector(VT: VecVT, DL: dl, Ops);
13478 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13479 Val.dump(););
13480 Val = LowerBUILD_VECTOR(Op: Val, DAG);
13481 if (Val.getNode())
13482 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Val);
13483 }
13484 }
13485
13486 // If we need to insert a small number of different non-constant elements and
13487 // the vector width is sufficiently large, prefer using DUP with the common
13488 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13489 // skip the constant lane handling below.
13490 bool PreferDUPAndInsert =
13491 !isConstant && NumDifferentLanes >= 1 &&
13492 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13493 NumDifferentLanes >= NumConstantLanes;
13494
13495   // If only one constant value was used, and it was used for more than one
13496   // lane, start by splatting that value, then replace the non-constant lanes.
13497   // This is better than the default, which will perform a separate
13498   // initialization for each lane.
13499 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13500 // Firstly, try to materialize the splat constant.
13501 SDValue Val = DAG.getSplatBuildVector(VT, DL: dl, Op: ConstantValue);
13502 unsigned BitSize = VT.getScalarSizeInBits();
13503 APInt ConstantValueAPInt(1, 0);
13504 if (auto *C = dyn_cast<ConstantSDNode>(Val&: ConstantValue))
13505 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(width: BitSize);
13506 if (!isNullConstant(V: ConstantValue) && !isNullFPConstant(V: ConstantValue) &&
13507 !ConstantValueAPInt.isAllOnes()) {
13508 Val = ConstantBuildVector(Op: Val, DAG, ST: Subtarget);
13509 if (!Val)
13510 // Otherwise, materialize the constant and splat it.
13511 Val = DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: ConstantValue);
13512 }
13513
13514 // Now insert the non-constant lanes.
13515 for (unsigned i = 0; i < NumElts; ++i) {
13516 SDValue V = Op.getOperand(i);
13517 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13518 if (!isIntOrFPConstant(V))
13519 // Note that type legalization likely mucked about with the VT of the
13520 // source operand, so we may have to convert it here before inserting.
13521 Val = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: Val, N2: V, N3: LaneIdx);
13522 }
13523 return Val;
13524 }
13525
13526 // This will generate a load from the constant pool.
13527 if (isConstant) {
13528 LLVM_DEBUG(
13529 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13530 "expansion\n");
13531 return SDValue();
13532 }
13533
13534 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13535 // v4i32s. This is really a truncate, which we can construct out of (legal)
13536 // concats and truncate nodes.
13537 if (SDValue M = ReconstructTruncateFromBuildVector(V: Op, DAG))
13538 return M;
13539
13540 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
13541 if (NumElts >= 4) {
13542 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
13543 return Shuffle;
13544
13545 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
13546 return Shuffle;
13547 }
13548
13549 if (PreferDUPAndInsert) {
13550 // First, build a constant vector with the common element.
13551 SmallVector<SDValue, 8> Ops(NumElts, Value);
13552 SDValue NewVector = LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT, DL: dl, Ops), DAG);
13553 // Next, insert the elements that do not match the common value.
13554 for (unsigned I = 0; I < NumElts; ++I)
13555 if (Op.getOperand(I) != Value)
13556 NewVector =
13557 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
13558 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
13559
13560 return NewVector;
13561 }
13562
13563 // If vector consists of two different values, try to generate two DUPs and
13564 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
13565 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
13566 SmallVector<SDValue, 2> Vals;
13567 // Check the consecutive count of the value is the half number of vector
13568 // elements. In this case, we can use CONCAT_VECTORS. For example,
13569 //
13570 // canUseVECTOR_CONCAT = true;
13571 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13572 // t24, t24, t24, t24, t24, t24, t24, t24
13573 //
13574 // canUseVECTOR_CONCAT = false;
13575 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
13576 // t24, t24, t24, t24, t24, t24, t24, t24
13577 bool canUseVECTOR_CONCAT = true;
13578 for (auto Pair : DifferentValueMap) {
13579 // Check different values have same length which is NumElts / 2.
13580 if (Pair.second != NumElts / 2)
13581 canUseVECTOR_CONCAT = false;
13582 Vals.push_back(Elt: Pair.first);
13583 }
13584
13585 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
13586 // CONCAT_VECTORs. For example,
13587 //
13588 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
13589 // t24, t24, t24, t24, t24, t24, t24, t24
13590 // ==>
13591 // t26: v8i8 = AArch64ISD::DUP t23
13592 // t28: v8i8 = AArch64ISD::DUP t24
13593 // t29: v16i8 = concat_vectors t26, t28
13594 if (canUseVECTOR_CONCAT) {
13595 EVT SubVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
13596 if (isTypeLegal(VT: SubVT) && SubVT.isVector() &&
13597 SubVT.getVectorNumElements() >= 2) {
13598 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
13599 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
13600 SDValue DUP1 =
13601 LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL: dl, Ops: Ops1), DAG);
13602 SDValue DUP2 =
13603 LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL: dl, Ops: Ops2), DAG);
13604 SDValue CONCAT_VECTORS =
13605 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: DUP1, N2: DUP2);
13606 return CONCAT_VECTORS;
13607 }
13608 }
13609
13610 // Let's try to generate VECTOR_SHUFFLE. For example,
13611 //
13612 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
13613 // ==>
13614 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
13615 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
13616 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
13617 if (NumElts >= 8) {
13618 SmallVector<int, 16> MaskVec;
13619       // Build the mask for VECTOR_SHUFFLE.
13620 SDValue FirstLaneVal = Op.getOperand(i: 0);
13621 for (unsigned i = 0; i < NumElts; ++i) {
13622 SDValue Val = Op.getOperand(i);
13623 if (FirstLaneVal == Val)
13624 MaskVec.push_back(Elt: i);
13625 else
13626 MaskVec.push_back(Elt: i + NumElts);
13627 }
13628
13629 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
13630 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
13631 SDValue VEC1 = DAG.getBuildVector(VT, DL: dl, Ops: Ops1);
13632 SDValue VEC2 = DAG.getBuildVector(VT, DL: dl, Ops: Ops2);
13633 SDValue VECTOR_SHUFFLE =
13634 DAG.getVectorShuffle(VT, dl, N1: VEC1, N2: VEC2, Mask: MaskVec);
13635 return VECTOR_SHUFFLE;
13636 }
13637 }
13638
13639  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
13640  // know the default expansion would otherwise fall back on something even
13641  // worse. For a vector with one or two non-undef values the default is
13642  // scalar_to_vector for the elements followed by a shuffle (provided the
13643  // shuffle is valid for the target); for everything else it is element-by-
13644  // element materialization on the stack followed by a load.
13645 if (!isConstant && !usesOnlyOneValue) {
13646 LLVM_DEBUG(
13647 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
13648 "of INSERT_VECTOR_ELT\n");
13649
13650 SDValue Vec = DAG.getUNDEF(VT);
13651 SDValue Op0 = Op.getOperand(i: 0);
13652 unsigned i = 0;
13653
13654 // Use SCALAR_TO_VECTOR for lane zero to
13655 // a) Avoid a RMW dependency on the full vector register, and
13656 // b) Allow the register coalescer to fold away the copy if the
13657 // value is already in an S or D register, and we're forced to emit an
13658 // INSERT_SUBREG that we can't fold anywhere.
13659 //
13660 // We also allow types like i8 and i16 which are illegal scalar but legal
13661 // vector element types. After type-legalization the inserted value is
13662 // extended (i32) and it is safe to cast them to the vector type by ignoring
13663 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
13664 if (!Op0.isUndef()) {
13665 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
13666 Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT, Operand: Op0);
13667 ++i;
13668 }
13669 LLVM_DEBUG(if (i < NumElts) dbgs()
13670 << "Creating nodes for the other vector elements:\n";);
13671 for (; i < NumElts; ++i) {
13672 SDValue V = Op.getOperand(i);
13673 if (V.isUndef())
13674 continue;
13675 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13676 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: Vec, N2: V, N3: LaneIdx);
13677 }
13678 return Vec;
13679 }
13680
13681 LLVM_DEBUG(
13682 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
13683 "better alternative\n");
13684 return SDValue();
13685}
13686
13687SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13688 SelectionDAG &DAG) const {
13689 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
13690 OverrideNEON: !Subtarget->isNeonAvailable()))
13691 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13692
13693 assert(Op.getValueType().isScalableVector() &&
13694 isTypeLegal(Op.getValueType()) &&
13695 "Expected legal scalable vector type!");
13696
13697 if (isTypeLegal(VT: Op.getOperand(i: 0).getValueType())) {
13698 unsigned NumOperands = Op->getNumOperands();
13699 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13700 "Unexpected number of operands in CONCAT_VECTORS");
13701
13702 if (NumOperands == 2)
13703 return Op;
13704
13705 // Concat each pair of subvectors and pack into the lower half of the array.
13706 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13707 while (ConcatOps.size() > 1) {
13708 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13709 SDValue V1 = ConcatOps[I];
13710 SDValue V2 = ConcatOps[I + 1];
13711 EVT SubVT = V1.getValueType();
13712 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
13713 ConcatOps[I / 2] =
13714 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT: PairVT, N1: V1, N2: V2);
13715 }
13716 ConcatOps.resize(N: ConcatOps.size() / 2);
13717 }
13718 return ConcatOps[0];
13719 }
13720
13721 return SDValue();
13722}
13723
13724SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13725 SelectionDAG &DAG) const {
13726 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13727
13728 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
13729 OverrideNEON: !Subtarget->isNeonAvailable()))
13730 return LowerFixedLengthInsertVectorElt(Op, DAG);
13731
13732 EVT VT = Op.getOperand(i: 0).getValueType();
13733
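  // Inserting into a predicate (i1) vector is done on its promoted integer
  // counterpart and the result is truncated back to the predicate type;
  // illustratively, an insert into nxv16i1 is performed on nxv16i8 (the exact
  // promoted type comes from getPromotedVTForPredicate).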
13734 if (VT.getScalarType() == MVT::i1) {
13735 EVT VectorVT = getPromotedVTForPredicate(VT);
13736 SDLoc DL(Op);
13737 SDValue ExtendedVector =
13738 DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL, VT: VectorVT);
13739 SDValue ExtendedValue =
13740 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13741 VectorVT.getScalarType().getSizeInBits() < 32
13742 ? MVT::i32
13743 : VectorVT.getScalarType());
13744 ExtendedVector =
13745 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VectorVT, N1: ExtendedVector,
13746 N2: ExtendedValue, N3: Op.getOperand(i: 2));
13747 return DAG.getAnyExtOrTrunc(Op: ExtendedVector, DL, VT);
13748 }
13749
13750 // Check for non-constant or out of range lane.
13751 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
13752 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13753 return SDValue();
13754
13755 return Op;
13756}
13757
13758SDValue
13759AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13760 SelectionDAG &DAG) const {
13761 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13762 EVT VT = Op.getOperand(i: 0).getValueType();
13763
13764 if (VT.getScalarType() == MVT::i1) {
13765 // We can't directly extract from an SVE predicate; extend it first.
13766 // (This isn't the only possible lowering, but it's straightforward.)
13767 EVT VectorVT = getPromotedVTForPredicate(VT);
13768 SDLoc DL(Op);
13769 SDValue Extend =
13770 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VectorVT, Operand: Op.getOperand(i: 0));
13771 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13772 SDValue Extract = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtractTy,
13773 N1: Extend, N2: Op.getOperand(i: 1));
13774 return DAG.getAnyExtOrTrunc(Op: Extract, DL, VT: Op.getValueType());
13775 }
13776
13777 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
13778 return LowerFixedLengthExtractVectorElt(Op, DAG);
13779
13780 // Check for non-constant or out of range lane.
13781 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
13782 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13783 return SDValue();
13784
13785 // Insertion/extraction are legal for V128 types.
13786 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13787 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13788 VT == MVT::v8f16 || VT == MVT::v8bf16)
13789 return Op;
13790
13791 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13792 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13793 VT != MVT::v4bf16)
13794 return SDValue();
13795
13796  // For V64 types, we perform extraction by expanding the value
13797  // to a V128 type and performing the extraction on that.
13798 SDLoc DL(Op);
13799 SDValue WideVec = WidenVector(V64Reg: Op.getOperand(i: 0), DAG);
13800 EVT WideTy = WideVec.getValueType();
13801
13802 EVT ExtrTy = WideTy.getVectorElementType();
13803 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13804 ExtrTy = MVT::i32;
13805
13806 // For extractions, we just return the result directly.
13807 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtrTy, N1: WideVec,
13808 N2: Op.getOperand(i: 1));
13809}
13810
13811SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13812 SelectionDAG &DAG) const {
13813 assert(Op.getValueType().isFixedLengthVector() &&
13814 "Only cases that extract a fixed length vector are supported!");
13815
13816 EVT InVT = Op.getOperand(i: 0).getValueType();
13817 unsigned Idx = Op.getConstantOperandVal(i: 1);
13818 unsigned Size = Op.getValueSizeInBits();
13819
13820 // If we don't have legal types yet, do nothing
13821 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: InVT))
13822 return SDValue();
13823
13824 if (InVT.isScalableVector()) {
13825 // This will be matched by custom code during ISelDAGToDAG.
13826 if (Idx == 0 && isPackedVectorType(VT: InVT, DAG))
13827 return Op;
13828
13829 return SDValue();
13830 }
13831
13832 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13833 if (Idx == 0 && InVT.getSizeInBits() <= 128)
13834 return Op;
13835
13836 // If this is extracting the upper 64-bits of a 128-bit vector, we match
13837 // that directly.
13838 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
13839 InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
13840 return Op;
13841
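  // If the source is a fixed-length vector that is lowered via SVE, rotate the
  // scalable container with a VECTOR_SPLICE of the vector with itself so the
  // requested subvector starts at lane 0, then convert back to the
  // fixed-length result type.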
13842 if (useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable())) {
13843 SDLoc DL(Op);
13844
13845 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
13846 SDValue NewInVec =
13847 convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
13848
13849 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
13850 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
13851 return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Splice);
13852 }
13853
13854 return SDValue();
13855}
13856
13857SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13858 SelectionDAG &DAG) const {
13859 assert(Op.getValueType().isScalableVector() &&
13860 "Only expect to lower inserts into scalable vectors!");
13861
13862 EVT InVT = Op.getOperand(i: 1).getValueType();
13863 unsigned Idx = Op.getConstantOperandVal(i: 2);
13864
13865 SDValue Vec0 = Op.getOperand(i: 0);
13866 SDValue Vec1 = Op.getOperand(i: 1);
13867 SDLoc DL(Op);
13868 EVT VT = Op.getValueType();
13869
13870 if (InVT.isScalableVector()) {
13871 if (!isTypeLegal(VT))
13872 return SDValue();
13873
13874 // Break down insert_subvector into simpler parts.
13875 if (VT.getVectorElementType() == MVT::i1) {
13876 unsigned NumElts = VT.getVectorMinNumElements();
13877 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
13878
13879 SDValue Lo, Hi;
13880 Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0,
13881 N2: DAG.getVectorIdxConstant(Val: 0, DL));
13882 Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0,
13883 N2: DAG.getVectorIdxConstant(Val: NumElts / 2, DL));
13884 if (Idx < (NumElts / 2))
13885 Lo = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Lo, N2: Vec1,
13886 N3: DAG.getVectorIdxConstant(Val: Idx, DL));
13887 else
13888 Hi = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Hi, N2: Vec1,
13889 N3: DAG.getVectorIdxConstant(Val: Idx - (NumElts / 2), DL));
13890
13891 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Lo, N2: Hi);
13892 }
13893
13894 // Ensure the subvector is half the size of the main vector.
13895 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
13896 return SDValue();
13897
13898    // Here narrow and wide refer to the vector element types. After "casting",
13899    // both vectors must have the same bit length, so because the subvector
13900    // has fewer elements, those elements need to be bigger.
13901 EVT NarrowVT = getPackedSVEVectorVT(EC: VT.getVectorElementCount());
13902 EVT WideVT = getPackedSVEVectorVT(EC: InVT.getVectorElementCount());
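    // For example, when inserting an nxv2f32 subvector into nxv4f32, the
    // packed types are NarrowVT = nxv4i32 and WideVT = nxv2i64.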
13903
13904 // NOP cast operands to the largest legal vector of the same element count.
13905 if (VT.isFloatingPoint()) {
13906 Vec0 = getSVESafeBitCast(VT: NarrowVT, Op: Vec0, DAG);
13907 Vec1 = getSVESafeBitCast(VT: WideVT, Op: Vec1, DAG);
13908 } else {
13909 // Legal integer vectors are already their largest so Vec0 is fine as is.
13910 Vec1 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: WideVT, Operand: Vec1);
13911 }
13912
13913 // To replace the top/bottom half of vector V with vector SubV we widen the
13914 // preserved half of V, concatenate this to SubV (the order depending on the
13915 // half being replaced) and then narrow the result.
13916 SDValue Narrow;
13917 if (Idx == 0) {
13918 SDValue HiVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL, VT: WideVT, Operand: Vec0);
13919 Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: Vec1, N2: HiVec0);
13920 } else {
13921 assert(Idx == InVT.getVectorMinNumElements() &&
13922 "Invalid subvector index!");
13923 SDValue LoVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL, VT: WideVT, Operand: Vec0);
13924 Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: LoVec0, N2: Vec1);
13925 }
13926
13927 return getSVESafeBitCast(VT, Op: Narrow, DAG);
13928 }
13929
13930 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
13931 // This will be matched by custom code during ISelDAGToDAG.
13932 if (Vec0.isUndef())
13933 return Op;
13934
13935 std::optional<unsigned> PredPattern =
13936 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
13937 auto PredTy = VT.changeVectorElementType(MVT::i1);
13938 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
13939 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, V: Vec1);
13940 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT, N1: PTrue, N2: ScalableVec1, N3: Vec0);
13941 }
13942
13943 return SDValue();
13944}
13945
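// Returns true if Op is a splat of a power of two, optionally negated; e.g. a
// splat of 16 gives SplatVal = 16 with Negated = false, and a splat of -16
// gives SplatVal = 16 with Negated = true.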
13946static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
13947 if (Op.getOpcode() != AArch64ISD::DUP &&
13948 Op.getOpcode() != ISD::SPLAT_VECTOR &&
13949 Op.getOpcode() != ISD::BUILD_VECTOR)
13950 return false;
13951
13952 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
13953 !isAllConstantBuildVector(PotentialBVec: Op, ConstVal&: SplatVal))
13954 return false;
13955
13956 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
13957 !isa<ConstantSDNode>(Val: Op->getOperand(Num: 0)))
13958 return false;
13959
13960 SplatVal = Op->getConstantOperandVal(Num: 0);
13961 if (Op.getValueType().getVectorElementType() != MVT::i64)
13962 SplatVal = (int32_t)SplatVal;
13963
13964 Negated = false;
13965 if (isPowerOf2_64(Value: SplatVal))
13966 return true;
13967
13968 Negated = true;
13969 if (isPowerOf2_64(Value: -SplatVal)) {
13970 SplatVal = -SplatVal;
13971 return true;
13972 }
13973
13974 return false;
13975}
13976
13977SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
13978 EVT VT = Op.getValueType();
13979 SDLoc dl(Op);
13980
13981 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
13982 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
13983
13984 assert(VT.isScalableVector() && "Expected a scalable vector.");
13985
13986 bool Signed = Op.getOpcode() == ISD::SDIV;
13987 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
13988
13989 bool Negated;
13990 uint64_t SplatVal;
13991 if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) {
13992 SDValue Pg = getPredicateForScalableVector(DAG, DL&: dl, VT);
13993 SDValue Res =
13994 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
13995 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
13996 if (Negated)
13997 Res = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: DAG.getConstant(Val: 0, DL: dl, VT), N2: Res);
13998
13999 return Res;
14000 }
14001
14002 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14003 return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode);
14004
14005 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14006 // operations, and truncate the result.
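  // For example, an nxv8i16 division is unpacked into two nxv4i32 halves, each
  // half is divided as a predicated operation, and the results are repacked
  // with UZP1 (nxv16i8 goes through this widening twice).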
14007 EVT WidenedVT;
14008 if (VT == MVT::nxv16i8)
14009 WidenedVT = MVT::nxv8i16;
14010 else if (VT == MVT::nxv8i16)
14011 WidenedVT = MVT::nxv4i32;
14012 else
14013 llvm_unreachable("Unexpected Custom DIV operation");
14014
14015 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14016 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14017 SDValue Op0Lo = DAG.getNode(Opcode: UnpkLo, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 0));
14018 SDValue Op1Lo = DAG.getNode(Opcode: UnpkLo, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 1));
14019 SDValue Op0Hi = DAG.getNode(Opcode: UnpkHi, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 0));
14020 SDValue Op1Hi = DAG.getNode(Opcode: UnpkHi, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 1));
14021 SDValue ResultLo = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WidenedVT, N1: Op0Lo, N2: Op1Lo);
14022 SDValue ResultHi = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WidenedVT, N1: Op0Hi, N2: Op1Hi);
14023 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT, N1: ResultLo, N2: ResultHi);
14024}
14025
14026bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14027 // Currently no fixed length shuffles that require SVE are legal.
14028 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
14029 return false;
14030
14031 if (VT.getVectorNumElements() == 4 &&
14032 (VT.is128BitVector() || VT.is64BitVector())) {
14033 unsigned Cost = getPerfectShuffleCost(M);
14034 if (Cost <= 1)
14035 return true;
14036 }
14037
14038 bool DummyBool;
14039 int DummyInt;
14040 unsigned DummyUnsigned;
14041
14042 return (ShuffleVectorSDNode::isSplatMask(Mask: &M[0], VT) || isREVMask(M, VT, BlockSize: 64) ||
14043 isREVMask(M, VT, BlockSize: 32) || isREVMask(M, VT, BlockSize: 16) ||
14044 isEXTMask(M, VT, ReverseEXT&: DummyBool, Imm&: DummyUnsigned) ||
14045 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
14046 isTRNMask(M, VT, WhichResult&: DummyUnsigned) || isUZPMask(M, VT, WhichResultOut&: DummyUnsigned) ||
14047 isZIPMask(M, VT, WhichResultOut&: DummyUnsigned) ||
14048 isTRN_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
14049 isUZP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
14050 isZIP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
14051 isINSMask(M, NumInputElements: VT.getVectorNumElements(), DstIsLeft&: DummyBool, Anomaly&: DummyInt) ||
14052 isConcatMask(Mask: M, VT, SplitLHS: VT.getSizeInBits() == 128));
14053}
14054
14055bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14056 EVT VT) const {
14057  // Just delegate to the generic legality check; clear masks aren't special.
14058 return isShuffleMaskLegal(M, VT);
14059}
14060
14061/// getVShiftImm - Check if this is a valid build_vector for the immediate
14062/// operand of a vector shift operation, where all the elements of the
14063/// build_vector must have the same constant integer value.
14064static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14065 // Ignore bit_converts.
14066 while (Op.getOpcode() == ISD::BITCAST)
14067 Op = Op.getOperand(i: 0);
14068 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getNode());
14069 APInt SplatBits, SplatUndef;
14070 unsigned SplatBitSize;
14071 bool HasAnyUndefs;
14072 if (!BVN || !BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize,
14073 HasAnyUndefs, MinSplatBits: ElementBits) ||
14074 SplatBitSize > ElementBits)
14075 return false;
14076 Cnt = SplatBits.getSExtValue();
14077 return true;
14078}
14079
14080/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14081/// operand of a vector shift left operation. That value must be in the range:
14082/// 0 <= Value < ElementBits for a left shift; or
14083/// 0 <= Value <= ElementBits for a long left shift.
14084static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14085 assert(VT.isVector() && "vector shift count is not a vector type");
14086 int64_t ElementBits = VT.getScalarSizeInBits();
14087 if (!getVShiftImm(Op, ElementBits, Cnt))
14088 return false;
14089 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14090}
14091
14092/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14093/// operand of a vector shift right operation. The value must be in the range:
14094/// 1 <= Value <= ElementBits for a right shift; or
/// 1 <= Value <= ElementBits / 2 for a narrow right shift.
14095static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14096 assert(VT.isVector() && "vector shift count is not a vector type");
14097 int64_t ElementBits = VT.getScalarSizeInBits();
14098 if (!getVShiftImm(Op, ElementBits, Cnt))
14099 return false;
14100 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14101}
14102
14103SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14104 SelectionDAG &DAG) const {
14105 EVT VT = Op.getValueType();
14106
14107 if (VT.getScalarType() == MVT::i1) {
14108 // Lower i1 truncate to `(x & 1) != 0`.
14109 SDLoc dl(Op);
14110 EVT OpVT = Op.getOperand(i: 0).getValueType();
14111 SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: OpVT);
14112 SDValue One = DAG.getConstant(Val: 1, DL: dl, VT: OpVT);
14113 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: OpVT, N1: Op.getOperand(i: 0), N2: One);
14114 return DAG.getSetCC(DL: dl, VT, LHS: And, RHS: Zero, Cond: ISD::SETNE);
14115 }
14116
14117 if (!VT.isVector() || VT.isScalableVector())
14118 return SDValue();
14119
14120 if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(),
14121 OverrideNEON: !Subtarget->isNeonAvailable()))
14122 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14123
14124 return SDValue();
14125}
14126
14127// Check whether we can lower this SRL to a rounding shift instruction. ResVT is
14128// possibly a truncated type; it tells how many bits of the value are to be
14129// used.
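// A minimal sketch of the pattern being matched (X and the constants being
// splats of scalable vectors):
//   srl (add X, splat(1 << (ShiftValue - 1))), splat(ShiftValue)
// where the add supplies the rounding constant for the shift.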
14130static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14131 SelectionDAG &DAG,
14132 unsigned &ShiftValue,
14133 SDValue &RShOperand) {
14134 if (Shift->getOpcode() != ISD::SRL)
14135 return false;
14136
14137 EVT VT = Shift.getValueType();
14138 assert(VT.isScalableVT());
14139
14140 auto ShiftOp1 =
14141 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Shift->getOperand(Num: 1)));
14142 if (!ShiftOp1)
14143 return false;
14144
14145 ShiftValue = ShiftOp1->getZExtValue();
14146 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14147 return false;
14148
14149 SDValue Add = Shift->getOperand(Num: 0);
14150 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14151 return false;
14152
14153 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
14154 "ResVT must be truncated or same type as the shift.");
14155 // Check if an overflow can lead to incorrect results.
14156 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14157 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14158 return false;
14159
14160 auto AddOp1 =
14161 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Add->getOperand(Num: 1)));
14162 if (!AddOp1)
14163 return false;
14164 uint64_t AddValue = AddOp1->getZExtValue();
14165 if (AddValue != 1ULL << (ShiftValue - 1))
14166 return false;
14167
14168 RShOperand = Add->getOperand(Num: 0);
14169 return true;
14170}
14171
14172SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14173 SelectionDAG &DAG) const {
14174 EVT VT = Op.getValueType();
14175 SDLoc DL(Op);
14176 int64_t Cnt;
14177
14178 if (!Op.getOperand(i: 1).getValueType().isVector())
14179 return Op;
14180 unsigned EltSize = VT.getScalarSizeInBits();
14181
14182 switch (Op.getOpcode()) {
14183 case ISD::SHL:
14184 if (VT.isScalableVector() ||
14185 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
14186 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SHL_PRED);
14187
14188 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14189 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14190 DAG.getConstant(Cnt, DL, MVT::i32));
14191 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14192 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14193 MVT::i32),
14194 Op.getOperand(0), Op.getOperand(1));
14195 case ISD::SRA:
14196 case ISD::SRL:
14197 if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
14198 SDValue RShOperand;
14199 unsigned ShiftValue;
14200 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14201 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14202 getPredicateForVector(DAG, DL, VT), RShOperand,
14203 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14204 }
14205
14206 if (VT.isScalableVector() ||
14207 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
14208 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14209 : AArch64ISD::SRL_PRED;
14210 return LowerToPredicatedOp(Op, DAG, NewOp: Opc);
14211 }
14212
14213 // Right shift immediate
14214 if (isVShiftRImm(Op: Op.getOperand(i: 1), VT, isNarrow: false, Cnt) && Cnt < EltSize) {
14215 unsigned Opc =
14216 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14217 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14218 DAG.getConstant(Cnt, DL, MVT::i32));
14219 }
14220
14221    // Right shift register. Note that there is no shift-right-register
14222    // instruction; the shift-left-register instruction takes a signed
14223    // value, where negative amounts specify a right shift.
14224 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14225 : Intrinsic::aarch64_neon_ushl;
14226 // negate the shift amount
14227 SDValue NegShift = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
14228 N2: Op.getOperand(i: 1));
14229 SDValue NegShiftLeft =
14230 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14231 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14232 NegShift);
14233 return NegShiftLeft;
14234 }
14235
14236 llvm_unreachable("unexpected shift opcode");
14237}
14238
14239static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14240 AArch64CC::CondCode CC, bool NoNans, EVT VT,
14241 const SDLoc &dl, SelectionDAG &DAG) {
14242 EVT SrcVT = LHS.getValueType();
14243 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14244 "function only supposed to emit natural comparisons");
14245
14246 APInt SplatValue;
14247 APInt SplatUndef;
14248 unsigned SplatBitSize = 0;
14249 bool HasAnyUndefs;
14250
14251 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode());
14252 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14253 SplatBitSize, HasAnyUndefs);
14254
14255 bool IsZero = IsCnst && SplatValue == 0;
14256 bool IsOne =
14257 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14258 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14259
14260 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14261 switch (CC) {
14262 default:
14263 return SDValue();
14264 case AArch64CC::NE: {
14265 SDValue Fcmeq;
14266 if (IsZero)
14267 Fcmeq = DAG.getNode(Opcode: AArch64ISD::FCMEQz, DL: dl, VT, Operand: LHS);
14268 else
14269 Fcmeq = DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL: dl, VT, N1: LHS, N2: RHS);
14270 return DAG.getNOT(DL: dl, Val: Fcmeq, VT);
14271 }
14272 case AArch64CC::EQ:
14273 if (IsZero)
14274 return DAG.getNode(Opcode: AArch64ISD::FCMEQz, DL: dl, VT, Operand: LHS);
14275 return DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL: dl, VT, N1: LHS, N2: RHS);
14276 case AArch64CC::GE:
14277 if (IsZero)
14278 return DAG.getNode(Opcode: AArch64ISD::FCMGEz, DL: dl, VT, Operand: LHS);
14279 return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL: dl, VT, N1: LHS, N2: RHS);
14280 case AArch64CC::GT:
14281 if (IsZero)
14282 return DAG.getNode(Opcode: AArch64ISD::FCMGTz, DL: dl, VT, Operand: LHS);
14283 return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL: dl, VT, N1: LHS, N2: RHS);
14284 case AArch64CC::LE:
14285 if (!NoNans)
14286 return SDValue();
14287      // If we ignore NaNs then we can use the LS implementation.
14288 [[fallthrough]];
14289 case AArch64CC::LS:
14290 if (IsZero)
14291 return DAG.getNode(Opcode: AArch64ISD::FCMLEz, DL: dl, VT, Operand: LHS);
14292 return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL: dl, VT, N1: RHS, N2: LHS);
14293 case AArch64CC::LT:
14294 if (!NoNans)
14295 return SDValue();
14296      // If we ignore NaNs then we can use the MI implementation.
14297 [[fallthrough]];
14298 case AArch64CC::MI:
14299 if (IsZero)
14300 return DAG.getNode(Opcode: AArch64ISD::FCMLTz, DL: dl, VT, Operand: LHS);
14301 return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL: dl, VT, N1: RHS, N2: LHS);
14302 }
14303 }
14304
14305 switch (CC) {
14306 default:
14307 return SDValue();
14308 case AArch64CC::NE: {
14309 SDValue Cmeq;
14310 if (IsZero)
14311 Cmeq = DAG.getNode(Opcode: AArch64ISD::CMEQz, DL: dl, VT, Operand: LHS);
14312 else
14313 Cmeq = DAG.getNode(Opcode: AArch64ISD::CMEQ, DL: dl, VT, N1: LHS, N2: RHS);
14314 return DAG.getNOT(DL: dl, Val: Cmeq, VT);
14315 }
14316 case AArch64CC::EQ:
14317 if (IsZero)
14318 return DAG.getNode(Opcode: AArch64ISD::CMEQz, DL: dl, VT, Operand: LHS);
14319 return DAG.getNode(Opcode: AArch64ISD::CMEQ, DL: dl, VT, N1: LHS, N2: RHS);
14320 case AArch64CC::GE:
14321 if (IsZero)
14322 return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: dl, VT, Operand: LHS);
14323 return DAG.getNode(Opcode: AArch64ISD::CMGE, DL: dl, VT, N1: LHS, N2: RHS);
14324 case AArch64CC::GT:
14325 if (IsZero)
14326 return DAG.getNode(Opcode: AArch64ISD::CMGTz, DL: dl, VT, Operand: LHS);
14327 if (IsMinusOne)
14328 return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: dl, VT, N1: LHS, N2: RHS);
14329 return DAG.getNode(Opcode: AArch64ISD::CMGT, DL: dl, VT, N1: LHS, N2: RHS);
14330 case AArch64CC::LE:
14331 if (IsZero)
14332 return DAG.getNode(Opcode: AArch64ISD::CMLEz, DL: dl, VT, Operand: LHS);
14333 return DAG.getNode(Opcode: AArch64ISD::CMGE, DL: dl, VT, N1: RHS, N2: LHS);
14334 case AArch64CC::LS:
14335 return DAG.getNode(Opcode: AArch64ISD::CMHS, DL: dl, VT, N1: RHS, N2: LHS);
14336 case AArch64CC::LO:
14337 return DAG.getNode(Opcode: AArch64ISD::CMHI, DL: dl, VT, N1: RHS, N2: LHS);
14338 case AArch64CC::LT:
14339 if (IsZero)
14340 return DAG.getNode(Opcode: AArch64ISD::CMLTz, DL: dl, VT, Operand: LHS);
14341 if (IsOne)
14342 return DAG.getNode(Opcode: AArch64ISD::CMLEz, DL: dl, VT, Operand: LHS);
14343 return DAG.getNode(Opcode: AArch64ISD::CMGT, DL: dl, VT, N1: RHS, N2: LHS);
14344 case AArch64CC::HI:
14345 return DAG.getNode(Opcode: AArch64ISD::CMHI, DL: dl, VT, N1: LHS, N2: RHS);
14346 case AArch64CC::HS:
14347 return DAG.getNode(Opcode: AArch64ISD::CMHS, DL: dl, VT, N1: LHS, N2: RHS);
14348 }
14349}
14350
14351SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14352 SelectionDAG &DAG) const {
14353 if (Op.getValueType().isScalableVector())
14354 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SETCC_MERGE_ZERO);
14355
14356 if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(),
14357 OverrideNEON: !Subtarget->isNeonAvailable()))
14358 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14359
14360 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
14361 SDValue LHS = Op.getOperand(i: 0);
14362 SDValue RHS = Op.getOperand(i: 1);
14363 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14364 SDLoc dl(Op);
14365
14366 if (LHS.getValueType().getVectorElementType().isInteger()) {
14367 assert(LHS.getValueType() == RHS.getValueType());
14368 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14369 SDValue Cmp =
14370 EmitVectorComparison(LHS, RHS, CC: AArch64CC, NoNans: false, VT: CmpVT, dl, DAG);
14371 return DAG.getSExtOrTrunc(Op: Cmp, DL: dl, VT: Op.getValueType());
14372 }
14373
14374 // Lower isnan(x) | isnan(never-nan) to x != x.
14375 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14376 if (CC == ISD::SETUO || CC == ISD::SETO) {
14377 bool OneNaN = false;
14378 if (LHS == RHS) {
14379 OneNaN = true;
14380 } else if (DAG.isKnownNeverNaN(Op: RHS)) {
14381 OneNaN = true;
14382 RHS = LHS;
14383 } else if (DAG.isKnownNeverNaN(Op: LHS)) {
14384 OneNaN = true;
14385 LHS = RHS;
14386 }
14387 if (OneNaN) {
14388 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14389 }
14390 }
14391
14392 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14393
14394  // Make v4f16 (only) fcmp operations utilise vector instructions;
14395  // v8f16 support will be a little more complicated.
14396 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14397 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14398 if (LHS.getValueType().getVectorNumElements() == 4) {
14399 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14400 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14401 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14402 DAG.ReplaceAllUsesWith(From: Op, To: NewSetcc);
14403 CmpVT = MVT::v4i32;
14404 } else
14405 return SDValue();
14406 }
14407
14408 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14409 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14410 LHS.getValueType().getVectorElementType() != MVT::f128);
14411
14412 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14413 // clean. Some of them require two branches to implement.
14414 AArch64CC::CondCode CC1, CC2;
14415 bool ShouldInvert;
14416 changeVectorFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2, Invert&: ShouldInvert);
14417
14418 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14419 SDValue Cmp =
14420 EmitVectorComparison(LHS, RHS, CC: CC1, NoNans: NoNaNs, VT: CmpVT, dl, DAG);
14421 if (!Cmp.getNode())
14422 return SDValue();
14423
14424 if (CC2 != AArch64CC::AL) {
14425 SDValue Cmp2 =
14426 EmitVectorComparison(LHS, RHS, CC: CC2, NoNans: NoNaNs, VT: CmpVT, dl, DAG);
14427 if (!Cmp2.getNode())
14428 return SDValue();
14429
14430 Cmp = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: CmpVT, N1: Cmp, N2: Cmp2);
14431 }
14432
14433 Cmp = DAG.getSExtOrTrunc(Op: Cmp, DL: dl, VT: Op.getValueType());
14434
14435 if (ShouldInvert)
14436 Cmp = DAG.getNOT(DL: dl, Val: Cmp, VT: Cmp.getValueType());
14437
14438 return Cmp;
14439}
14440
14441static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14442 SelectionDAG &DAG) {
14443 SDValue VecOp = ScalarOp.getOperand(i: 0);
14444 auto Rdx = DAG.getNode(Opcode: Op, DL, VT: VecOp.getSimpleValueType(), Operand: VecOp);
14445 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14446 DAG.getConstant(0, DL, MVT::i64));
14447}
14448
14449static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14450 SDLoc DL, SelectionDAG &DAG) {
14451 unsigned ScalarOpcode;
14452 switch (Opcode) {
14453 case ISD::VECREDUCE_AND:
14454 ScalarOpcode = ISD::AND;
14455 break;
14456 case ISD::VECREDUCE_OR:
14457 ScalarOpcode = ISD::OR;
14458 break;
14459 case ISD::VECREDUCE_XOR:
14460 ScalarOpcode = ISD::XOR;
14461 break;
14462 default:
14463 llvm_unreachable("Expected bitwise vector reduction");
14464 return SDValue();
14465 }
14466
14467 EVT VecVT = Vec.getValueType();
14468 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14469 "Expected power-of-2 length vector");
14470
14471 EVT ElemVT = VecVT.getVectorElementType();
14472
14473 SDValue Result;
14474 unsigned NumElems = VecVT.getVectorNumElements();
14475
14476 // Special case for boolean reductions
14477 if (ElemVT == MVT::i1) {
14478 // Split large vectors into smaller ones
14479 if (NumElems > 16) {
14480 SDValue Lo, Hi;
14481 std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL);
14482 EVT HalfVT = Lo.getValueType();
14483 SDValue HalfVec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: HalfVT, N1: Lo, N2: Hi);
14484 return getVectorBitwiseReduce(Opcode, Vec: HalfVec, VT, DL, DAG);
14485 }
14486
14487 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14488 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14489 // this element size leads to the best codegen, since e.g. setcc results
14490 // might need to be truncated otherwise.
14491 EVT ExtendedVT = MVT::getIntegerVT(BitWidth: std::max(a: 64u / NumElems, b: 8u));
14492
14493 // any_ext doesn't work with umin/umax, so only use it for uadd.
14494 unsigned ExtendOp =
14495 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14496 SDValue Extended = DAG.getNode(
14497 Opcode: ExtendOp, DL, VT: VecVT.changeVectorElementType(EltVT: ExtendedVT), Operand: Vec);
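    // With true sign-extended to all-ones, AND maps to UMIN, OR to UMAX, and
    // XOR to ADD (only the low bit of the sum matters after truncation).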
14498 switch (ScalarOpcode) {
14499 case ISD::AND:
14500 Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMIN, DL, VT: ExtendedVT, Operand: Extended);
14501 break;
14502 case ISD::OR:
14503 Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMAX, DL, VT: ExtendedVT, Operand: Extended);
14504 break;
14505 case ISD::XOR:
14506 Result = DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ExtendedVT, Operand: Extended);
14507 break;
14508 default:
14509 llvm_unreachable("Unexpected Opcode");
14510 }
14511
14512 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
14513 } else {
14514 // Iteratively split the vector in half and combine using the bitwise
14515 // operation until it fits in a 64 bit register.
14516 while (VecVT.getSizeInBits() > 64) {
14517 SDValue Lo, Hi;
14518 std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL);
14519 VecVT = Lo.getValueType();
14520 NumElems = VecVT.getVectorNumElements();
14521 Vec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: VecVT, N1: Lo, N2: Hi);
14522 }
14523
14524 EVT ScalarVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VecVT.getSizeInBits());
14525
14526 // Do the remaining work on a scalar since it allows the code generator to
14527 // combine the shift and bitwise operation into one instruction and since
14528 // integer instructions can have higher throughput than vector instructions.
14529 SDValue Scalar = DAG.getBitcast(VT: ScalarVT, V: Vec);
14530
14531 // Iteratively combine the lower and upper halves of the scalar using the
14532 // bitwise operation, halving the relevant region of the scalar in each
14533 // iteration, until the relevant region is just one element of the original
14534 // vector.
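    // For example, a v8i8 reduction becomes an i64 bitcast combined with
    // itself shifted right by 32, then 16, then 8 bits.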
14535 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
14536 SDValue ShiftAmount =
14537 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
14538 SDValue Shifted =
14539 DAG.getNode(Opcode: ISD::SRL, DL, VT: ScalarVT, N1: Scalar, N2: ShiftAmount);
14540 Scalar = DAG.getNode(Opcode: ScalarOpcode, DL, VT: ScalarVT, N1: Scalar, N2: Shifted);
14541 }
14542
14543 Result = DAG.getAnyExtOrTrunc(Op: Scalar, DL, VT: ElemVT);
14544 }
14545
14546 return DAG.getAnyExtOrTrunc(Op: Result, DL, VT);
14547}
14548
14549SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14550 SelectionDAG &DAG) const {
14551 SDValue Src = Op.getOperand(i: 0);
14552
14553 // Try to lower fixed length reductions to SVE.
14554 EVT SrcVT = Src.getValueType();
14555 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14556 Op.getOpcode() == ISD::VECREDUCE_AND ||
14557 Op.getOpcode() == ISD::VECREDUCE_OR ||
14558 Op.getOpcode() == ISD::VECREDUCE_XOR ||
14559 Op.getOpcode() == ISD::VECREDUCE_FADD ||
14560 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14561 SrcVT.getVectorElementType() == MVT::i64);
14562 if (SrcVT.isScalableVector() ||
14563 useSVEForFixedLengthVectorVT(
14564 VT: SrcVT, OverrideNEON: OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14565
14566 if (SrcVT.getVectorElementType() == MVT::i1)
14567 return LowerPredReductionToSVE(ScalarOp: Op, DAG);
14568
14569 switch (Op.getOpcode()) {
14570 case ISD::VECREDUCE_ADD:
14571 return LowerReductionToSVE(Opcode: AArch64ISD::UADDV_PRED, ScalarOp: Op, DAG);
14572 case ISD::VECREDUCE_AND:
14573 return LowerReductionToSVE(Opcode: AArch64ISD::ANDV_PRED, ScalarOp: Op, DAG);
14574 case ISD::VECREDUCE_OR:
14575 return LowerReductionToSVE(Opcode: AArch64ISD::ORV_PRED, ScalarOp: Op, DAG);
14576 case ISD::VECREDUCE_SMAX:
14577 return LowerReductionToSVE(Opcode: AArch64ISD::SMAXV_PRED, ScalarOp: Op, DAG);
14578 case ISD::VECREDUCE_SMIN:
14579 return LowerReductionToSVE(Opcode: AArch64ISD::SMINV_PRED, ScalarOp: Op, DAG);
14580 case ISD::VECREDUCE_UMAX:
14581 return LowerReductionToSVE(Opcode: AArch64ISD::UMAXV_PRED, ScalarOp: Op, DAG);
14582 case ISD::VECREDUCE_UMIN:
14583 return LowerReductionToSVE(Opcode: AArch64ISD::UMINV_PRED, ScalarOp: Op, DAG);
14584 case ISD::VECREDUCE_XOR:
14585 return LowerReductionToSVE(Opcode: AArch64ISD::EORV_PRED, ScalarOp: Op, DAG);
14586 case ISD::VECREDUCE_FADD:
14587 return LowerReductionToSVE(Opcode: AArch64ISD::FADDV_PRED, ScalarOp: Op, DAG);
14588 case ISD::VECREDUCE_FMAX:
14589 return LowerReductionToSVE(Opcode: AArch64ISD::FMAXNMV_PRED, ScalarOp: Op, DAG);
14590 case ISD::VECREDUCE_FMIN:
14591 return LowerReductionToSVE(Opcode: AArch64ISD::FMINNMV_PRED, ScalarOp: Op, DAG);
14592 case ISD::VECREDUCE_FMAXIMUM:
14593 return LowerReductionToSVE(Opcode: AArch64ISD::FMAXV_PRED, ScalarOp: Op, DAG);
14594 case ISD::VECREDUCE_FMINIMUM:
14595 return LowerReductionToSVE(Opcode: AArch64ISD::FMINV_PRED, ScalarOp: Op, DAG);
14596 default:
14597 llvm_unreachable("Unhandled fixed length reduction");
14598 }
14599 }
14600
14601 // Lower NEON reductions.
14602 SDLoc dl(Op);
14603 switch (Op.getOpcode()) {
14604 case ISD::VECREDUCE_AND:
14605 case ISD::VECREDUCE_OR:
14606 case ISD::VECREDUCE_XOR:
14607 return getVectorBitwiseReduce(Opcode: Op.getOpcode(), Vec: Op.getOperand(i: 0),
14608 VT: Op.getValueType(), DL: dl, DAG);
14609 case ISD::VECREDUCE_ADD:
14610 return getReductionSDNode(Op: AArch64ISD::UADDV, DL: dl, ScalarOp: Op, DAG);
14611 case ISD::VECREDUCE_SMAX:
14612 return getReductionSDNode(Op: AArch64ISD::SMAXV, DL: dl, ScalarOp: Op, DAG);
14613 case ISD::VECREDUCE_SMIN:
14614 return getReductionSDNode(Op: AArch64ISD::SMINV, DL: dl, ScalarOp: Op, DAG);
14615 case ISD::VECREDUCE_UMAX:
14616 return getReductionSDNode(Op: AArch64ISD::UMAXV, DL: dl, ScalarOp: Op, DAG);
14617 case ISD::VECREDUCE_UMIN:
14618 return getReductionSDNode(Op: AArch64ISD::UMINV, DL: dl, ScalarOp: Op, DAG);
14619 default:
14620 llvm_unreachable("Unhandled reduction");
14621 }
14622}
14623
14624SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14625 SelectionDAG &DAG) const {
14626 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14627 // No point replacing if we don't have the relevant instruction/libcall anyway
14628 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14629 return SDValue();
14630
14631 // LSE has an atomic load-clear instruction, but not a load-and.
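  // Lower `atomicrmw and x, v` as an atomic load-clear of ~v; the XOR with an
  // all-ones constant below computes the bitwise NOT of the RHS.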
14632 SDLoc dl(Op);
14633 MVT VT = Op.getSimpleValueType();
14634 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14635 SDValue RHS = Op.getOperand(i: 2);
14636 AtomicSDNode *AN = cast<AtomicSDNode>(Val: Op.getNode());
14637 RHS = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: DAG.getConstant(Val: -1ULL, DL: dl, VT), N2: RHS);
14638 return DAG.getAtomic(Opcode: ISD::ATOMIC_LOAD_CLR, dl, MemVT: AN->getMemoryVT(),
14639 Chain: Op.getOperand(i: 0), Ptr: Op.getOperand(i: 1), Val: RHS,
14640 MMO: AN->getMemOperand());
14641}
14642
14643SDValue
14644AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14645 SelectionDAG &DAG) const {
14646
14647 SDLoc dl(Op);
14648 // Get the inputs.
14649 SDNode *Node = Op.getNode();
14650 SDValue Chain = Op.getOperand(i: 0);
14651 SDValue Size = Op.getOperand(i: 1);
14652 MaybeAlign Align =
14653 cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue();
14654 EVT VT = Node->getValueType(ResNo: 0);
14655
14656 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
14657 Kind: "no-stack-arg-probe")) {
14658 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14659 Chain = SP.getValue(R: 1);
14660 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14661 if (Align)
14662 SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0),
14663 N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT));
14664 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14665 SDValue Ops[2] = {SP, Chain};
14666 return DAG.getMergeValues(Ops, dl);
14667 }
14668
14669 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);
14670
14671 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
14672 SDValue Callee = DAG.getTargetExternalSymbol(Sym: Subtarget->getChkStkName(),
14673 VT: PtrVT, TargetFlags: 0);
14674
14675 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14676 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14677 if (Subtarget->hasCustomCallingConv())
14678 TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask);
14679
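  // The Windows stack-probe helper expects the allocation size in X15 in units
  // of 16 bytes, hence the SRL by 4 before the call and the SHL by 4 after it.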
14680 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14681 DAG.getConstant(4, dl, MVT::i64));
14682 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14683 Chain =
14684 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14685 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14686 DAG.getRegisterMask(Mask), Chain.getValue(1));
14687 // To match the actual intent better, we should read the output from X15 here
14688 // again (instead of potentially spilling it to the stack), but rereading Size
14689 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14690 // here.
14691
14692 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14693 DAG.getConstant(4, dl, MVT::i64));
14694
14695 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14696 Chain = SP.getValue(R: 1);
14697 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14698 if (Align)
14699 SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0),
14700 N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT));
14701 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14702
14703 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);
14704
14705 SDValue Ops[2] = {SP, Chain};
14706 return DAG.getMergeValues(Ops, dl);
14707}
14708
14709SDValue
14710AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14711 SelectionDAG &DAG) const {
14712 // Get the inputs.
14713 SDNode *Node = Op.getNode();
14714 SDValue Chain = Op.getOperand(i: 0);
14715 SDValue Size = Op.getOperand(i: 1);
14716
14717 MaybeAlign Align =
14718 cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue();
14719 SDLoc dl(Op);
14720 EVT VT = Node->getValueType(ResNo: 0);
14721
14722 // Construct the new SP value in a GPR.
14723 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14724 Chain = SP.getValue(R: 1);
14725 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14726 if (Align)
14727 SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0),
14728 N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT));
14729
14730 // Set the real SP to the new value with a probing loop.
14731 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14732 SDValue Ops[2] = {SP, Chain};
14733 return DAG.getMergeValues(Ops, dl);
14734}
14735
14736SDValue
14737AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14738 SelectionDAG &DAG) const {
14739 MachineFunction &MF = DAG.getMachineFunction();
14740
14741 if (Subtarget->isTargetWindows())
14742 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14743 else if (hasInlineStackProbe(MF))
14744 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14745 else
14746 return SDValue();
14747}
14748
14749// When x and y are extended, lower:
14750// avgfloor(x, y) -> (x + y) >> 1
14751// avgceil(x, y) -> (x + y + 1) >> 1
14752
14753// Otherwise, lower to:
14754// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
14755// avgceil(x, y) -> (x >> 1) + (y >> 1) + ((x | y) & 1)
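// With SVE2 the operation is lowered directly to the predicated opcode passed
// in via NewOp; the expansion above is only used otherwise.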
14756SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14757 unsigned NewOp) const {
14758 if (Subtarget->hasSVE2())
14759 return LowerToPredicatedOp(Op, DAG, NewOp);
14760
14761 SDLoc dl(Op);
14762 SDValue OpA = Op->getOperand(Num: 0);
14763 SDValue OpB = Op->getOperand(Num: 1);
14764 EVT VT = Op.getValueType();
14765 bool IsCeil =
14766 (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14767 bool IsSigned =
14768 (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14769 unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14770
14771 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14772
14773 auto IsZeroExtended = [&DAG](SDValue &Node) {
14774 KnownBits Known = DAG.computeKnownBits(Op: Node, Depth: 0);
14775 return Known.Zero.isSignBitSet();
14776 };
14777
14778 auto IsSignExtended = [&DAG](SDValue &Node) {
14779 return (DAG.ComputeNumSignBits(Op: Node, Depth: 0) > 1);
14780 };
14781
14782 SDValue ConstantOne = DAG.getConstant(Val: 1, DL: dl, VT);
14783 if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14784 (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14785 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: OpA, N2: OpB);
14786 if (IsCeil)
14787 Add = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: Add, N2: ConstantOne);
14788 return DAG.getNode(Opcode: ShiftOpc, DL: dl, VT, N1: Add, N2: ConstantOne);
14789 }
14790
14791 SDValue ShiftOpA = DAG.getNode(Opcode: ShiftOpc, DL: dl, VT, N1: OpA, N2: ConstantOne);
14792 SDValue ShiftOpB = DAG.getNode(Opcode: ShiftOpc, DL: dl, VT, N1: OpB, N2: ConstantOne);
14793
14794 SDValue tmp = DAG.getNode(Opcode: IsCeil ? ISD::OR : ISD::AND, DL: dl, VT, N1: OpA, N2: OpB);
14795 tmp = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: tmp, N2: ConstantOne);
14796 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: ShiftOpA, N2: ShiftOpB);
14797 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: Add, N2: tmp);
14798}
14799
14800SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14801 SelectionDAG &DAG) const {
14802 EVT VT = Op.getValueType();
14803 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14804
14805 SDLoc DL(Op);
14806 APInt MulImm = Op.getConstantOperandAPInt(i: 0);
14807 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14808 VT);
14809}
14810
14811/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14812template <unsigned NumVecs>
14813static bool
14814setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
14815 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
14816 Info.opc = ISD::INTRINSIC_VOID;
14817 // Retrieve EC from first vector argument.
14818 const EVT VT = TLI.getMemValueType(DL, Ty: CI.getArgOperand(i: 0)->getType());
14819 ElementCount EC = VT.getVectorElementCount();
14820#ifndef NDEBUG
14821 // Check the assumption that all input vectors are the same type.
14822 for (unsigned I = 0; I < NumVecs; ++I)
14823 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14824 "Invalid type.");
14825#endif
14826 // memVT is `NumVecs * VT`.
14827 Info.memVT = EVT::getVectorVT(Context&: CI.getType()->getContext(), VT: VT.getScalarType(),
14828 EC: EC * NumVecs);
14829 Info.ptrVal = CI.getArgOperand(i: CI.arg_size() - 1);
14830 Info.offset = 0;
14831 Info.align.reset();
14832 Info.flags = MachineMemOperand::MOStore;
14833 return true;
14834}
14835
14836/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14837/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
14838/// specified in the intrinsic calls.
14839bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14840 const CallInst &I,
14841 MachineFunction &MF,
14842 unsigned Intrinsic) const {
14843 auto &DL = I.getModule()->getDataLayout();
14844 switch (Intrinsic) {
14845 case Intrinsic::aarch64_sve_st2:
14846 return setInfoSVEStN<2>(TLI: *this, DL, Info, CI: I);
14847 case Intrinsic::aarch64_sve_st3:
14848 return setInfoSVEStN<3>(TLI: *this, DL, Info, CI: I);
14849 case Intrinsic::aarch64_sve_st4:
14850 return setInfoSVEStN<4>(TLI: *this, DL, Info, CI: I);
14851 case Intrinsic::aarch64_neon_ld2:
14852 case Intrinsic::aarch64_neon_ld3:
14853 case Intrinsic::aarch64_neon_ld4:
14854 case Intrinsic::aarch64_neon_ld1x2:
14855 case Intrinsic::aarch64_neon_ld1x3:
14856 case Intrinsic::aarch64_neon_ld1x4: {
14857 Info.opc = ISD::INTRINSIC_W_CHAIN;
14858 uint64_t NumElts = DL.getTypeSizeInBits(Ty: I.getType()) / 64;
14859 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14860 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
14861 Info.offset = 0;
14862 Info.align.reset();
14863 // volatile loads with NEON intrinsics not supported
14864 Info.flags = MachineMemOperand::MOLoad;
14865 return true;
14866 }
14867 case Intrinsic::aarch64_neon_ld2lane:
14868 case Intrinsic::aarch64_neon_ld3lane:
14869 case Intrinsic::aarch64_neon_ld4lane:
14870 case Intrinsic::aarch64_neon_ld2r:
14871 case Intrinsic::aarch64_neon_ld3r:
14872 case Intrinsic::aarch64_neon_ld4r: {
14873 Info.opc = ISD::INTRINSIC_W_CHAIN;
14874    // The ldN intrinsics return a struct of vectors that all have the same type.
14875 Type *RetTy = I.getType();
14876 auto *StructTy = cast<StructType>(Val: RetTy);
14877 unsigned NumElts = StructTy->getNumElements();
14878 Type *VecTy = StructTy->getElementType(N: 0);
14879 MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType();
14880 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts);
14881 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
14882 Info.offset = 0;
14883 Info.align.reset();
14884 // volatile loads with NEON intrinsics not supported
14885 Info.flags = MachineMemOperand::MOLoad;
14886 return true;
14887 }
14888 case Intrinsic::aarch64_neon_st2:
14889 case Intrinsic::aarch64_neon_st3:
14890 case Intrinsic::aarch64_neon_st4:
14891 case Intrinsic::aarch64_neon_st1x2:
14892 case Intrinsic::aarch64_neon_st1x3:
14893 case Intrinsic::aarch64_neon_st1x4: {
14894 Info.opc = ISD::INTRINSIC_VOID;
14895 unsigned NumElts = 0;
14896 for (const Value *Arg : I.args()) {
14897 Type *ArgTy = Arg->getType();
14898 if (!ArgTy->isVectorTy())
14899 break;
14900 NumElts += DL.getTypeSizeInBits(Ty: ArgTy) / 64;
14901 }
14902 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14903 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
14904 Info.offset = 0;
14905 Info.align.reset();
14906 // volatile stores with NEON intrinsics not supported
14907 Info.flags = MachineMemOperand::MOStore;
14908 return true;
14909 }
14910 case Intrinsic::aarch64_neon_st2lane:
14911 case Intrinsic::aarch64_neon_st3lane:
14912 case Intrinsic::aarch64_neon_st4lane: {
14913 Info.opc = ISD::INTRINSIC_VOID;
14914 unsigned NumElts = 0;
14915    // All of the vector operands have the same type.
14916 Type *VecTy = I.getArgOperand(i: 0)->getType();
14917 MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType();
14918
14919 for (const Value *Arg : I.args()) {
14920 Type *ArgTy = Arg->getType();
14921 if (!ArgTy->isVectorTy())
14922 break;
14923 NumElts += 1;
14924 }
14925
14926 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts);
14927 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
14928 Info.offset = 0;
14929 Info.align.reset();
14930 // volatile stores with NEON intrinsics not supported
14931 Info.flags = MachineMemOperand::MOStore;
14932 return true;
14933 }
14934 case Intrinsic::aarch64_ldaxr:
14935 case Intrinsic::aarch64_ldxr: {
14936 Type *ValTy = I.getParamElementType(ArgNo: 0);
14937 Info.opc = ISD::INTRINSIC_W_CHAIN;
14938 Info.memVT = MVT::getVT(Ty: ValTy);
14939 Info.ptrVal = I.getArgOperand(i: 0);
14940 Info.offset = 0;
14941 Info.align = DL.getABITypeAlign(Ty: ValTy);
14942 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
14943 return true;
14944 }
14945 case Intrinsic::aarch64_stlxr:
14946 case Intrinsic::aarch64_stxr: {
14947 Type *ValTy = I.getParamElementType(ArgNo: 1);
14948 Info.opc = ISD::INTRINSIC_W_CHAIN;
14949 Info.memVT = MVT::getVT(Ty: ValTy);
14950 Info.ptrVal = I.getArgOperand(i: 1);
14951 Info.offset = 0;
14952 Info.align = DL.getABITypeAlign(Ty: ValTy);
14953 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
14954 return true;
14955 }
14956 case Intrinsic::aarch64_ldaxp:
14957 case Intrinsic::aarch64_ldxp:
14958 Info.opc = ISD::INTRINSIC_W_CHAIN;
14959 Info.memVT = MVT::i128;
14960 Info.ptrVal = I.getArgOperand(i: 0);
14961 Info.offset = 0;
14962 Info.align = Align(16);
14963 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
14964 return true;
14965 case Intrinsic::aarch64_stlxp:
14966 case Intrinsic::aarch64_stxp:
14967 Info.opc = ISD::INTRINSIC_W_CHAIN;
14968 Info.memVT = MVT::i128;
14969 Info.ptrVal = I.getArgOperand(i: 2);
14970 Info.offset = 0;
14971 Info.align = Align(16);
14972 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
14973 return true;
14974 case Intrinsic::aarch64_sve_ldnt1: {
14975 Type *ElTy = cast<VectorType>(Val: I.getType())->getElementType();
14976 Info.opc = ISD::INTRINSIC_W_CHAIN;
14977 Info.memVT = MVT::getVT(Ty: I.getType());
14978 Info.ptrVal = I.getArgOperand(i: 1);
14979 Info.offset = 0;
14980 Info.align = DL.getABITypeAlign(Ty: ElTy);
14981 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
14982 return true;
14983 }
14984 case Intrinsic::aarch64_sve_stnt1: {
14985 Type *ElTy =
14986 cast<VectorType>(Val: I.getArgOperand(i: 0)->getType())->getElementType();
14987 Info.opc = ISD::INTRINSIC_W_CHAIN;
14988 Info.memVT = MVT::getVT(Ty: I.getOperand(i_nocapture: 0)->getType());
14989 Info.ptrVal = I.getArgOperand(i: 2);
14990 Info.offset = 0;
14991 Info.align = DL.getABITypeAlign(Ty: ElTy);
14992 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
14993 return true;
14994 }
14995 case Intrinsic::aarch64_mops_memset_tag: {
14996 Value *Dst = I.getArgOperand(i: 0);
14997 Value *Val = I.getArgOperand(i: 1);
14998 Info.opc = ISD::INTRINSIC_W_CHAIN;
14999 Info.memVT = MVT::getVT(Ty: Val->getType());
15000 Info.ptrVal = Dst;
15001 Info.offset = 0;
15002 Info.align = I.getParamAlign(ArgNo: 0).valueOrOne();
15003 Info.flags = MachineMemOperand::MOStore;
15004 // The size of the memory being operated on is unknown at this point
15005 Info.size = MemoryLocation::UnknownSize;
15006 return true;
15007 }
15008 default:
15009 break;
15010 }
15011
15012 return false;
15013}
15014
15015bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15016 ISD::LoadExtType ExtTy,
15017 EVT NewVT) const {
15018 // TODO: This may be worth removing. Check regression tests for diffs.
15019 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15020 return false;
15021
15022 // If we're reducing the load width in order to avoid having to use an extra
15023 // instruction to do the extension, then it's probably a good idea.
15024 if (ExtTy != ISD::NON_EXTLOAD)
15025 return true;
15026 // Don't reduce load width if it would prevent us from combining a shift into
15027 // the offset.
15028 MemSDNode *Mem = dyn_cast<MemSDNode>(Val: Load);
15029 assert(Mem);
15030 const SDValue &Base = Mem->getBasePtr();
15031 if (Base.getOpcode() == ISD::ADD &&
15032 Base.getOperand(i: 1).getOpcode() == ISD::SHL &&
15033 Base.getOperand(i: 1).hasOneUse() &&
15034 Base.getOperand(i: 1).getOperand(i: 1).getOpcode() == ISD::Constant) {
15035 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15036 if (Mem->getMemoryVT().isScalableVector())
15037 return false;
15038 // The shift can be combined if it matches the size of the value being
15039 // loaded (and so reducing the width would make it not match).
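// For example (illustrative): an i64 load whose address is (add x, (shl y, 3))
// can use the [x, y, lsl #3] register-offset form, where the shift equals
// log2 of the access size; narrowing the load to i32 would require lsl #2,
// so the shl could no longer be folded.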
15040 uint64_t ShiftAmount = Base.getOperand(i: 1).getConstantOperandVal(i: 1);
15041 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15042 if (ShiftAmount == Log2_32(Value: LoadBytes))
15043 return false;
15044 }
15045 // We have no reason to disallow reducing the load width, so allow it.
15046 return true;
15047}
15048
15049// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15050bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15051 EVT VT = Extend.getValueType();
15052 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15053 SDValue Extract = Extend.getOperand(i: 0);
15054 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15055 Extract = Extract.getOperand(i: 0);
15056 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15057 EVT VecVT = Extract.getOperand(i: 0).getValueType();
15058 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15059 return false;
15060 }
15061 }
15062 return true;
15063}
15064
15065// Truncation from a 64-bit GPR to a 32-bit GPR is free.
15066bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
15067 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15068 return false;
15069 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15070 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15071 return NumBits1 > NumBits2;
15072}
15073bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
15074 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15075 return false;
15076 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15077 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15078 return NumBits1 > NumBits2;
15079}
15080
15081/// Check if it is profitable to hoist an instruction in then/else to if.
15082/// It is not profitable if I and its user can form an FMA instruction,
15083/// because we prefer FMSUB/FMADD.
15084bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
15085 if (I->getOpcode() != Instruction::FMul)
15086 return true;
15087
15088 if (!I->hasOneUse())
15089 return true;
15090
15091 Instruction *User = I->user_back();
15092
15093 if (!(User->getOpcode() == Instruction::FSub ||
15094 User->getOpcode() == Instruction::FAdd))
15095 return true;
15096
15097 const TargetOptions &Options = getTargetMachine().Options;
15098 const Function *F = I->getFunction();
15099 const DataLayout &DL = F->getParent()->getDataLayout();
15100 Type *Ty = User->getOperand(i: 0)->getType();
15101
15102 return !(isFMAFasterThanFMulAndFAdd(F: *F, Ty) &&
15103 isOperationLegalOrCustom(Op: ISD::FMA, VT: getValueType(DL, Ty)) &&
15104 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15105 Options.UnsafeFPMath));
15106}
15107
15108// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15109// 64-bit GPR.
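// For example, 'add w8, w9, w10' or 'ldr w8, [x0]' writes zeros to bits
// [63:32] of x8, so a later zext from i32 to i64 needs no extra instruction.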
15110bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
15111 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15112 return false;
15113 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15114 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15115 return NumBits1 == 32 && NumBits2 == 64;
15116}
15117bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
15118 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15119 return false;
15120 unsigned NumBits1 = VT1.getSizeInBits();
15121 unsigned NumBits2 = VT2.getSizeInBits();
15122 return NumBits1 == 32 && NumBits2 == 64;
15123}
15124
15125bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
15126 EVT VT1 = Val.getValueType();
15127 if (isZExtFree(VT1, VT2)) {
15128 return true;
15129 }
15130
15131 if (Val.getOpcode() != ISD::LOAD)
15132 return false;
15133
15134 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15135 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15136 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15137 VT1.getSizeInBits() <= 32);
15138}
15139
15140bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15141 if (isa<FPExtInst>(Val: Ext))
15142 return false;
15143
15144 // Vector types are not free.
15145 if (Ext->getType()->isVectorTy())
15146 return false;
15147
15148 for (const Use &U : Ext->uses()) {
15149 // The extension is free if we can fold it with a left shift in an
15150 // addressing mode or an arithmetic operation: add, sub, and cmp.
15151
15152 // Is there a shift?
15153 const Instruction *Instr = cast<Instruction>(Val: U.getUser());
15154
15155 // Is this a constant shift?
15156 switch (Instr->getOpcode()) {
15157 case Instruction::Shl:
15158 if (!isa<ConstantInt>(Val: Instr->getOperand(i: 1)))
15159 return false;
15160 break;
15161 case Instruction::GetElementPtr: {
15162 gep_type_iterator GTI = gep_type_begin(GEP: Instr);
15163 auto &DL = Ext->getModule()->getDataLayout();
15164 std::advance(i&: GTI, n: U.getOperandNo()-1);
15165 Type *IdxTy = GTI.getIndexedType();
15166 // This extension will end up with a shift because of the scaling factor.
15167 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15168 // Get the shift amount based on the scaling factor:
15169 // log2(sizeof(IdxTy)) - log2(8).
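// For example, an i64 indexed type gives ShiftAmt = log2(64) - 3 = 3, which
// the addressing mode can fold.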
15170 if (IdxTy->isScalableTy())
15171 return false;
15172 uint64_t ShiftAmt =
15173 llvm::countr_zero(Val: DL.getTypeStoreSizeInBits(Ty: IdxTy).getFixedValue()) -
15174 3;
15175 // Is the constant foldable in the shift of the addressing mode?
15176 // I.e., shift amount is between 1 and 4 inclusive.
15177 if (ShiftAmt == 0 || ShiftAmt > 4)
15178 return false;
15179 break;
15180 }
15181 case Instruction::Trunc:
15182 // Check if this is a noop.
15183 // trunc(sext ty1 to ty2) to ty1.
15184 if (Instr->getType() == Ext->getOperand(i: 0)->getType())
15185 continue;
15186 [[fallthrough]];
15187 default:
15188 return false;
15189 }
15190
15191 // At this point we can use the bfm family, so this extension is free
15192 // for that use.
15193 }
15194 return true;
15195}
15196
15197static bool isSplatShuffle(Value *V) {
15198 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V))
15199 return all_equal(Range: Shuf->getShuffleMask());
15200 return false;
15201}
15202
15203/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15204/// or upper half of the vector elements.
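/// For example (illustrative IR), both operands extract the upper half of
/// <8 x i16> sources, so they satisfy this check:
///   %op1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
///   %op2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>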
15205static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15206 bool AllowSplat = false) {
15207 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15208 auto *FullTy = FullV->getType();
15209 auto *HalfTy = HalfV->getType();
15210 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15211 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15212 };
15213
15214 auto extractHalf = [](Value *FullV, Value *HalfV) {
15215 auto *FullVT = cast<FixedVectorType>(Val: FullV->getType());
15216 auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType());
15217 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15218 };
15219
15220 ArrayRef<int> M1, M2;
15221 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15222 if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask(M1))) ||
15223 !match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask(M2))))
15224 return false;
15225
15226 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15227 // it is not checked as an extract below.
15228 if (AllowSplat && isSplatShuffle(V: Op1))
15229 S1Op1 = nullptr;
15230 if (AllowSplat && isSplatShuffle(V: Op2))
15231 S2Op1 = nullptr;
15232
15233 // Check that the operands are half as wide as the result and we extract
15234 // half of the elements of the input vectors.
15235 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15236 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15237 return false;
15238
15239 // Check the mask extracts either the lower or upper half of vector
15240 // elements.
15241 int M1Start = 0;
15242 int M2Start = 0;
15243 int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * 2;
15244 if ((S1Op1 &&
15245 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) ||
15246 (S2Op1 &&
15247 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start)))
15248 return false;
15249
15250 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15251 (M2Start != 0 && M2Start != (NumElements / 2)))
15252 return false;
15253 if (S1Op1 && S2Op1 && M1Start != M2Start)
15254 return false;
15255
15256 return true;
15257}
15258
15259/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15260/// of the vector elements.
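/// For example (illustrative IR), both of these double the element width and
/// would be accepted:
///   %e1 = sext <4 x i16> %a to <4 x i32>
///   %e2 = zext <4 x i16> %b to <4 x i32>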
15261static bool areExtractExts(Value *Ext1, Value *Ext2) {
15262 auto areExtDoubled = [](Instruction *Ext) {
15263 return Ext->getType()->getScalarSizeInBits() ==
15264 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
15265 };
15266
15267 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
15268 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
15269 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
15270 !areExtDoubled(cast<Instruction>(Val: Ext2)))
15271 return false;
15272
15273 return true;
15274}
15275
15276/// Check if Op could be used with vmull_high_p64 intrinsic.
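/// i.e. (illustrative) an 'extractelement <2 x i64> %v, i64 1' that extracts
/// the high lane of a 128-bit vector.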
15277static bool isOperandOfVmullHighP64(Value *Op) {
15278 Value *VectorOperand = nullptr;
15279 ConstantInt *ElementIndex = nullptr;
15280 return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand),
15281 Idx: m_ConstantInt(CI&: ElementIndex))) &&
15282 ElementIndex->getValue() == 1 &&
15283 isa<FixedVectorType>(Val: VectorOperand->getType()) &&
15284 cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == 2;
15285}
15286
15287/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15288static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15289 return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2);
15290}
15291
15292static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
15293 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15294 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs);
15295 if (!GEP || GEP->getNumOperands() != 2)
15296 return false;
15297
15298 Value *Base = GEP->getOperand(i_nocapture: 0);
15299 Value *Offsets = GEP->getOperand(i_nocapture: 1);
15300
15301 // We only care about scalar_base+vector_offsets.
15302 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15303 return false;
15304
15305 // Sink extends that would allow us to use 32-bit offset vectors.
15306 if (isa<SExtInst>(Val: Offsets) || isa<ZExtInst>(Val: Offsets)) {
15307 auto *OffsetsInst = cast<Instruction>(Val: Offsets);
15308 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15309 OffsetsInst->getOperand(i: 0)->getType()->getScalarSizeInBits() <= 32)
15310 Ops.push_back(Elt: &GEP->getOperandUse(i: 1));
15311 }
15312
15313 // Sink the GEP.
15314 return true;
15315}
15316
15317/// We want to sink the following cases:
15318/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
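/// For example (illustrative IR):
///   %vs = call i64 @llvm.vscale.i64()
///   %sh = shl i64 %vs, 4
///   %p = getelementptr i8, ptr %base, i64 %sh
/// Here both the vscale call and the shl would be sunk next to the gep.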
15319static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
15320 if (match(V: Op, P: m_VScale()))
15321 return true;
15322 if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) ||
15323 match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) {
15324 Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
15325 return true;
15326 }
15327 return false;
15328}
15329
15330/// Check if sinking \p I's operands to I's basic block is profitable, because
15331/// the operands can be folded into a target instruction, e.g.
15332/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
15333bool AArch64TargetLowering::shouldSinkOperands(
15334 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15335 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
15336 switch (II->getIntrinsicID()) {
15337 case Intrinsic::aarch64_neon_smull:
15338 case Intrinsic::aarch64_neon_umull:
15339 if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1),
15340 /*AllowSplat=*/true)) {
15341 Ops.push_back(Elt: &II->getOperandUse(i: 0));
15342 Ops.push_back(Elt: &II->getOperandUse(i: 1));
15343 return true;
15344 }
15345 [[fallthrough]];
15346
15347 case Intrinsic::fma:
15348 if (isa<VectorType>(Val: I->getType()) &&
15349 cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
15350 !Subtarget->hasFullFP16())
15351 return false;
15352 [[fallthrough]];
15353 case Intrinsic::aarch64_neon_sqdmull:
15354 case Intrinsic::aarch64_neon_sqdmulh:
15355 case Intrinsic::aarch64_neon_sqrdmulh:
15356 // Sink splats for index lane variants
15357 if (isSplatShuffle(V: II->getOperand(i_nocapture: 0)))
15358 Ops.push_back(Elt: &II->getOperandUse(i: 0));
15359 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
15360 Ops.push_back(Elt: &II->getOperandUse(i: 1));
15361 return !Ops.empty();
15362 case Intrinsic::aarch64_neon_fmlal:
15363 case Intrinsic::aarch64_neon_fmlal2:
15364 case Intrinsic::aarch64_neon_fmlsl:
15365 case Intrinsic::aarch64_neon_fmlsl2:
15366 // Sink splats for index lane variants
15367 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
15368 Ops.push_back(Elt: &II->getOperandUse(i: 1));
15369 if (isSplatShuffle(V: II->getOperand(i_nocapture: 2)))
15370 Ops.push_back(Elt: &II->getOperandUse(i: 2));
15371 return !Ops.empty();
15372 case Intrinsic::aarch64_sve_ptest_first:
15373 case Intrinsic::aarch64_sve_ptest_last:
15374 if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: 0)))
15375 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15376 Ops.push_back(Elt: &II->getOperandUse(i: 0));
15377 return !Ops.empty();
15378 case Intrinsic::aarch64_sme_write_horiz:
15379 case Intrinsic::aarch64_sme_write_vert:
15380 case Intrinsic::aarch64_sme_writeq_horiz:
15381 case Intrinsic::aarch64_sme_writeq_vert: {
15382 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 1));
15383 if (!Idx || Idx->getOpcode() != Instruction::Add)
15384 return false;
15385 Ops.push_back(Elt: &II->getOperandUse(i: 1));
15386 return true;
15387 }
15388 case Intrinsic::aarch64_sme_read_horiz:
15389 case Intrinsic::aarch64_sme_read_vert:
15390 case Intrinsic::aarch64_sme_readq_horiz:
15391 case Intrinsic::aarch64_sme_readq_vert:
15392 case Intrinsic::aarch64_sme_ld1b_vert:
15393 case Intrinsic::aarch64_sme_ld1h_vert:
15394 case Intrinsic::aarch64_sme_ld1w_vert:
15395 case Intrinsic::aarch64_sme_ld1d_vert:
15396 case Intrinsic::aarch64_sme_ld1q_vert:
15397 case Intrinsic::aarch64_sme_st1b_vert:
15398 case Intrinsic::aarch64_sme_st1h_vert:
15399 case Intrinsic::aarch64_sme_st1w_vert:
15400 case Intrinsic::aarch64_sme_st1d_vert:
15401 case Intrinsic::aarch64_sme_st1q_vert:
15402 case Intrinsic::aarch64_sme_ld1b_horiz:
15403 case Intrinsic::aarch64_sme_ld1h_horiz:
15404 case Intrinsic::aarch64_sme_ld1w_horiz:
15405 case Intrinsic::aarch64_sme_ld1d_horiz:
15406 case Intrinsic::aarch64_sme_ld1q_horiz:
15407 case Intrinsic::aarch64_sme_st1b_horiz:
15408 case Intrinsic::aarch64_sme_st1h_horiz:
15409 case Intrinsic::aarch64_sme_st1w_horiz:
15410 case Intrinsic::aarch64_sme_st1d_horiz:
15411 case Intrinsic::aarch64_sme_st1q_horiz: {
15412 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 3));
15413 if (!Idx || Idx->getOpcode() != Instruction::Add)
15414 return false;
15415 Ops.push_back(Elt: &II->getOperandUse(i: 3));
15416 return true;
15417 }
15418 case Intrinsic::aarch64_neon_pmull:
15419 if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1)))
15420 return false;
15421 Ops.push_back(Elt: &II->getOperandUse(i: 0));
15422 Ops.push_back(Elt: &II->getOperandUse(i: 1));
15423 return true;
15424 case Intrinsic::aarch64_neon_pmull64:
15425 if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: 0),
15426 Op2: II->getArgOperand(i: 1)))
15427 return false;
15428 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
15429 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
15430 return true;
15431 case Intrinsic::masked_gather:
15432 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 0), Ops))
15433 return false;
15434 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
15435 return true;
15436 case Intrinsic::masked_scatter:
15437 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 1), Ops))
15438 return false;
15439 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
15440 return true;
15441 default:
15442 return false;
15443 }
15444 }
15445
15446 // Sink vscales closer to uses for better isel
15447 switch (I->getOpcode()) {
15448 case Instruction::GetElementPtr:
15449 case Instruction::Add:
15450 case Instruction::Sub:
15451 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15452 if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) {
15453 Ops.push_back(Elt: &I->getOperandUse(i: Op));
15454 return true;
15455 }
15456 }
15457 break;
15458 default:
15459 break;
15460 }
15461
15462 if (!I->getType()->isVectorTy())
15463 return false;
15464
15465 switch (I->getOpcode()) {
15466 case Instruction::Sub:
15467 case Instruction::Add: {
15468 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
15469 return false;
15470
15471 // If the exts' operands extract either the lower or upper elements, we
15472 // can sink them too.
15473 auto Ext1 = cast<Instruction>(Val: I->getOperand(i: 0));
15474 auto Ext2 = cast<Instruction>(Val: I->getOperand(i: 1));
15475 if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: 0), Op2: Ext2->getOperand(i: 0))) {
15476 Ops.push_back(Elt: &Ext1->getOperandUse(i: 0));
15477 Ops.push_back(Elt: &Ext2->getOperandUse(i: 0));
15478 }
15479
15480 Ops.push_back(Elt: &I->getOperandUse(i: 0));
15481 Ops.push_back(Elt: &I->getOperandUse(i: 1));
15482
15483 return true;
15484 }
15485 case Instruction::Or: {
15486 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15487 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15488 if (Subtarget->hasNEON()) {
15489 Instruction *OtherAnd, *IA, *IB;
15490 Value *MaskValue;
15491 // MainAnd refers to the And instruction that has 'Not' as one of its operands.
15492 if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)),
15493 R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))),
15494 R: m_Instruction(I&: IA)))))) {
15495 if (match(V: OtherAnd,
15496 P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) {
15497 Instruction *MainAnd = I->getOperand(i: 0) == OtherAnd
15498 ? cast<Instruction>(Val: I->getOperand(i: 1))
15499 : cast<Instruction>(Val: I->getOperand(i: 0));
15500
15501 // Both Ands should be in the same basic block as the Or.
15502 if (I->getParent() != MainAnd->getParent() ||
15503 I->getParent() != OtherAnd->getParent())
15504 return false;
15505
15506 // Non-mask operands of both Ands should also be in the same basic block.
15507 if (I->getParent() != IA->getParent() ||
15508 I->getParent() != IB->getParent())
15509 return false;
15510
15511 Ops.push_back(Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: 0) == IA ? 1 : 0));
15512 Ops.push_back(Elt: &I->getOperandUse(i: 0));
15513 Ops.push_back(Elt: &I->getOperandUse(i: 1));
15514
15515 return true;
15516 }
15517 }
15518 }
15519
15520 return false;
15521 }
15522 case Instruction::Mul: {
15523 int NumZExts = 0, NumSExts = 0;
15524 for (auto &Op : I->operands()) {
15525 // Make sure we are not already sinking this operand
15526 if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
15527 continue;
15528
15529 if (match(V: &Op, P: m_SExt(Op: m_Value()))) {
15530 NumSExts++;
15531 continue;
15532 } else if (match(V: &Op, P: m_ZExt(Op: m_Value()))) {
15533 NumZExts++;
15534 continue;
15535 }
15536
15537 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op);
15538
15539 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15540 // operand and the s/zext can help create indexed s/umull. This is
15541 // especially useful for preventing an i64 mul from being scalarized.
15542 if (Shuffle && isSplatShuffle(V: Shuffle) &&
15543 match(V: Shuffle->getOperand(i_nocapture: 0), P: m_ZExtOrSExt(Op: m_Value()))) {
15544 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
15545 Ops.push_back(Elt: &Op);
15546 if (match(V: Shuffle->getOperand(i_nocapture: 0), P: m_SExt(Op: m_Value())))
15547 NumSExts++;
15548 else
15549 NumZExts++;
15550 continue;
15551 }
15552
15553 if (!Shuffle)
15554 continue;
15555
15556 Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: 0);
15557 InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand);
15558 if (!Insert)
15559 continue;
15560
15561 Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: 1));
15562 if (!OperandInstr)
15563 continue;
15564
15565 ConstantInt *ElementConstant =
15566 dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: 2));
15567 // Check that the insertelement is inserting into element 0
15568 if (!ElementConstant || !ElementConstant->isZero())
15569 continue;
15570
15571 unsigned Opcode = OperandInstr->getOpcode();
15572 if (Opcode == Instruction::SExt)
15573 NumSExts++;
15574 else if (Opcode == Instruction::ZExt)
15575 NumZExts++;
15576 else {
15577 // If we find that the top bits are known 0, then we can sink and allow
15578 // the backend to generate a umull.
15579 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15580 APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / 2);
15581 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15582 if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, DL))
15583 continue;
15584 NumZExts++;
15585 }
15586
15587 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
15588 Ops.push_back(Elt: &Op);
15589 }
15590
15591 // It is only profitable to sink if we found two extends of the same type.
15592 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15593 }
15594 default:
15595 return false;
15596 }
15597 return false;
15598}
15599
15600static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy,
15601 bool IsLittleEndian) {
15602 Value *Op = ZExt->getOperand(i_nocapture: 0);
15603 auto *SrcTy = cast<FixedVectorType>(Val: Op->getType());
15604 auto SrcWidth = cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
15605 auto DstWidth = cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
15606 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15607 return false;
15608
15609 assert(DstWidth % SrcWidth == 0 &&
15610 "TBL lowering is not supported for a ZExt instruction with this "
15611 "source & destination element type.");
15612 unsigned ZExtFactor = DstWidth / SrcWidth;
15613 unsigned NumElts = SrcTy->getNumElements();
15614 IRBuilder<> Builder(ZExt);
15615 SmallVector<int> Mask;
15616 // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15617 // vector to replace the original ZExt. This can later be lowered to a set of
15618 // tbl instructions.
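// For example (illustrative), a zext from <4 x i8> to <4 x i32> on a
// little-endian target has ZExtFactor 4 and uses the mask
// <0, 4, 4, 4, 1, 4, 4, 4, 2, 4, 4, 4, 3, 4, 4, 4>, where index 4
// (== NumElts) picks the known-zero lane inserted below.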
15619 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15620 if (IsLittleEndian) {
15621 if (i % ZExtFactor == 0)
15622 Mask.push_back(Elt: i / ZExtFactor);
15623 else
15624 Mask.push_back(Elt: NumElts);
15625 } else {
15626 if ((i + 1) % ZExtFactor == 0)
15627 Mask.push_back(Elt: (i - ZExtFactor + 1) / ZExtFactor);
15628 else
15629 Mask.push_back(Elt: NumElts);
15630 }
15631 }
15632
15633 auto *FirstEltZero = Builder.CreateInsertElement(
15634 Vec: PoisonValue::get(T: SrcTy), NewElt: Builder.getInt8(C: 0), Idx: uint64_t(0));
15635 Value *Result = Builder.CreateShuffleVector(V1: Op, V2: FirstEltZero, Mask);
15636 Result = Builder.CreateBitCast(V: Result, DestTy: DstTy);
15637 if (DstTy != ZExt->getType())
15638 Result = Builder.CreateZExt(V: Result, DestTy: ZExt->getType());
15639 ZExt->replaceAllUsesWith(V: Result);
15640 ZExt->eraseFromParent();
15641 return true;
15642}
15643
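// Lower a fixed-width vector truncate to <N x i8> elements into one or more
// NEON tbl calls that select the relevant byte of each source lane; see the
// callers below for the patterns this handles.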
15644static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15645 IRBuilder<> Builder(TI);
15646 SmallVector<Value *> Parts;
15647 int NumElements = cast<FixedVectorType>(Val: TI->getType())->getNumElements();
15648 auto *SrcTy = cast<FixedVectorType>(Val: TI->getOperand(i_nocapture: 0)->getType());
15649 auto *DstTy = cast<FixedVectorType>(Val: TI->getType());
15650 assert(SrcTy->getElementType()->isIntegerTy() &&
15651 "Non-integer type source vector element is not supported");
15652 assert(DstTy->getElementType()->isIntegerTy(8) &&
15653 "Unsupported destination vector element type");
15654 unsigned SrcElemTySz =
15655 cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
15656 unsigned DstElemTySz =
15657 cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
15658 assert((SrcElemTySz % DstElemTySz == 0) &&
15659 "Cannot lower truncate to tbl instructions for a source element size "
15660 "that is not divisible by the destination element size");
15661 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15662 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15663 "Unsupported source vector element type size");
15664 Type *VecTy = FixedVectorType::get(ElementType: Builder.getInt8Ty(), NumElts: 16);
15665
15666 // Create a mask that selects every n-th byte from the source vector's table
15667 // of bytes to form the truncated destination vector, where 'n' is the
15668 // truncation ratio. For example, for a truncate from Yxi64 to Yxi8, choose
15669 // bytes 0, 8, 16, ..., (Y-1)*8 in the little-endian format.
15670 SmallVector<Constant *, 16> MaskConst;
15671 for (int Itr = 0; Itr < 16; Itr++) {
15672 if (Itr < NumElements)
15673 MaskConst.push_back(Elt: Builder.getInt8(
15674 C: IsLittleEndian ? Itr * TruncFactor
15675 : Itr * TruncFactor + (TruncFactor - 1)));
15676 else
15677 MaskConst.push_back(Elt: Builder.getInt8(C: 255));
15678 }
15679
15680 int MaxTblSz = 128 * 4;
15681 int MaxSrcSz = SrcElemTySz * NumElements;
15682 int ElemsPerTbl =
15683 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
15684 assert(ElemsPerTbl <= 16 &&
15685 "Maximum elements selected using TBL instruction cannot exceed 16!");
15686
15687 int ShuffleCount = 128 / SrcElemTySz;
15688 SmallVector<int> ShuffleLanes;
15689 for (int i = 0; i < ShuffleCount; ++i)
15690 ShuffleLanes.push_back(Elt: i);
15691
15692 // Build TBL's table of bytes in 1, 2, 3, or 4 FP/SIMD registers using
15693 // shuffles over the source vector. Once TBL's maximum of 4 FP/SIMD registers
15694 // is saturated, call TBL and save the result for combining later.
15695 SmallVector<Value *> Results;
15696 while (ShuffleLanes.back() < NumElements) {
15697 Parts.push_back(Elt: Builder.CreateBitCast(
15698 V: Builder.CreateShuffleVector(V: TI->getOperand(i_nocapture: 0), Mask: ShuffleLanes), DestTy: VecTy));
15699
15700 if (Parts.size() == 4) {
15701 auto *F = Intrinsic::getDeclaration(TI->getModule(),
15702 Intrinsic::aarch64_neon_tbl4, VecTy);
15703 Parts.push_back(Elt: ConstantVector::get(V: MaskConst));
15704 Results.push_back(Elt: Builder.CreateCall(F, Parts));
15705 Parts.clear();
15706 }
15707
15708 for (int i = 0; i < ShuffleCount; ++i)
15709 ShuffleLanes[i] += ShuffleCount;
15710 }
15711
15712 assert((Parts.empty() || Results.empty()) &&
15713 "Lowering trunc for vectors requiring different TBL instructions is "
15714 "not supported!");
15715 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
15716 // registers
15717 if (!Parts.empty()) {
15718 Intrinsic::ID TblID;
15719 switch (Parts.size()) {
15720 case 1:
15721 TblID = Intrinsic::aarch64_neon_tbl1;
15722 break;
15723 case 2:
15724 TblID = Intrinsic::aarch64_neon_tbl2;
15725 break;
15726 case 3:
15727 TblID = Intrinsic::aarch64_neon_tbl3;
15728 break;
15729 }
15730
15731 auto *F = Intrinsic::getDeclaration(M: TI->getModule(), id: TblID, Tys: VecTy);
15732 Parts.push_back(Elt: ConstantVector::get(V: MaskConst));
15733 Results.push_back(Elt: Builder.CreateCall(Callee: F, Args: Parts));
15734 }
15735
15736 // Extract the destination vector from TBL result(s) after combining them
15737 // where applicable. Currently, at most two TBLs are supported.
15738 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15739 "more than 2 tbl instructions!");
15740 Value *FinalResult = Results[0];
15741 if (Results.size() == 1) {
15742 if (ElemsPerTbl < 16) {
15743 SmallVector<int> FinalMask(ElemsPerTbl);
15744 std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0);
15745 FinalResult = Builder.CreateShuffleVector(V: Results[0], Mask: FinalMask);
15746 }
15747 } else {
15748 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15749 if (ElemsPerTbl < 16) {
15750 std::iota(first: FinalMask.begin(), last: FinalMask.begin() + ElemsPerTbl, value: 0);
15751 std::iota(first: FinalMask.begin() + ElemsPerTbl, last: FinalMask.end(), value: 16);
15752 } else {
15753 std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0);
15754 }
15755 FinalResult =
15756 Builder.CreateShuffleVector(V1: Results[0], V2: Results[1], Mask: FinalMask);
15757 }
15758
15759 TI->replaceAllUsesWith(V: FinalResult);
15760 TI->eraseFromParent();
15761}
15762
15763bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
15764 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15765 // shuffle_vector instructions are serialized when targeting SVE,
15766 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15767 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15768 return false;
15769
15770 // Try to optimize conversions using tbl. This requires materializing constant
15771 // index vectors, which can increase code size and add loads. Skip the
15772 // transform unless the conversion is in a loop block guaranteed to execute
15773 // and we are not optimizing for size.
15774 Function *F = I->getParent()->getParent();
15775 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15776 F->hasOptSize())
15777 return false;
15778
15779 auto *SrcTy = dyn_cast<FixedVectorType>(Val: I->getOperand(i: 0)->getType());
15780 auto *DstTy = dyn_cast<FixedVectorType>(Val: I->getType());
15781 if (!SrcTy || !DstTy)
15782 return false;
15783
15784 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
15785 // lowered to tbl instructions to insert the original i8 elements
15786 // into i8x lanes. This is enabled for cases where it is beneficial.
15787 auto *ZExt = dyn_cast<ZExtInst>(Val: I);
15788 if (ZExt && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
15789 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15790 if (DstWidth % 8 != 0)
15791 return false;
15792
15793 auto *TruncDstType =
15794 cast<FixedVectorType>(Val: VectorType::getTruncatedElementVectorType(VTy: DstTy));
15795 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15796 // the remaining ZExt folded into the user, don't use tbl lowering.
15797 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15798 if (TTI.getCastInstrCost(Opcode: I->getOpcode(), Dst: DstTy, Src: TruncDstType,
15799 CCH: TargetTransformInfo::getCastContextHint(I),
15800 CostKind: TTI::TCK_SizeAndLatency, I) == TTI::TCC_Free) {
15801 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15802 return false;
15803
15804 DstTy = TruncDstType;
15805 }
15806
15807 return createTblShuffleForZExt(ZExt, DstTy, IsLittleEndian: Subtarget->isLittleEndian());
15808 }
15809
15810 auto *UIToFP = dyn_cast<UIToFPInst>(Val: I);
15811 if (UIToFP && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
15812 DstTy->getElementType()->isFloatTy()) {
15813 IRBuilder<> Builder(I);
15814 auto *ZExt = cast<ZExtInst>(
15815 Val: Builder.CreateZExt(V: I->getOperand(i: 0), DestTy: VectorType::getInteger(VTy: DstTy)));
15816 auto *UI = Builder.CreateUIToFP(V: ZExt, DestTy: DstTy);
15817 I->replaceAllUsesWith(V: UI);
15818 I->eraseFromParent();
15819 return createTblShuffleForZExt(ZExt, DstTy: cast<FixedVectorType>(Val: ZExt->getType()),
15820 IsLittleEndian: Subtarget->isLittleEndian());
15821 }
15822
15823 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15824 // followed by a truncate lowered using tbl.4.
15825 auto *FPToUI = dyn_cast<FPToUIInst>(Val: I);
15826 if (FPToUI &&
15827 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15828 SrcTy->getElementType()->isFloatTy() &&
15829 DstTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
15830 IRBuilder<> Builder(I);
15831 auto *WideConv = Builder.CreateFPToUI(V: FPToUI->getOperand(i_nocapture: 0),
15832 DestTy: VectorType::getInteger(VTy: SrcTy));
15833 auto *TruncI = Builder.CreateTrunc(V: WideConv, DestTy: DstTy);
15834 I->replaceAllUsesWith(V: TruncI);
15835 I->eraseFromParent();
15836 createTblForTrunc(TI: cast<TruncInst>(Val: TruncI), IsLittleEndian: Subtarget->isLittleEndian());
15837 return true;
15838 }
15839
15840 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15841 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
15842 // per lane of the input, which is represented using 1, 2, 3 or 4 128-bit
15843 // table registers.
15844 auto *TI = dyn_cast<TruncInst>(Val: I);
15845 if (TI && DstTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
15846 ((SrcTy->getElementType()->isIntegerTy(Bitwidth: 32) ||
15847 SrcTy->getElementType()->isIntegerTy(Bitwidth: 64)) &&
15848 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15849 createTblForTrunc(TI, IsLittleEndian: Subtarget->isLittleEndian());
15850 return true;
15851 }
15852
15853 return false;
15854}
15855
15856bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
15857 Align &RequiredAligment) const {
15858 if (!LoadedType.isSimple() ||
15859 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15860 return false;
15861 // Cyclone supports unaligned accesses.
15862 RequiredAligment = Align(1);
15863 unsigned NumBits = LoadedType.getSizeInBits();
15864 return NumBits == 32 || NumBits == 64;
15865}
15866
15867/// A helper function for determining the number of interleaved accesses we
15868/// will generate when lowering accesses of the given type.
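/// For example (illustrative), a fixed-length <16 x i32> lowered with 128-bit
/// NEON vectors needs (16 * 32 + 127) / 128 = 4 interleaved accesses.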
15869unsigned AArch64TargetLowering::getNumInterleavedAccesses(
15870 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15871 unsigned VecSize = 128;
15872 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
15873 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15874 if (UseScalable && isa<FixedVectorType>(Val: VecTy))
15875 VecSize = std::max(a: Subtarget->getMinSVEVectorSizeInBits(), b: 128u);
15876 return std::max<unsigned>(a: 1, b: (MinElts * ElSize + 127) / VecSize);
15877}
15878
15879MachineMemOperand::Flags
15880AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
15881 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15882 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
15883 return MOStridedAccess;
15884 return MachineMemOperand::MONone;
15885}
15886
15887bool AArch64TargetLowering::isLegalInterleavedAccessType(
15888 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
15889 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
15890 auto EC = VecTy->getElementCount();
15891 unsigned MinElts = EC.getKnownMinValue();
15892
15893 UseScalable = false;
15894
15895 if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
15896 return false;
15897
15898 if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
15899 return false;
15900
15901 // Ensure that the predicate for this number of elements is available.
15902 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
15903 return false;
15904
15905 // Ensure the number of vector elements is greater than 1.
15906 if (MinElts < 2)
15907 return false;
15908
15909 // Ensure the element type is legal.
15910 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
15911 return false;
15912
15913 if (EC.isScalable()) {
15914 UseScalable = true;
15915 return isPowerOf2_32(Value: MinElts) && (MinElts * ElSize) % 128 == 0;
15916 }
15917
15918 unsigned VecSize = DL.getTypeSizeInBits(Ty: VecTy);
15919 if (!Subtarget->isNeonAvailable() ||
15920 (Subtarget->useSVEForFixedLengthVectors() &&
15921 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
15922 (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
15923 isPowerOf2_32(Value: MinElts) && VecSize > 128)))) {
15924 UseScalable = true;
15925 return true;
15926 }
15927
15928 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
15929 // 128 will be split into multiple interleaved accesses.
15930 return VecSize == 64 || VecSize % 128 == 0;
15931}
15932
15933static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
15934 if (VTy->getElementType() == Type::getDoubleTy(C&: VTy->getContext()))
15935 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2);
15936
15937 if (VTy->getElementType() == Type::getFloatTy(C&: VTy->getContext()))
15938 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4);
15939
15940 if (VTy->getElementType() == Type::getBFloatTy(C&: VTy->getContext()))
15941 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
15942
15943 if (VTy->getElementType() == Type::getHalfTy(C&: VTy->getContext()))
15944 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
15945
15946 if (VTy->getElementType() == Type::getInt64Ty(C&: VTy->getContext()))
15947 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2);
15948
15949 if (VTy->getElementType() == Type::getInt32Ty(C&: VTy->getContext()))
15950 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4);
15951
15952 if (VTy->getElementType() == Type::getInt16Ty(C&: VTy->getContext()))
15953 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
15954
15955 if (VTy->getElementType() == Type::getInt8Ty(C&: VTy->getContext()))
15956 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 16);
15957
15958 llvm_unreachable("Cannot handle input vector type");
15959}
15960
15961static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
15962 bool Scalable, Type *LDVTy,
15963 Type *PtrTy) {
15964 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15965 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
15966 Intrinsic::aarch64_sve_ld3_sret,
15967 Intrinsic::aarch64_sve_ld4_sret};
15968 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
15969 Intrinsic::aarch64_neon_ld3,
15970 Intrinsic::aarch64_neon_ld4};
15971 if (Scalable)
15972 return Intrinsic::getDeclaration(M, id: SVELoads[Factor - 2], Tys: {LDVTy});
15973
15974 return Intrinsic::getDeclaration(M, id: NEONLoads[Factor - 2], Tys: {LDVTy, PtrTy});
15975}
15976
15977static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
15978 bool Scalable, Type *STVTy,
15979 Type *PtrTy) {
15980 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15981 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
15982 Intrinsic::aarch64_sve_st3,
15983 Intrinsic::aarch64_sve_st4};
15984 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
15985 Intrinsic::aarch64_neon_st3,
15986 Intrinsic::aarch64_neon_st4};
15987 if (Scalable)
15988 return Intrinsic::getDeclaration(M, id: SVEStores[Factor - 2], Tys: {STVTy});
15989
15990 return Intrinsic::getDeclaration(M, id: NEONStores[Factor - 2], Tys: {STVTy, PtrTy});
15991}
15992
15993/// Lower an interleaved load into a ldN intrinsic.
15994///
15995/// E.g. Lower an interleaved load (Factor = 2):
15996/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
15997/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
15998/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
15999///
16000/// Into:
16001/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16002/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16003/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16004bool AArch64TargetLowering::lowerInterleavedLoad(
16005 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16006 ArrayRef<unsigned> Indices, unsigned Factor) const {
16007 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16008 "Invalid interleave factor");
16009 assert(!Shuffles.empty() && "Empty shufflevector input");
16010 assert(Shuffles.size() == Indices.size() &&
16011 "Unmatched number of shufflevectors and indices");
16012
16013 const DataLayout &DL = LI->getModule()->getDataLayout();
16014
16015 VectorType *VTy = Shuffles[0]->getType();
16016
16017 // Skip if we do not have NEON or if the vector type is not legal. We can
16018 // "legalize" wide vector types into multiple interleaved accesses as long as
16019 // the vector types are divisible by 128.
16020 bool UseScalable;
16021 if (!Subtarget->hasNEON() ||
16022 !isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
16023 return false;
16024
16025 unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
16026
16027 auto *FVTy = cast<FixedVectorType>(Val: VTy);
16028
16029 // A pointer vector cannot be the return type of the ldN intrinsics. We need
16030 // to load integer vectors first and then convert to pointer vectors.
16031 Type *EltTy = FVTy->getElementType();
16032 if (EltTy->isPointerTy())
16033 FVTy =
16034 FixedVectorType::get(ElementType: DL.getIntPtrType(EltTy), NumElts: FVTy->getNumElements());
16035
16036 // If we're going to generate more than one load, reset the sub-vector type
16037 // to something legal.
16038 FVTy = FixedVectorType::get(ElementType: FVTy->getElementType(),
16039 NumElts: FVTy->getNumElements() / NumLoads);
16040
16041 auto *LDVTy =
16042 UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: FVTy)) : FVTy;
16043
16044 IRBuilder<> Builder(LI);
16045
16046 // The base address of the load.
16047 Value *BaseAddr = LI->getPointerOperand();
16048
16049 Type *PtrTy = LI->getPointerOperandType();
16050 Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: LDVTy->getContext()),
16051 EC: LDVTy->getElementCount());
16052
16053 Function *LdNFunc = getStructuredLoadFunction(M: LI->getModule(), Factor,
16054 Scalable: UseScalable, LDVTy, PtrTy);
16055
16056 // Holds sub-vectors extracted from the load intrinsic return values. The
16057 // sub-vectors are associated with the shufflevector instructions they will
16058 // replace.
16059 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16060
16061 Value *PTrue = nullptr;
16062 if (UseScalable) {
16063 std::optional<unsigned> PgPattern =
16064 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16065 if (Subtarget->getMinSVEVectorSizeInBits() ==
16066 Subtarget->getMaxSVEVectorSizeInBits() &&
16067 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16068 PgPattern = AArch64SVEPredPattern::all;
16069
16070 auto *PTruePat =
16071 ConstantInt::get(Ty: Type::getInt32Ty(C&: LDVTy->getContext()), V: *PgPattern);
16072 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16073 {PTruePat});
16074 }
16075
16076 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16077
16078 // If we're generating more than one load, compute the base address of
16079 // subsequent loads as an offset from the previous.
16080 if (LoadCount > 0)
16081 BaseAddr = Builder.CreateConstGEP1_32(Ty: LDVTy->getElementType(), Ptr: BaseAddr,
16082 Idx0: FVTy->getNumElements() * Factor);
16083
16084 CallInst *LdN;
16085 if (UseScalable)
16086 LdN = Builder.CreateCall(Callee: LdNFunc, Args: {PTrue, BaseAddr}, Name: "ldN");
16087 else
16088 LdN = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN");
16089
16090 // Extract and store the sub-vectors returned by the load intrinsic.
16091 for (unsigned i = 0; i < Shuffles.size(); i++) {
16092 ShuffleVectorInst *SVI = Shuffles[i];
16093 unsigned Index = Indices[i];
16094
16095 Value *SubVec = Builder.CreateExtractValue(Agg: LdN, Idxs: Index);
16096
16097 if (UseScalable)
16098 SubVec = Builder.CreateExtractVector(
16099 DstType: FVTy, SrcVec: SubVec,
16100 Idx: ConstantInt::get(Ty: Type::getInt64Ty(C&: VTy->getContext()), V: 0));
16101
16102 // Convert the integer vector to a pointer vector if the element type is a pointer.
16103 if (EltTy->isPointerTy())
16104 SubVec = Builder.CreateIntToPtr(
16105 V: SubVec, DestTy: FixedVectorType::get(ElementType: SVI->getType()->getElementType(),
16106 NumElts: FVTy->getNumElements()));
16107
16108 SubVecs[SVI].push_back(Elt: SubVec);
16109 }
16110 }
16111
16112 // Replace uses of the shufflevector instructions with the sub-vectors
16113 // returned by the load intrinsic. If a shufflevector instruction is
16114 // associated with more than one sub-vector, those sub-vectors will be
16115 // concatenated into a single wide vector.
16116 for (ShuffleVectorInst *SVI : Shuffles) {
16117 auto &SubVec = SubVecs[SVI];
16118 auto *WideVec =
16119 SubVec.size() > 1 ? concatenateVectors(Builder, Vecs: SubVec) : SubVec[0];
16120 SVI->replaceAllUsesWith(V: WideVec);
16121 }
16122
16123 return true;
16124}
16125
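// Returns true if, within a small window of instructions around \p It, there
// is a store whose address is \p Ptr plus or minus 16 bytes, i.e. a store to
// the adjacent 16-byte chunk. Used below when deciding whether a 64-bit st2
// is better left as a zip plus paired stores.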
16126template <typename Iter>
16127bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16128 int MaxLookupDist = 20;
16129 unsigned IdxWidth = DL.getIndexSizeInBits(AS: 0);
16130 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16131 const Value *PtrA1 =
16132 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetA);
16133
16134 while (++It != End) {
16135 if (It->isDebugOrPseudoInst())
16136 continue;
16137 if (MaxLookupDist-- == 0)
16138 break;
16139 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16140 const Value *PtrB1 =
16141 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16142 DL, OffsetB);
16143 if (PtrA1 == PtrB1 &&
16144 (OffsetA.sextOrTrunc(width: IdxWidth) - OffsetB.sextOrTrunc(width: IdxWidth))
16145 .abs() == 16)
16146 return true;
16147 }
16148 }
16149
16150 return false;
16151}
16152
16153/// Lower an interleaved store into a stN intrinsic.
16154///
16155/// E.g. Lower an interleaved store (Factor = 3):
16156/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16157/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16158/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16159///
16160/// Into:
16161/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16162/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16163/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16164/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16165///
16166/// Note that the new shufflevectors will be removed and we'll only generate one
16167/// st3 instruction in CodeGen.
16168///
16169/// Example for a more general valid mask (Factor 3). Lower:
16170/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16171/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16172/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16173///
16174/// Into:
16175/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16176/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16177/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16178/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16179bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16180 ShuffleVectorInst *SVI,
16181 unsigned Factor) const {
16182
16183 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16184 "Invalid interleave factor");
16185
16186 auto *VecTy = cast<FixedVectorType>(Val: SVI->getType());
16187 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16188
16189 unsigned LaneLen = VecTy->getNumElements() / Factor;
16190 Type *EltTy = VecTy->getElementType();
16191 auto *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: LaneLen);
16192
16193 const DataLayout &DL = SI->getModule()->getDataLayout();
16194 bool UseScalable;
16195
16196 // Skip if we do not have NEON or if the vector type is not legal. We can
16197 // "legalize" wide vector types into multiple interleaved accesses as long as
16198 // the vector types are divisible by 128.
16199 if (!Subtarget->hasNEON() ||
16200 !isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
16201 return false;
16202
16203 unsigned NumStores = getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
16204
16205 Value *Op0 = SVI->getOperand(i_nocapture: 0);
16206 Value *Op1 = SVI->getOperand(i_nocapture: 1);
16207 IRBuilder<> Builder(SI);
16208
16209 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16210 // vectors to integer vectors.
16211 if (EltTy->isPointerTy()) {
16212 Type *IntTy = DL.getIntPtrType(EltTy);
16213 unsigned NumOpElts =
16214 cast<FixedVectorType>(Val: Op0->getType())->getNumElements();
16215
16216 // Convert to the corresponding integer vector.
16217 auto *IntVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: NumOpElts);
16218 Op0 = Builder.CreatePtrToInt(V: Op0, DestTy: IntVecTy);
16219 Op1 = Builder.CreatePtrToInt(V: Op1, DestTy: IntVecTy);
16220
16221 SubVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: LaneLen);
16222 }
16223
16224 // If we're going to generate more than one store, reset the lane length
16225 // and sub-vector type to something legal.
16226 LaneLen /= NumStores;
16227 SubVecTy = FixedVectorType::get(ElementType: SubVecTy->getElementType(), NumElts: LaneLen);
16228
16229 auto *STVTy = UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: SubVecTy))
16230 : SubVecTy;
16231
16232 // The base address of the store.
16233 Value *BaseAddr = SI->getPointerOperand();
16234
16235 auto Mask = SVI->getShuffleMask();
16236
16237 // Sanity check: bail out if none of the mask indices are in range.
16238 // If the mask is `poison`, `Mask` may be a vector of -1s.
16239 // If all of them are `poison`, an out-of-bounds read would happen later.
16240 if (llvm::all_of(Range&: Mask, P: [](int Idx) { return Idx == PoisonMaskElem; })) {
16241 return false;
16242 }
16243 // A 64-bit st2 which does not start at element 0 will involve adding extra
16244 // ext elements, making the st2 unprofitable. In addition, if there is a
16245 // nearby store that points to BaseAddr+16 or BaseAddr-16, it can be better
16246 // left as a zip;stp pair, which has higher throughput.
16247 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16248 (Mask[0] != 0 ||
16249 hasNearbyPairedStore(It: SI->getIterator(), End: SI->getParent()->end(), Ptr: BaseAddr,
16250 DL) ||
16251 hasNearbyPairedStore(It: SI->getReverseIterator(), End: SI->getParent()->rend(),
16252 Ptr: BaseAddr, DL)))
16253 return false;
16254
16255 Type *PtrTy = SI->getPointerOperandType();
16256 Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: STVTy->getContext()),
16257 EC: STVTy->getElementCount());
16258
16259 Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor,
16260 Scalable: UseScalable, STVTy, PtrTy);
16261
16262 Value *PTrue = nullptr;
16263 if (UseScalable) {
16264 std::optional<unsigned> PgPattern =
16265 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16266 if (Subtarget->getMinSVEVectorSizeInBits() ==
16267 Subtarget->getMaxSVEVectorSizeInBits() &&
16268 Subtarget->getMinSVEVectorSizeInBits() ==
16269 DL.getTypeSizeInBits(SubVecTy))
16270 PgPattern = AArch64SVEPredPattern::all;
16271
16272 auto *PTruePat =
16273 ConstantInt::get(Ty: Type::getInt32Ty(C&: STVTy->getContext()), V: *PgPattern);
16274 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16275 {PTruePat});
16276 }
16277
16278 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16279
16280 SmallVector<Value *, 5> Ops;
16281
16282 // Split the shufflevector operands into sub vectors for the new stN call.
16283 for (unsigned i = 0; i < Factor; i++) {
16284 Value *Shuffle;
16285 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16286 if (Mask[IdxI] >= 0) {
16287 Shuffle = Builder.CreateShuffleVector(
16288 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: Mask[IdxI], NumInts: LaneLen, NumUndefs: 0));
16289 } else {
16290 unsigned StartMask = 0;
16291 for (unsigned j = 1; j < LaneLen; j++) {
16292 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16293 if (Mask[IdxJ] >= 0) {
16294 StartMask = Mask[IdxJ] - j;
16295 break;
16296 }
16297 }
16298 // Note: Filling undef gaps with random elements is ok, since
16299 // those elements were being written anyway (with undefs).
16300 // In the case of all undefs we default to using elements from 0.
16301 // Note: StartMask cannot be negative; it is checked in
16302 // isReInterleaveMask.
16303 Shuffle = Builder.CreateShuffleVector(
16304 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: StartMask, NumInts: LaneLen, NumUndefs: 0));
16305 }
16306
16307 if (UseScalable)
16308 Shuffle = Builder.CreateInsertVector(
16309 DstType: STVTy, SrcVec: UndefValue::get(T: STVTy), SubVec: Shuffle,
16310 Idx: ConstantInt::get(Ty: Type::getInt64Ty(C&: STVTy->getContext()), V: 0));
16311
16312 Ops.push_back(Elt: Shuffle);
16313 }
16314
16315 if (UseScalable)
16316 Ops.push_back(Elt: PTrue);
16317
16318 // If we're generating more than one store, compute the base address of
16319 // subsequent stores as an offset from the previous one.
16320 if (StoreCount > 0)
16321 BaseAddr = Builder.CreateConstGEP1_32(Ty: SubVecTy->getElementType(),
16322 Ptr: BaseAddr, Idx0: LaneLen * Factor);
16323
16324 Ops.push_back(Elt: BaseAddr);
16325 Builder.CreateCall(Callee: StNFunc, Args: Ops);
16326 }
16327 return true;
16328}
16329
16330bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
16331 IntrinsicInst *DI, LoadInst *LI) const {
16332 // Only deinterleave2 supported at present.
16333 if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
16334 return false;
16335
16336 // Only a factor of 2 supported at present.
16337 const unsigned Factor = 2;
16338
16339 VectorType *VTy = cast<VectorType>(Val: DI->getType()->getContainedType(i: 0));
16340 const DataLayout &DL = DI->getModule()->getDataLayout();
16341 bool UseScalable;
16342 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
16343 return false;
16344
16345 // TODO: Add support for using SVE instructions with fixed types later, using
16346 // the code from lowerInterleavedLoad to obtain the correct container type.
16347 if (UseScalable && !VTy->isScalableTy())
16348 return false;
16349
16350 unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
16351
16352 VectorType *LdTy =
16353 VectorType::get(ElementType: VTy->getElementType(),
16354 EC: VTy->getElementCount().divideCoefficientBy(RHS: NumLoads));
16355
16356 Type *PtrTy = LI->getPointerOperandType();
16357 Function *LdNFunc = getStructuredLoadFunction(M: DI->getModule(), Factor,
16358 Scalable: UseScalable, LDVTy: LdTy, PtrTy);
16359
16360 IRBuilder<> Builder(LI);
16361
16362 Value *Pred = nullptr;
16363 if (UseScalable)
16364 Pred =
16365 Builder.CreateVectorSplat(EC: LdTy->getElementCount(), V: Builder.getTrue());
16366
16367 Value *BaseAddr = LI->getPointerOperand();
16368 Value *Result;
16369 if (NumLoads > 1) {
16370 Value *Left = PoisonValue::get(T: VTy);
16371 Value *Right = PoisonValue::get(T: VTy);
16372
16373 for (unsigned I = 0; I < NumLoads; ++I) {
16374 Value *Offset = Builder.getInt64(C: I * Factor);
16375
16376 Value *Address = Builder.CreateGEP(Ty: LdTy, Ptr: BaseAddr, IdxList: {Offset});
16377 Value *LdN = nullptr;
16378 if (UseScalable)
16379 LdN = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, Address}, Name: "ldN");
16380 else
16381 LdN = Builder.CreateCall(Callee: LdNFunc, Args: Address, Name: "ldN");
16382
16383 Value *Idx =
16384 Builder.getInt64(C: I * LdTy->getElementCount().getKnownMinValue());
16385 Left = Builder.CreateInsertVector(
16386 DstType: VTy, SrcVec: Left, SubVec: Builder.CreateExtractValue(Agg: LdN, Idxs: 0), Idx);
16387 Right = Builder.CreateInsertVector(
16388 DstType: VTy, SrcVec: Right, SubVec: Builder.CreateExtractValue(Agg: LdN, Idxs: 1), Idx);
16389 }
16390
16391 Result = PoisonValue::get(T: DI->getType());
16392 Result = Builder.CreateInsertValue(Agg: Result, Val: Left, Idxs: 0);
16393 Result = Builder.CreateInsertValue(Agg: Result, Val: Right, Idxs: 1);
16394 } else {
16395 if (UseScalable)
16396 Result = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, BaseAddr}, Name: "ldN");
16397 else
16398 Result = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN");
16399 }
16400
16401 DI->replaceAllUsesWith(V: Result);
16402 return true;
16403}
16404
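// A rough sketch of the matching store-side transformation (intrinsic names
// and exact types are illustrative): the pair
//   %il = call <vscale x 8 x i32>
//       @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a,
//                                                     <vscale x 4 x i32> %b)
//   store <vscale x 8 x i32> %il, ptr %p
// becomes a single structured store along the lines of
//   call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %a,
//       <vscale x 4 x i32> %b, <vscale x 4 x i1> %pg, ptr %p)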
16405bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
16406 IntrinsicInst *II, StoreInst *SI) const {
16407 // Only interleave2 supported at present.
16408 if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
16409 return false;
16410
16411 // Only a factor of 2 supported at present.
16412 const unsigned Factor = 2;
16413
16414 VectorType *VTy = cast<VectorType>(Val: II->getOperand(i_nocapture: 0)->getType());
16415 const DataLayout &DL = II->getModule()->getDataLayout();
16416 bool UseScalable;
16417 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
16418 return false;
16419
16420 // TODO: Add support for using SVE instructions with fixed types later, using
16421 // the code from lowerInterleavedStore to obtain the correct container type.
16422 if (UseScalable && !VTy->isScalableTy())
16423 return false;
16424
16425 unsigned NumStores = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
16426
16427 VectorType *StTy =
16428 VectorType::get(ElementType: VTy->getElementType(),
16429 EC: VTy->getElementCount().divideCoefficientBy(RHS: NumStores));
16430
16431 Type *PtrTy = SI->getPointerOperandType();
16432 Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor,
16433 Scalable: UseScalable, STVTy: StTy, PtrTy);
16434
16435 IRBuilder<> Builder(SI);
16436
16437 Value *BaseAddr = SI->getPointerOperand();
16438 Value *Pred = nullptr;
16439
16440 if (UseScalable)
16441 Pred =
16442 Builder.CreateVectorSplat(EC: StTy->getElementCount(), V: Builder.getTrue());
16443
16444 Value *L = II->getOperand(i_nocapture: 0);
16445 Value *R = II->getOperand(i_nocapture: 1);
16446
16447 for (unsigned I = 0; I < NumStores; ++I) {
16448 Value *Address = BaseAddr;
16449 if (NumStores > 1) {
16450 Value *Offset = Builder.getInt64(C: I * Factor);
16451 Address = Builder.CreateGEP(Ty: StTy, Ptr: BaseAddr, IdxList: {Offset});
16452
16453 Value *Idx =
16454 Builder.getInt64(C: I * StTy->getElementCount().getKnownMinValue());
16455 L = Builder.CreateExtractVector(DstType: StTy, SrcVec: II->getOperand(i_nocapture: 0), Idx);
16456 R = Builder.CreateExtractVector(DstType: StTy, SrcVec: II->getOperand(i_nocapture: 1), Idx);
16457 }
16458
16459 if (UseScalable)
16460 Builder.CreateCall(Callee: StNFunc, Args: {L, R, Pred, Address});
16461 else
16462 Builder.CreateCall(Callee: StNFunc, Args: {L, R, Address});
16463 }
16464
16465 return true;
16466}
16467
16468EVT AArch64TargetLowering::getOptimalMemOpType(
16469 const MemOp &Op, const AttributeList &FuncAttributes) const {
16470 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16471 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16472 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
  // it would take one instruction to materialize the v2i64 zero plus one
  // store (with a restrictive addressing mode), so plain i64 stores are
  // preferable.
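  // For example, a 16-byte memset is lowered with plain XZR stores, while an
  // aligned 32-byte (or larger) memset can profitably use q-register stores.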
16476 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16477 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16478 if (Op.isAligned(AlignCheck))
16479 return true;
16480 unsigned Fast;
16481 return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1),
16482 Flags: MachineMemOperand::MONone, Fast: &Fast) &&
16483 Fast;
16484 };
16485
16486 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16487 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16488 return MVT::v16i8;
16489 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16490 return MVT::f128;
16491 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16492 return MVT::i64;
16493 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16494 return MVT::i32;
16495 return MVT::Other;
16496}
16497
16498LLT AArch64TargetLowering::getOptimalMemOpLLT(
16499 const MemOp &Op, const AttributeList &FuncAttributes) const {
16500 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16501 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16502 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
  // it would take one instruction to materialize the v2i64 zero plus one
  // store (with a restrictive addressing mode), so plain i64 stores are
  // preferable.
16506 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16507 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16508 if (Op.isAligned(AlignCheck))
16509 return true;
16510 unsigned Fast;
16511 return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1),
16512 Flags: MachineMemOperand::MONone, Fast: &Fast) &&
16513 Fast;
16514 };
16515
16516 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16517 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16518 return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
16519 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16520 return LLT::scalar(SizeInBits: 128);
16521 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16522 return LLT::scalar(SizeInBits: 64);
16523 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16524 return LLT::scalar(SizeInBits: 32);
16525 return LLT();
16526}
16527
16528// 12-bit optionally shifted immediates are legal for adds.
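// For example, 0xfff and 0x555000 are legal (the latter is 0x555 shifted left
// by 12), while 0x555001 is not, since it has non-zero bits in both the low
// and the high 12-bit halves.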
16529bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
16530 if (Immed == std::numeric_limits<int64_t>::min()) {
16531 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16532 << ": avoid UB for INT64_MIN\n");
16533 return false;
16534 }
16535 // Same encoding for add/sub, just flip the sign.
16536 Immed = std::abs(i: Immed);
16537 bool IsLegal = ((Immed >> 12) == 0 ||
16538 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16539 LLVM_DEBUG(dbgs() << "Is " << Immed
16540 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16541 return IsLegal;
16542}
16543
16544bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
16545 // We will only emit addvl/inc* instructions for SVE2
16546 if (!Subtarget->hasSVE2())
16547 return false;
16548
  // addvl's immediates are in terms of the number of bytes in a register.
  // Since there are 16 bytes in the base supported size (128 bits), we need
  // to divide the immediate by that much to give us a useful immediate to
  // multiply by vscale. We can't have a remainder as a result of this.
16553 if (Imm % 16 == 0)
16554 return isInt<6>(x: Imm / 16);
16555
16556 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
16557 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
16558 // of addvl as a result, so only take h|w|d into account.
16559 // Dec[h|w|d] will cover subtractions.
16560 // Immediates are in the range [1,16], so we can't do a 2's complement check.
16561 // FIXME: Can we make use of other patterns to cover other immediates?
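  // For example, Imm = 64 can be handled with addvl #4 (64 / 16), while
  // Imm = 40 is not a multiple of 16 but fits inch with a multiplier of 5
  // (40 / 8).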
16562
16563 // inch|dech
16564 if (Imm % 8 == 0)
16565 return std::labs(x: Imm / 8) <= 16;
16566 // incw|decw
16567 if (Imm % 4 == 0)
16568 return std::labs(x: Imm / 4) <= 16;
16569 // incd|decd
16570 if (Imm % 2 == 0)
16571 return std::labs(x: Imm / 2) <= 16;
16572
16573 return false;
16574}
16575
16576// Return false to prevent folding
16577// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16578// if the folding leads to worse code.
16579bool AArch64TargetLowering::isMulAddWithConstProfitable(
16580 SDValue AddNode, SDValue ConstNode) const {
16581 // Let the DAGCombiner decide for vector types and large types.
16582 const EVT VT = AddNode.getValueType();
16583 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16584 return true;
16585
  // The fold is worse if c1 is a legal add immediate while c1*c2 is not and
  // has to be materialized with at least two instructions.
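  // For example, with c1 = 1 and c2 = 0x10001, c1 is a legal add immediate
  // but c1*c2 = 0x10001 is not and needs a MOVZ+MOVK pair to materialize, so
  // the fold is rejected below.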
16588 const ConstantSDNode *C1Node = cast<ConstantSDNode>(Val: AddNode.getOperand(i: 1));
16589 const ConstantSDNode *C2Node = cast<ConstantSDNode>(Val&: ConstNode);
16590 const int64_t C1 = C1Node->getSExtValue();
16591 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16592 if (!isLegalAddImmediate(Immed: C1) || isLegalAddImmediate(Immed: C1C2.getSExtValue()))
16593 return true;
16594 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
16595 // Adapt to the width of a register.
16596 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
16597 AArch64_IMM::expandMOVImm(Imm: C1C2.getZExtValue(), BitSize, Insn);
16598 if (Insn.size() > 1)
16599 return false;
16600
16601 // Default to true and let the DAGCombiner decide.
16602 return true;
16603}
16604
16605// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16606// immediates is the same as for an add or a sub.
16607bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
16608 return isLegalAddImmediate(Immed);
16609}
16610
16611/// isLegalAddressingMode - Return true if the addressing mode represented
16612/// by AM is legal for this target, for a load/store of the specified type.
16613bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
16614 const AddrMode &AMode, Type *Ty,
16615 unsigned AS, Instruction *I) const {
16616 // AArch64 has five basic addressing modes:
16617 // reg
16618 // reg + 9-bit signed offset
16619 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
16620 // reg1 + reg2
16621 // reg + SIZE_IN_BYTES * reg
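  // Illustrative instruction forms for these modes (for a 64-bit load):
  //   ldr  x0, [x1]               // reg
  //   ldur x0, [x1, #-17]         // reg + 9-bit signed offset
  //   ldr  x0, [x1, #4088]        // reg + 8 * 12-bit unsigned offset
  //   ldr  x0, [x1, x2]           // reg1 + reg2
  //   ldr  x0, [x1, x2, lsl #3]   // reg + 8 * reg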
16622
16623 // No global is ever allowed as a base.
16624 if (AMode.BaseGV)
16625 return false;
16626
16627 // No reg+reg+imm addressing.
16628 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16629 return false;
16630
16631 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16632 // `2*ScaledReg` into `BaseReg + ScaledReg`
16633 AddrMode AM = AMode;
16634 if (AM.Scale && !AM.HasBaseReg) {
16635 if (AM.Scale == 1) {
16636 AM.HasBaseReg = true;
16637 AM.Scale = 0;
16638 } else if (AM.Scale == 2) {
16639 AM.HasBaseReg = true;
16640 AM.Scale = 1;
16641 } else {
16642 return false;
16643 }
16644 }
16645
16646 // A base register is required in all addressing modes.
16647 if (!AM.HasBaseReg)
16648 return false;
16649
16650 if (Ty->isScalableTy()) {
16651 if (isa<ScalableVectorType>(Val: Ty)) {
16652 // See if we have a foldable vscale-based offset, for vector types which
16653 // are either legal or smaller than the minimum; more work will be
16654 // required if we need to consider addressing for types which need
16655 // legalization by splitting.
16656 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
16657 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
16658 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
16659 isPowerOf2_64(Value: VecNumBytes))
16660 return isInt<4>(x: AM.ScalableOffset / (int64_t)VecNumBytes);
16661
16662 uint64_t VecElemNumBytes =
16663 DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: Ty)->getElementType()) / 8;
16664 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
16665 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16666 }
16667
16668 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
16669 }
16670
16671 // No scalable offsets allowed for non-scalable types.
16672 if (AM.ScalableOffset)
16673 return false;
16674
  // Check the reg + imm case:
  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16677 uint64_t NumBytes = 0;
16678 if (Ty->isSized()) {
16679 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16680 NumBytes = NumBits / 8;
16681 if (!isPowerOf2_64(Value: NumBits))
16682 NumBytes = 0;
16683 }
16684
16685 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, Offset: AM.BaseOffs,
16686 Scale: AM.Scale);
16687}
16688
// Check whether the two offsets belong to the same imm24 range and share the
// same high 12 bits; if so, the common high part can be materialized with a
// single add immediate and rebased out of the individual offsets.
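// For example, offsets 0x1008 and 0x1f40 share the high part 0x1000, which is
// a legal add immediate, so 0x1000 is returned and the accesses can then use
// small immediate offsets from the rebased pointer.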
16691int64_t
16692AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
16693 int64_t MaxOffset) const {
16694 int64_t HighPart = MinOffset & ~0xfffULL;
16695 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(Immed: HighPart)) {
16696 // Rebase the value to an integer multiple of imm12.
16697 return HighPart;
16698 }
16699
16700 return 0;
16701}
16702
16703bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
16704 // Consider splitting large offset of struct or array.
16705 return true;
16706}
16707
16708bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
16709 const MachineFunction &MF, EVT VT) const {
16710 VT = VT.getScalarType();
16711
16712 if (!VT.isSimple())
16713 return false;
16714
16715 switch (VT.getSimpleVT().SimpleTy) {
16716 case MVT::f16:
16717 return Subtarget->hasFullFP16();
16718 case MVT::f32:
16719 case MVT::f64:
16720 return true;
16721 default:
16722 break;
16723 }
16724
16725 return false;
16726}
16727
16728bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
16729 Type *Ty) const {
16730 switch (Ty->getScalarType()->getTypeID()) {
16731 case Type::FloatTyID:
16732 case Type::DoubleTyID:
16733 return true;
16734 default:
16735 return false;
16736 }
16737}
16738
16739bool AArch64TargetLowering::generateFMAsInMachineCombiner(
16740 EVT VT, CodeGenOptLevel OptLevel) const {
16741 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16742 !useSVEForFixedLengthVectorVT(VT);
16743}
16744
16745const MCPhysReg *
16746AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
16747 // LR is a callee-save register, but we must treat it as clobbered by any call
16748 // site. Hence we include LR in the scratch registers, which are in turn added
16749 // as implicit-defs for stackmaps and patchpoints.
16750 static const MCPhysReg ScratchRegs[] = {
16751 AArch64::X16, AArch64::X17, AArch64::LR, 0
16752 };
16753 return ScratchRegs;
16754}
16755
16756ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
16757 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16758 return RCRegs;
16759}
16760
16761bool
16762AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
16763 CombineLevel Level) const {
16764 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16765 N->getOpcode() == ISD::SRL) &&
16766 "Expected shift op");
16767
16768 SDValue ShiftLHS = N->getOperand(Num: 0);
16769 EVT VT = N->getValueType(ResNo: 0);
16770
16771 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16772 // combine it with shift 'N' to let it be lowered to UBFX except:
16773 // ((x >> C) & mask) << C.
16774 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16775 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16776 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(i: 1);
16777 if (isMask_64(Value: TruncMask)) {
16778 SDValue AndLHS = ShiftLHS.getOperand(i: 0);
16779 if (AndLHS.getOpcode() == ISD::SRL) {
16780 if (auto *SRLC = dyn_cast<ConstantSDNode>(Val: AndLHS.getOperand(i: 1))) {
16781 if (N->getOpcode() == ISD::SHL)
16782 if (auto *SHLC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)))
16783 return SRLC->getZExtValue() == SHLC->getZExtValue();
16784 return false;
16785 }
16786 }
16787 }
16788 }
16789 return true;
16790}
16791
16792bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
16793 const SDNode *N) const {
16794 assert(N->getOpcode() == ISD::XOR &&
16795 (N->getOperand(0).getOpcode() == ISD::SHL ||
16796 N->getOperand(0).getOpcode() == ISD::SRL) &&
16797 "Expected XOR(SHIFT) pattern");
16798
16799 // Only commute if the entire NOT mask is a hidden shifted mask.
16800 auto *XorC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
16801 auto *ShiftC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1));
16802 if (XorC && ShiftC) {
16803 unsigned MaskIdx, MaskLen;
16804 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16805 unsigned ShiftAmt = ShiftC->getZExtValue();
16806 unsigned BitWidth = N->getValueType(ResNo: 0).getScalarSizeInBits();
16807 if (N->getOperand(Num: 0).getOpcode() == ISD::SHL)
16808 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16809 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16810 }
16811 }
16812
16813 return false;
16814}
16815
16816bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
16817 const SDNode *N, CombineLevel Level) const {
16818 assert(((N->getOpcode() == ISD::SHL &&
16819 N->getOperand(0).getOpcode() == ISD::SRL) ||
16820 (N->getOpcode() == ISD::SRL &&
16821 N->getOperand(0).getOpcode() == ISD::SHL)) &&
16822 "Expected shift-shift mask");
16823 // Don't allow multiuse shift folding with the same shift amount.
16824 if (!N->getOperand(Num: 0)->hasOneUse())
16825 return false;
16826
16827 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
16828 EVT VT = N->getValueType(ResNo: 0);
16829 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16830 auto *C1 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1));
16831 auto *C2 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
16832 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16833 }
16834
16835 return true;
16836}
16837
16838bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
16839 unsigned BinOpcode, EVT VT) const {
16840 return VT.isScalableVector() && isTypeLegal(VT);
16841}
16842
16843bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
16844 Type *Ty) const {
16845 assert(Ty->isIntegerTy());
16846
16847 unsigned BitSize = Ty->getPrimitiveSizeInBits();
16848 if (BitSize == 0)
16849 return false;
16850
16851 int64_t Val = Imm.getSExtValue();
16852 if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: BitSize))
16853 return true;
16854
16855 if ((int64_t)Val < 0)
16856 Val = ~Val;
16857 if (BitSize == 32)
16858 Val &= (1LL << 32) - 1;
16859
16860 unsigned Shift = llvm::Log2_64(Value: (uint64_t)Val) / 16;
16861 // MOVZ is free so return true for one or fewer MOVK.
16862 return Shift < 3;
16863}
16864
16865bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
16866 unsigned Index) const {
16867 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
16868 return false;
16869
16870 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16871}
16872
16873/// Turn vector tests of the signbit in the form of:
16874/// xor (sra X, elt_size(X)-1), -1
16875/// into:
16876/// cmge X, X, #0
16877static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
16878 const AArch64Subtarget *Subtarget) {
16879 EVT VT = N->getValueType(ResNo: 0);
16880 if (!Subtarget->hasNEON() || !VT.isVector())
16881 return SDValue();
16882
16883 // There must be a shift right algebraic before the xor, and the xor must be a
16884 // 'not' operation.
16885 SDValue Shift = N->getOperand(Num: 0);
16886 SDValue Ones = N->getOperand(Num: 1);
16887 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
16888 !ISD::isBuildVectorAllOnes(N: Ones.getNode()))
16889 return SDValue();
16890
16891 // The shift should be smearing the sign bit across each vector element.
16892 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
16893 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
16894 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
16895 return SDValue();
16896
16897 return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: SDLoc(N), VT, Operand: Shift.getOperand(i: 0));
16898}
16899
// Given a vecreduce_add node, detect the below pattern and convert it to the
// node sequence with UABDL, [S|U]ABD and UADDLP.
16902//
16903// i32 vecreduce_add(
16904// v16i32 abs(
16905// v16i32 sub(
16906// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
16907// =================>
16908// i32 vecreduce_add(
16909// v4i32 UADDLP(
16910// v8i16 add(
16911// v8i16 zext(
16912// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
16913// v8i16 zext(
16914// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
16915static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
16916 SelectionDAG &DAG) {
16917 // Assumed i32 vecreduce_add
16918 if (N->getValueType(0) != MVT::i32)
16919 return SDValue();
16920
16921 SDValue VecReduceOp0 = N->getOperand(Num: 0);
16922 unsigned Opcode = VecReduceOp0.getOpcode();
16923 // Assumed v16i32 abs
16924 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
16925 return SDValue();
16926
16927 SDValue ABS = VecReduceOp0;
16928 // Assumed v16i32 sub
16929 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
16930 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
16931 return SDValue();
16932
16933 SDValue SUB = ABS->getOperand(Num: 0);
16934 unsigned Opcode0 = SUB->getOperand(Num: 0).getOpcode();
16935 unsigned Opcode1 = SUB->getOperand(Num: 1).getOpcode();
16936 // Assumed v16i32 type
16937 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
16938 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
16939 return SDValue();
16940
16941 // Assumed zext or sext
16942 bool IsZExt = false;
16943 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
16944 IsZExt = true;
16945 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
16946 IsZExt = false;
16947 } else
16948 return SDValue();
16949
16950 SDValue EXT0 = SUB->getOperand(Num: 0);
16951 SDValue EXT1 = SUB->getOperand(Num: 1);
16952 // Assumed zext's operand has v16i8 type
16953 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
16954 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
16955 return SDValue();
16956
  // Pattern is detected. Let's convert it to a sequence of nodes.
16958 SDLoc DL(N);
16959
16960 // First, create the node pattern of UABD/SABD.
16961 SDValue UABDHigh8Op0 =
16962 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16963 DAG.getConstant(8, DL, MVT::i64));
16964 SDValue UABDHigh8Op1 =
16965 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16966 DAG.getConstant(8, DL, MVT::i64));
16967 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16968 UABDHigh8Op0, UABDHigh8Op1);
16969 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
16970
16971 // Second, create the node pattern of UABAL.
16972 SDValue UABDLo8Op0 =
16973 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16974 DAG.getConstant(0, DL, MVT::i64));
16975 SDValue UABDLo8Op1 =
16976 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16977 DAG.getConstant(0, DL, MVT::i64));
16978 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16979 UABDLo8Op0, UABDLo8Op1);
16980 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
16981 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
16982
16983 // Third, create the node of UADDLP.
16984 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
16985
16986 // Fourth, create the node of VECREDUCE_ADD.
16987 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
16988}
16989
16990// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
16991// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
16992// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
// If we have vectors larger than v16i8 we extract v16i8 vectors,
// follow the same steps above to get DOT instructions, concatenate them,
// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
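// For example (types illustrative), "vecreduce.add(zext v16i8 %a to v16i32)"
// becomes "vecreduce.add(UDOT(zeroinitializer, %a, splat(i8 1)))", and for
// the mul-of-extends form the second extend's operand replaces the all-ones
// splat as the third DOT operand.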
16996static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
16997 const AArch64Subtarget *ST) {
16998 if (!ST->hasDotProd())
16999 return performVecReduceAddCombineWithUADDLP(N, DAG);
17000
17001 SDValue Op0 = N->getOperand(Num: 0);
17002 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17003 Op0.getValueType().getVectorElementType() != MVT::i32)
17004 return SDValue();
17005
17006 unsigned ExtOpcode = Op0.getOpcode();
17007 SDValue A = Op0;
17008 SDValue B;
17009 if (ExtOpcode == ISD::MUL) {
17010 A = Op0.getOperand(i: 0);
17011 B = Op0.getOperand(i: 1);
17012 if (A.getOpcode() != B.getOpcode() ||
17013 A.getOperand(i: 0).getValueType() != B.getOperand(i: 0).getValueType())
17014 return SDValue();
17015 ExtOpcode = A.getOpcode();
17016 }
17017 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17018 return SDValue();
17019
17020 EVT Op0VT = A.getOperand(i: 0).getValueType();
17021 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17022 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17023 if (!IsValidElementCount || !IsValidSize)
17024 return SDValue();
17025
17026 SDLoc DL(Op0);
  // For non-MLA reductions B can be set to 1. For MLA we use the operand of
  // the extend as B.
17029 if (!B)
17030 B = DAG.getConstant(Val: 1, DL, VT: Op0VT);
17031 else
17032 B = B.getOperand(i: 0);
17033
17034 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17035 unsigned NumOfVecReduce;
17036 EVT TargetType;
17037 if (IsMultipleOf16) {
17038 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17039 TargetType = MVT::v4i32;
17040 } else {
17041 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17042 TargetType = MVT::v2i32;
17043 }
17044 auto DotOpcode =
17045 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
17046 // Handle the case where we need to generate only one Dot operation.
17047 if (NumOfVecReduce == 1) {
17048 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: TargetType);
17049 SDValue Dot = DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros,
17050 N2: A.getOperand(i: 0), N3: B);
17051 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: Dot);
17052 }
17053 // Generate Dot instructions that are multiple of 16.
17054 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17055 SmallVector<SDValue, 4> SDotVec16;
17056 unsigned I = 0;
17057 for (; I < VecReduce16Num; I += 1) {
17058 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17059 SDValue Op0 =
17060 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17061 DAG.getConstant(I * 16, DL, MVT::i64));
17062 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17063 DAG.getConstant(I * 16, DL, MVT::i64));
17064 SDValue Dot =
17065 DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Op0, N3: Op1);
17066 SDotVec16.push_back(Elt: Dot);
17067 }
17068 // Concatenate dot operations.
17069 EVT SDot16EVT =
17070 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17071 SDValue ConcatSDot16 =
17072 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: SDot16EVT, Ops: SDotVec16);
17073 SDValue VecReduceAdd16 =
17074 DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: ConcatSDot16);
17075 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17076 if (VecReduce8Num == 0)
17077 return VecReduceAdd16;
17078
17079 // Generate the remainder Dot operation that is multiple of 8.
17080 SmallVector<SDValue, 4> SDotVec8;
17081 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17082 SDValue Vec8Op0 =
17083 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17084 DAG.getConstant(I * 16, DL, MVT::i64));
17085 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17086 DAG.getConstant(I * 16, DL, MVT::i64));
17087 SDValue Dot =
17088 DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Vec8Op0, N3: Vec8Op1);
  SDValue VecReduceAdd8 =
      DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
  return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
                     VecReduceAdd8);
17093}
17094
17095// Given an (integer) vecreduce, we know the order of the inputs does not
17096// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17097// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17098// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
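// For example, with x : v16i8,
//   UADDV(add(zext(v8i8 extract_lo(x)), zext(v8i8 extract_hi(x))))
// becomes UADDV(UADDLP(x)), where UADDLP produces a v8i16.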
17099static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
17100 auto DetectAddExtract = [&](SDValue A) {
17101 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17102 // UADDLP(x) if found.
17103 assert(A.getOpcode() == ISD::ADD);
17104 EVT VT = A.getValueType();
17105 SDValue Op0 = A.getOperand(i: 0);
17106 SDValue Op1 = A.getOperand(i: 1);
    if (Op0.getOpcode() != Op1.getOpcode() ||
17108 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17109 Op0.getOpcode() != ISD::SIGN_EXTEND))
17110 return SDValue();
17111 SDValue Ext0 = Op0.getOperand(i: 0);
17112 SDValue Ext1 = Op1.getOperand(i: 0);
17113 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17114 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17115 Ext0.getOperand(i: 0) != Ext1.getOperand(i: 0))
17116 return SDValue();
    // Check that the type is twice the add types, and the extracts are from
    // the upper/lower parts of the same source.
17119 if (Ext0.getOperand(i: 0).getValueType().getVectorNumElements() !=
17120 VT.getVectorNumElements() * 2)
17121 return SDValue();
17122 if ((Ext0.getConstantOperandVal(i: 1) != 0 ||
17123 Ext1.getConstantOperandVal(i: 1) != VT.getVectorNumElements()) &&
17124 (Ext1.getConstantOperandVal(i: 1) != 0 ||
17125 Ext0.getConstantOperandVal(i: 1) != VT.getVectorNumElements()))
17126 return SDValue();
17127 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17128 : AArch64ISD::SADDLP;
17129 return DAG.getNode(Opcode, DL: SDLoc(A), VT, Operand: Ext0.getOperand(i: 0));
17130 };
17131
17132 if (SDValue R = DetectAddExtract(A))
17133 return R;
17134
17135 if (A.getOperand(i: 0).getOpcode() == ISD::ADD && A.getOperand(i: 0).hasOneUse())
17136 if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 0), DAG))
17137 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R,
17138 N2: A.getOperand(i: 1));
17139 if (A.getOperand(i: 1).getOpcode() == ISD::ADD && A.getOperand(i: 1).hasOneUse())
17140 if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 1), DAG))
17141 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R,
17142 N2: A.getOperand(i: 0));
17143 return SDValue();
17144}
17145
17146// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17147// UADDLV(concat), where the concat represents the 64-bit zext sources.
17148static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
17149 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17150 // UADDLV(concat(zext, zext)) if found.
17151 assert(A.getOpcode() == ISD::ADD);
17152 EVT VT = A.getValueType();
17153 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17154 return SDValue();
17155 SDValue Op0 = A.getOperand(i: 0);
17156 SDValue Op1 = A.getOperand(i: 1);
17157 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17158 return SDValue();
17159 SDValue Ext0 = Op0.getOperand(i: 0);
17160 SDValue Ext1 = Op1.getOperand(i: 0);
17161 EVT ExtVT0 = Ext0.getValueType();
17162 EVT ExtVT1 = Ext1.getValueType();
  // Check that the zext source VTs are the same and 64 bits in length.
17164 if (ExtVT0 != ExtVT1 ||
17165 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17166 return SDValue();
17167 // Get VT for concat of zext sources.
17168 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
17169 SDValue Concat =
17170 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(A), VT: PairVT, N1: Ext0, N2: Ext1);
17171
17172 switch (VT.getSimpleVT().SimpleTy) {
17173 case MVT::v2i64:
17174 case MVT::v4i32:
17175 return DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: SDLoc(A), VT, Operand: Concat);
17176 case MVT::v8i16: {
17177 SDValue Uaddlv =
17178 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17179 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17180 }
17181 default:
17182 llvm_unreachable("Unhandled vector type");
17183 }
17184}
17185
17186static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17187 SDValue A = N->getOperand(Num: 0);
17188 if (A.getOpcode() == ISD::ADD) {
17189 if (SDValue R = performUADDVAddCombine(A, DAG))
17190 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: R);
17191 else if (SDValue R = performUADDVZextCombine(A, DAG))
17192 return R;
17193 }
17194 return SDValue();
17195}
17196
17197static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
17198 TargetLowering::DAGCombinerInfo &DCI,
17199 const AArch64Subtarget *Subtarget) {
17200 if (DCI.isBeforeLegalizeOps())
17201 return SDValue();
17202
17203 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17204}
17205
17206SDValue
17207AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17208 SelectionDAG &DAG,
17209 SmallVectorImpl<SDNode *> &Created) const {
17210 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17211 if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr))
17212 return SDValue(N,0); // Lower SDIV as SDIV
17213
17214 EVT VT = N->getValueType(ResNo: 0);
17215
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
17218 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17219 return SDValue(N, 0);
17220
17221 // fold (sdiv X, pow2)
17222 if ((VT != MVT::i32 && VT != MVT::i64) ||
17223 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17224 return SDValue();
17225
17226 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17227}
17228
17229SDValue
17230AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17231 SelectionDAG &DAG,
17232 SmallVectorImpl<SDNode *> &Created) const {
17233 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17234 if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr))
17235 return SDValue(N, 0); // Lower SREM as SREM
17236
17237 EVT VT = N->getValueType(ResNo: 0);
17238
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
17241 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17242 return SDValue(N, 0);
17243
17244 // fold (srem X, pow2)
17245 if ((VT != MVT::i32 && VT != MVT::i64) ||
17246 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17247 return SDValue();
17248
17249 unsigned Lg2 = Divisor.countr_zero();
17250 if (Lg2 == 0)
17251 return SDValue();
17252
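  // For srem by 8, the expansion below yields roughly:
  //   negs  w1, w0
  //   and   w0, w0, #7
  //   and   w1, w1, #7
  //   csneg w0, w0, w1, mi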
17253 SDLoc DL(N);
17254 SDValue N0 = N->getOperand(Num: 0);
17255 SDValue Pow2MinusOne = DAG.getConstant(Val: (1ULL << Lg2) - 1, DL, VT);
17256 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
17257 SDValue CCVal, CSNeg;
17258 if (Lg2 == 1) {
17259 SDValue Cmp = getAArch64Cmp(LHS: N0, RHS: Zero, CC: ISD::SETGE, AArch64cc&: CCVal, DAG, dl: DL);
17260 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne);
17261 CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: And, N2: And, N3: CCVal, N4: Cmp);
17262
17263 Created.push_back(Elt: Cmp.getNode());
17264 Created.push_back(Elt: And.getNode());
17265 } else {
17266 SDValue CCVal = DAG.getConstant(Val: AArch64CC::MI, DL, VT: MVT_CC);
17267 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17268
17269 SDValue Negs = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Zero, N2: N0);
17270 SDValue AndPos = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne);
17271 SDValue AndNeg = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Negs, N2: Pow2MinusOne);
17272 CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: AndPos, N2: AndNeg, N3: CCVal,
17273 N4: Negs.getValue(R: 1));
17274
17275 Created.push_back(Elt: Negs.getNode());
17276 Created.push_back(Elt: AndPos.getNode());
17277 Created.push_back(Elt: AndNeg.getNode());
17278 }
17279
17280 return CSNeg;
17281}
17282
17283static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17284 switch(getIntrinsicID(N: S.getNode())) {
17285 default:
17286 break;
17287 case Intrinsic::aarch64_sve_cntb:
17288 return 8;
17289 case Intrinsic::aarch64_sve_cnth:
17290 return 16;
17291 case Intrinsic::aarch64_sve_cntw:
17292 return 32;
17293 case Intrinsic::aarch64_sve_cntd:
17294 return 64;
17295 }
17296 return {};
17297}
17298
17299/// Calculates what the pre-extend type is, based on the extension
17300/// operation node provided by \p Extend.
17301///
17302/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17303/// pre-extend type is pulled directly from the operand, while other extend
17304/// operations need a bit more inspection to get this information.
17305///
17306/// \param Extend The SDNode from the DAG that represents the extend operation
17307///
17308/// \returns The type representing the \p Extend source type, or \p MVT::Other
17309/// if no valid type can be determined
17310static EVT calculatePreExtendType(SDValue Extend) {
17311 switch (Extend.getOpcode()) {
17312 case ISD::SIGN_EXTEND:
17313 case ISD::ZERO_EXTEND:
17314 return Extend.getOperand(i: 0).getValueType();
17315 case ISD::AssertSext:
17316 case ISD::AssertZext:
17317 case ISD::SIGN_EXTEND_INREG: {
17318 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Val: Extend.getOperand(i: 1));
17319 if (!TypeNode)
17320 return MVT::Other;
17321 return TypeNode->getVT();
17322 }
17323 case ISD::AND: {
17324 ConstantSDNode *Constant =
17325 dyn_cast<ConstantSDNode>(Val: Extend.getOperand(i: 1).getNode());
17326 if (!Constant)
17327 return MVT::Other;
17328
17329 uint32_t Mask = Constant->getZExtValue();
17330
17331 if (Mask == UCHAR_MAX)
17332 return MVT::i8;
17333 else if (Mask == USHRT_MAX)
17334 return MVT::i16;
17335 else if (Mask == UINT_MAX)
17336 return MVT::i32;
17337
17338 return MVT::Other;
17339 }
17340 default:
17341 return MVT::Other;
17342 }
17343}
17344
17345/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17346/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17347/// SExt/ZExt rather than the scalar SExt/ZExt
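/// For example (types illustrative), a v8i16 build_vector whose elements are
/// all, roughly, of the form "sext i8 %x to i16" is rewritten as a sext of a
/// v8i8 build_vector of the original i8 values, exposing the vector-level
/// extend to later combines.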
17348static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
17349 EVT VT = BV.getValueType();
17350 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17351 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
17352 return SDValue();
17353
17354 // Use the first item in the buildvector/shuffle to get the size of the
17355 // extend, and make sure it looks valid.
17356 SDValue Extend = BV->getOperand(Num: 0);
17357 unsigned ExtendOpcode = Extend.getOpcode();
17358 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17359 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17360 ExtendOpcode == ISD::AssertSext;
17361 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17362 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17363 return SDValue();
  // Shuffle inputs are vectors; limit to SIGN_EXTEND and ZERO_EXTEND to
  // ensure calculatePreExtendType will work without issue.
17366 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17367 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17368 return SDValue();
17369
17370 // Restrict valid pre-extend data type
17371 EVT PreExtendType = calculatePreExtendType(Extend);
17372 if (PreExtendType == MVT::Other ||
17373 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17374 return SDValue();
17375
17376 // Make sure all other operands are equally extended
17377 for (SDValue Op : drop_begin(RangeOrContainer: BV->ops())) {
17378 if (Op.isUndef())
17379 continue;
17380 unsigned Opc = Op.getOpcode();
17381 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17382 Opc == ISD::AssertSext;
17383 if (OpcIsSExt != IsSExt || calculatePreExtendType(Extend: Op) != PreExtendType)
17384 return SDValue();
17385 }
17386
17387 SDValue NBV;
17388 SDLoc DL(BV);
17389 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17390 EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType);
17391 EVT PreExtendLegalType =
17392 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17393 SmallVector<SDValue, 8> NewOps;
17394 for (SDValue Op : BV->ops())
17395 NewOps.push_back(Elt: Op.isUndef() ? DAG.getUNDEF(VT: PreExtendLegalType)
17396 : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL,
17397 VT: PreExtendLegalType));
17398 NBV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: PreExtendVT, Ops: NewOps);
17399 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17400 EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType.getScalarType());
17401 NBV = DAG.getVectorShuffle(VT: PreExtendVT, dl: DL, N1: BV.getOperand(i: 0).getOperand(i: 0),
17402 N2: BV.getOperand(i: 1).isUndef()
17403 ? DAG.getUNDEF(VT: PreExtendVT)
17404 : BV.getOperand(i: 1).getOperand(i: 0),
17405 Mask: cast<ShuffleVectorSDNode>(Val&: BV)->getMask());
17406 }
17407 return DAG.getNode(Opcode: IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, Operand: NBV);
17408}
17409
17410/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17411/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17412static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
17413 // If the value type isn't a vector, none of the operands are going to be dups
17414 EVT VT = Mul->getValueType(ResNo: 0);
17415 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17416 return SDValue();
17417
17418 SDValue Op0 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 0), DAG);
17419 SDValue Op1 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 1), DAG);
17420
  // Neither operand has been changed; don't make any further changes.
17422 if (!Op0 && !Op1)
17423 return SDValue();
17424
17425 SDLoc DL(Mul);
17426 return DAG.getNode(Opcode: Mul->getOpcode(), DL, VT, N1: Op0 ? Op0 : Mul->getOperand(Num: 0),
17427 N2: Op1 ? Op1 : Mul->getOperand(Num: 1));
17428}
17429
17430// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17431// Same for other types with equivalent constants.
17432static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
17433 EVT VT = N->getValueType(ResNo: 0);
17434 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17435 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17436 return SDValue();
17437 if (N->getOperand(Num: 0).getOpcode() != ISD::AND ||
17438 N->getOperand(Num: 0).getOperand(i: 0).getOpcode() != ISD::SRL)
17439 return SDValue();
17440
17441 SDValue And = N->getOperand(Num: 0);
17442 SDValue Srl = And.getOperand(i: 0);
17443
17444 APInt V1, V2, V3;
17445 if (!ISD::isConstantSplatVector(N: N->getOperand(Num: 1).getNode(), SplatValue&: V1) ||
17446 !ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: V2) ||
17447 !ISD::isConstantSplatVector(N: Srl.getOperand(i: 1).getNode(), SplatValue&: V3))
17448 return SDValue();
17449
17450 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17451 if (!V1.isMask(numBits: HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17452 V3 != (HalfSize - 1))
17453 return SDValue();
17454
17455 EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(),
17456 VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: HalfSize),
17457 EC: VT.getVectorElementCount() * 2);
17458
17459 SDLoc DL(N);
17460 SDValue In = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: HalfVT, Operand: Srl.getOperand(i: 0));
17461 SDValue CM = DAG.getNode(Opcode: AArch64ISD::CMLTz, DL, VT: HalfVT, Operand: In);
17462 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: CM);
17463}
17464
17465static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
17466 TargetLowering::DAGCombinerInfo &DCI,
17467 const AArch64Subtarget *Subtarget) {
17468
17469 if (SDValue Ext = performMulVectorExtendCombine(Mul: N, DAG))
17470 return Ext;
17471 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
17472 return Ext;
17473
17474 if (DCI.isBeforeLegalizeOps())
17475 return SDValue();
17476
  // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y; later, in the
  // MachineCombiner pass, the add+mul will be combined into a madd.
  // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17480 SDLoc DL(N);
17481 EVT VT = N->getValueType(ResNo: 0);
17482 SDValue N0 = N->getOperand(Num: 0);
17483 SDValue N1 = N->getOperand(Num: 1);
17484 SDValue MulOper;
17485 unsigned AddSubOpc;
17486
17487 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17488 AddSubOpc = V->getOpcode();
17489 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17490 SDValue Opnd = V->getOperand(Num: 1);
17491 MulOper = V->getOperand(Num: 0);
17492 if (AddSubOpc == ISD::SUB)
17493 std::swap(a&: Opnd, b&: MulOper);
17494 if (auto C = dyn_cast<ConstantSDNode>(Val&: Opnd))
17495 return C->isOne();
17496 }
17497 return false;
17498 };
17499
17500 if (IsAddSubWith1(N0)) {
17501 SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1, N2: MulOper);
17502 return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1, N2: MulVal);
17503 }
17504
17505 if (IsAddSubWith1(N1)) {
17506 SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: N0, N2: MulOper);
17507 return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1: N0, N2: MulVal);
17508 }
17509
17510 // The below optimizations require a constant RHS.
17511 if (!isa<ConstantSDNode>(Val: N1))
17512 return SDValue();
17513
17514 ConstantSDNode *C = cast<ConstantSDNode>(Val&: N1);
17515 const APInt &ConstValue = C->getAPIntValue();
17516
  // Allow the scaling to be folded into the `cnt` instruction by preventing
  // the scaling from being obscured here. This makes it easier to pattern
  // match.
17519 if (IsSVECntIntrinsic(S: N0) ||
17520 (N0->getOpcode() == ISD::TRUNCATE &&
17521 (IsSVECntIntrinsic(S: N0->getOperand(Num: 0)))))
17522 if (ConstValue.sge(RHS: 1) && ConstValue.sle(RHS: 16))
17523 return SDValue();
17524
17525 // Multiplication of a power of two plus/minus one can be done more
17526 // cheaply as shift+add/sub. For now, this is true unilaterally. If
17527 // future CPUs have a cheaper MADD instruction, this may need to be
17528 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17529 // 64-bit is 5 cycles, so this is always a win.
17530 // More aggressively, some multiplications N0 * C can be lowered to
17531 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17532 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17533 // TODO: lower more cases.
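  // A few concrete examples of the decompositions handled below:
  //   mul x, 6  = (2+1)*2     -> (shl (add (shl x, 1), x), 1)
  //   mul x, 7  = 2^3 - 1     -> (sub (shl x, 3), x)
  //   mul x, 45 = (1+4)*(1+8) -> MV = (add (shl x, 2), x); (add (shl MV, 3), MV)
  // (the last form is only used when the subtarget has ALULSLFast).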
17534
17535 // TrailingZeroes is used to test if the mul can be lowered to
17536 // shift+add+shift.
17537 unsigned TrailingZeroes = ConstValue.countr_zero();
17538 if (TrailingZeroes) {
17539 // Conservatively do not lower to shift+add+shift if the mul might be
17540 // folded into smul or umul.
17541 if (N0->hasOneUse() && (isSignExtended(N: N0, DAG) ||
17542 isZeroExtended(N: N0, DAG)))
17543 return SDValue();
17544 // Conservatively do not lower to shift+add+shift if the mul might be
17545 // folded into madd or msub.
17546 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17547 N->use_begin()->getOpcode() == ISD::SUB))
17548 return SDValue();
17549 }
17550 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17551 // and shift+add+shift.
17552 APInt ShiftedConstValue = ConstValue.ashr(ShiftAmt: TrailingZeroes);
17553 unsigned ShiftAmt;
17554
17555 auto Shl = [&](SDValue N0, unsigned N1) {
17556 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17557 return DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N0, N2: RHS);
17558 };
17559 auto Add = [&](SDValue N0, SDValue N1) {
17560 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: N0, N2: N1);
17561 };
17562 auto Sub = [&](SDValue N0, SDValue N1) {
17563 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: N0, N2: N1);
17564 };
17565 auto Negate = [&](SDValue N) {
17566 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
17567 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: N);
17568 };
17569
  // Can the const C be decomposed into (1+2^M1)*(1+2^N1)? E.g., C = 45 is
  // equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) because the
  // (2^N - 1) factor can't be executed as a single instruction.
17573 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17574 unsigned BitWidth = C.getBitWidth();
17575 for (unsigned i = 1; i < BitWidth / 2; i++) {
17576 APInt Rem;
17577 APInt X(BitWidth, (1 << i) + 1);
17578 APInt::sdivrem(LHS: C, RHS: X, Quotient&: N, Remainder&: Rem);
17579 APInt NVMinus1 = N - 1;
17580 if (Rem == 0 && NVMinus1.isPowerOf2()) {
17581 M = X;
17582 return true;
17583 }
17584 }
17585 return false;
17586 };
17587
17588 if (ConstValue.isNonNegative()) {
17589 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17590 // (mul x, 2^N - 1) => (sub (shl x, N), x)
17591 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17592 // (mul x, (2^M + 1) * (2^N + 1))
17593 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
17594 APInt SCVMinus1 = ShiftedConstValue - 1;
17595 APInt SCVPlus1 = ShiftedConstValue + 1;
17596 APInt CVPlus1 = ConstValue + 1;
17597 APInt CVM, CVN;
17598 if (SCVMinus1.isPowerOf2()) {
17599 ShiftAmt = SCVMinus1.logBase2();
17600 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17601 } else if (CVPlus1.isPowerOf2()) {
17602 ShiftAmt = CVPlus1.logBase2();
17603 return Sub(Shl(N0, ShiftAmt), N0);
17604 } else if (SCVPlus1.isPowerOf2()) {
17605 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17606 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17607 } else if (Subtarget->hasALULSLFast() &&
17608 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17609 APInt CVMMinus1 = CVM - 1;
17610 APInt CVNMinus1 = CVN - 1;
17611 unsigned ShiftM1 = CVMMinus1.logBase2();
17612 unsigned ShiftN1 = CVNMinus1.logBase2();
      // LSLFast implies that shifts of up to 3 places are fast.
17614 if (ShiftM1 <= 3 && ShiftN1 <= 3) {
17615 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17616 return Add(Shl(MVal, ShiftN1), MVal);
17617 }
17618 }
17619 } else {
17620 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17621 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17622 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
17623 APInt SCVPlus1 = -ShiftedConstValue + 1;
17624 APInt CVNegPlus1 = -ConstValue + 1;
17625 APInt CVNegMinus1 = -ConstValue - 1;
17626 if (CVNegPlus1.isPowerOf2()) {
17627 ShiftAmt = CVNegPlus1.logBase2();
17628 return Sub(N0, Shl(N0, ShiftAmt));
17629 } else if (CVNegMinus1.isPowerOf2()) {
17630 ShiftAmt = CVNegMinus1.logBase2();
17631 return Negate(Add(Shl(N0, ShiftAmt), N0));
17632 } else if (SCVPlus1.isPowerOf2()) {
17633 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17634 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17635 }
17636 }
17637
17638 return SDValue();
17639}
17640
17641static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
17642 SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it's from a constant.
17645 //
17646 // The general transformation is:
17647 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17648 // AND(VECTOR_CMP(x,y), constant2)
17649 // constant2 = UNARYOP(constant)
17650
17651 // Early exit if this isn't a vector operation, the operand of the
17652 // unary operation isn't a bitwise AND, or if the sizes of the operations
17653 // aren't the same.
17654 EVT VT = N->getValueType(ResNo: 0);
17655 if (!VT.isVector() || N->getOperand(Num: 0)->getOpcode() != ISD::AND ||
17656 N->getOperand(Num: 0)->getOperand(Num: 0)->getOpcode() != ISD::SETCC ||
17657 VT.getSizeInBits() != N->getOperand(Num: 0)->getValueType(ResNo: 0).getSizeInBits())
17658 return SDValue();
17659
17660 // Now check that the other operand of the AND is a constant. We could
17661 // make the transformation for non-constant splats as well, but it's unclear
17662 // that would be a benefit as it would not eliminate any operations, just
17663 // perform one more step in scalar code before moving to the vector unit.
17664 if (BuildVectorSDNode *BV =
17665 dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 0)->getOperand(Num: 1))) {
17666 // Bail out if the vector isn't a constant.
17667 if (!BV->isConstant())
17668 return SDValue();
17669
17670 // Everything checks out. Build up the new and improved node.
17671 SDLoc DL(N);
17672 EVT IntVT = BV->getValueType(ResNo: 0);
17673 // Create a new constant of the appropriate type for the transformed
17674 // DAG.
17675 SDValue SourceConst = DAG.getNode(Opcode: N->getOpcode(), DL, VT, Operand: SDValue(BV, 0));
17676 // The AND node needs bitcasts to/from an integer vector type around it.
17677 SDValue MaskConst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntVT, Operand: SourceConst);
17678 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: IntVT,
17679 N1: N->getOperand(Num: 0)->getOperand(Num: 0), N2: MaskConst);
17680 SDValue Res = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewAnd);
17681 return Res;
17682 }
17683
17684 return SDValue();
17685}
17686
17687static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
17688 const AArch64Subtarget *Subtarget) {
17689 // First try to optimize away the conversion when it's conditionally from
17690 // a constant. Vectors only.
17691 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
17692 return Res;
17693
17694 EVT VT = N->getValueType(ResNo: 0);
17695 if (VT != MVT::f32 && VT != MVT::f64)
17696 return SDValue();
17697
17698 // Only optimize when the source and destination types have the same width.
17699 if (VT.getSizeInBits() != N->getOperand(Num: 0).getValueSizeInBits())
17700 return SDValue();
17701
  // If the result of an integer load is only used by an integer-to-float
  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
17705 SDValue N0 = N->getOperand(Num: 0);
17706 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N: N0.getNode()) &&
17707 N0.hasOneUse() &&
17708 // Do not change the width of a volatile load.
17709 !cast<LoadSDNode>(Val&: N0)->isVolatile()) {
17710 LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0);
17711 SDValue Load = DAG.getLoad(VT, dl: SDLoc(N), Chain: LN0->getChain(), Ptr: LN0->getBasePtr(),
17712 PtrInfo: LN0->getPointerInfo(), Alignment: LN0->getAlign(),
17713 MMOFlags: LN0->getMemOperand()->getFlags());
17714
17715 // Make sure successors of the original load stay after it by updating them
17716 // to use the new Chain.
17717 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LN0, 1), To: Load.getValue(R: 1));
17718
17719 unsigned Opcode =
17720 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
17721 return DAG.getNode(Opcode, DL: SDLoc(N), VT, Operand: Load);
17722 }
17723
17724 return SDValue();
17725}
17726
17727/// Fold a floating-point multiply by power of two into floating-point to
17728/// fixed-point conversion.
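///
/// For example (an illustrative sketch of the intent, not a guaranteed
/// lowering): (fptosi (fmul v4f32:x, splat(8.0))) can be selected as a single
/// fixed-point conversion, roughly "fcvtzs v0.4s, v0.4s, #3", because
/// multiplying by 2^3 before the conversion is equivalent to converting with
/// three fractional bits.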
17729static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
17730 TargetLowering::DAGCombinerInfo &DCI,
17731 const AArch64Subtarget *Subtarget) {
17732 if (!Subtarget->isNeonAvailable())
17733 return SDValue();
17734
17735 if (!N->getValueType(ResNo: 0).isSimple())
17736 return SDValue();
17737
17738 SDValue Op = N->getOperand(Num: 0);
17739 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17740 return SDValue();
17741
17742 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17743 return SDValue();
17744
17745 SDValue ConstVec = Op->getOperand(Num: 1);
17746 if (!isa<BuildVectorSDNode>(Val: ConstVec))
17747 return SDValue();
17748
17749 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17750 uint32_t FloatBits = FloatTy.getSizeInBits();
17751 if (FloatBits != 32 && FloatBits != 64 &&
17752 (FloatBits != 16 || !Subtarget->hasFullFP16()))
17753 return SDValue();
17754
17755 MVT IntTy = N->getSimpleValueType(ResNo: 0).getVectorElementType();
17756 uint32_t IntBits = IntTy.getSizeInBits();
17757 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17758 return SDValue();
17759
17760 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17761 if (IntBits > FloatBits)
17762 return SDValue();
17763
17764 BitVector UndefElements;
17765 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Val&: ConstVec);
17766 int32_t Bits = IntBits == 64 ? 64 : 32;
17767 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(UndefElements: &UndefElements, BitWidth: Bits + 1);
17768 if (C == -1 || C == 0 || C > Bits)
17769 return SDValue();
17770
17771 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17772 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ResTy))
17773 return SDValue();
17774
17775 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17776 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17777 EVT SatVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
17778 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17779 return SDValue();
17780 }
17781
17782 SDLoc DL(N);
17783 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17784 N->getOpcode() == ISD::FP_TO_SINT_SAT);
17785 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17786 : Intrinsic::aarch64_neon_vcvtfp2fxu;
17787 SDValue FixConv =
17788 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
17789 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17790 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17791 // We can handle smaller integers by generating an extra trunc.
17792 if (IntBits < FloatBits)
17793 FixConv = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: N->getValueType(ResNo: 0), Operand: FixConv);
17794
17795 return FixConv;
17796}
17797
17798/// Fold a floating-point divide by power of two into fixed-point to
17799/// floating-point conversion.
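///
/// For example (an illustrative sketch of the intent, not a guaranteed
/// lowering): (fdiv (sitofp v2i32:x), splat(4.0)) can be selected as a single
/// fixed-point conversion, roughly "scvtf v0.2s, v0.2s, #2", because
/// converting with two fractional bits is equivalent to converting and then
/// dividing by 2^2.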
17800static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
17801 TargetLowering::DAGCombinerInfo &DCI,
17802 const AArch64Subtarget *Subtarget) {
17803 if (!Subtarget->hasNEON())
17804 return SDValue();
17805
17806 SDValue Op = N->getOperand(Num: 0);
17807 unsigned Opc = Op->getOpcode();
17808 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17809 !Op.getOperand(i: 0).getValueType().isSimple() ||
17810 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
17811 return SDValue();
17812
17813 SDValue ConstVec = N->getOperand(Num: 1);
17814 if (!isa<BuildVectorSDNode>(Val: ConstVec))
17815 return SDValue();
17816
17817 MVT IntTy = Op.getOperand(i: 0).getSimpleValueType().getVectorElementType();
17818 int32_t IntBits = IntTy.getSizeInBits();
17819 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17820 return SDValue();
17821
17822 MVT FloatTy = N->getSimpleValueType(ResNo: 0).getVectorElementType();
17823 int32_t FloatBits = FloatTy.getSizeInBits();
17824 if (FloatBits != 32 && FloatBits != 64)
17825 return SDValue();
17826
17827 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
17828 if (IntBits > FloatBits)
17829 return SDValue();
17830
17831 BitVector UndefElements;
17832 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Val&: ConstVec);
17833 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(UndefElements: &UndefElements, BitWidth: FloatBits + 1);
17834 if (C == -1 || C == 0 || C > FloatBits)
17835 return SDValue();
17836
17837 MVT ResTy;
17838 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17839 switch (NumLanes) {
17840 default:
17841 return SDValue();
17842 case 2:
17843 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
17844 break;
17845 case 4:
17846 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
17847 break;
17848 }
17849
17850 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
17851 return SDValue();
17852
17853 SDLoc DL(N);
17854 SDValue ConvInput = Op.getOperand(i: 0);
17855 bool IsSigned = Opc == ISD::SINT_TO_FP;
17856 if (IntBits < FloatBits)
17857 ConvInput = DAG.getNode(Opcode: IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17858 VT: ResTy, Operand: ConvInput);
17859
17860 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
17861 : Intrinsic::aarch64_neon_vcvtfxu2fp;
17862 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17863 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17864 DAG.getConstant(C, DL, MVT::i32));
17865}
17866
17867static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17868 const AArch64TargetLowering &TLI) {
17869 EVT VT = N->getValueType(ResNo: 0);
17870 SelectionDAG &DAG = DCI.DAG;
17871 SDLoc DL(N);
17872 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17873
17874 if (!VT.isVector())
17875 return SDValue();
17876
17877 // The combining code works for NEON, SVE2 and SME.
17878 if (TLI.useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget.isNeonAvailable()) ||
17879 (VT.isScalableVector() && !Subtarget.hasSVE2()))
17880 return SDValue();
17881
17882 SDValue N0 = N->getOperand(Num: 0);
17883 if (N0.getOpcode() != ISD::AND)
17884 return SDValue();
17885
17886 SDValue N1 = N->getOperand(Num: 1);
17887 if (N1.getOpcode() != ISD::AND)
17888 return SDValue();
17889
17890 // InstCombine does (not (neg a)) => (add a -1).
17891 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
17892 // Loop over all combinations of AND operands.
17893 for (int i = 1; i >= 0; --i) {
17894 for (int j = 1; j >= 0; --j) {
17895 SDValue O0 = N0->getOperand(Num: i);
17896 SDValue O1 = N1->getOperand(Num: j);
17897 SDValue Sub, Add, SubSibling, AddSibling;
17898
17899 // Find a SUB and an ADD operand, one from each AND.
17900 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
17901 Sub = O0;
17902 Add = O1;
17903 SubSibling = N0->getOperand(Num: 1 - i);
17904 AddSibling = N1->getOperand(Num: 1 - j);
17905 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
17906 Add = O0;
17907 Sub = O1;
17908 AddSibling = N0->getOperand(Num: 1 - i);
17909 SubSibling = N1->getOperand(Num: 1 - j);
17910 } else
17911 continue;
17912
17913 if (!ISD::isConstantSplatVectorAllZeros(N: Sub.getOperand(i: 0).getNode()))
17914 continue;
17915
17916      // The all-ones splat is always the right-hand operand of the Add.
17917 if (!ISD::isConstantSplatVectorAllOnes(N: Add.getOperand(i: 1).getNode()))
17918 continue;
17919
17920 if (Sub.getOperand(i: 1) != Add.getOperand(i: 0))
17921 continue;
17922
17923 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Sub, N2: SubSibling, N3: AddSibling);
17924 }
17925 }
17926
17927 // (or (and a b) (and (not a) c)) => (bsl a b c)
17928 // We only have to look for constant vectors here since the general, variable
17929 // case can be handled in TableGen.
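  //
  // Illustrative example (assumed, for exposition): with complementary splat
  // masks such as
  //   (or (and a, splat(0x00ff00ff)) (and b, splat(0xff00ff00)))
  // the loop below sees that the two constants are bitwise inverses and emits
  //   (AArch64ISD::BSP splat(0x00ff00ff), a, b)
  // i.e. (a & mask) | (b & ~mask).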
17930 unsigned Bits = VT.getScalarSizeInBits();
17931 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
17932 for (int i = 1; i >= 0; --i)
17933 for (int j = 1; j >= 0; --j) {
17934 APInt Val1, Val2;
17935
17936 if (ISD::isConstantSplatVector(N: N0->getOperand(Num: i).getNode(), SplatValue&: Val1) &&
17937 ISD::isConstantSplatVector(N: N1->getOperand(Num: j).getNode(), SplatValue&: Val2) &&
17938 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
17939 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i),
17940 N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j));
17941 }
17942 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(Val: N0->getOperand(Num: i));
17943 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(Val: N1->getOperand(Num: j));
17944 if (!BVN0 || !BVN1)
17945 continue;
17946
17947 bool FoundMatch = true;
17948 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
17949 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(Val: BVN0->getOperand(Num: k));
17950 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val: BVN1->getOperand(Num: k));
17951 if (!CN0 || !CN1 ||
17952 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
17953 FoundMatch = false;
17954 break;
17955 }
17956 }
17957 if (FoundMatch)
17958 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i),
17959 N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j));
17960 }
17961
17962 return SDValue();
17963}
17964
17965// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
17966// convert to csel(ccmp(.., cc0)), depending on cc1:
17967
17968// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17969// =>
17970// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
17971//
17972// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17973// =>
17974// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
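//
// Illustrative example (assumed, for exposition only): for C source such as
//   return (x == 0) & (y > 5);
// this enables a fused sequence along the lines of
//   cmp  x0, #0
//   ccmp x1, #5, #<nzcv>, eq
//   cset w0, gt
// instead of materialising both comparison results with cset and ANDing them.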
17975static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
17976 EVT VT = N->getValueType(ResNo: 0);
17977 SDValue CSel0 = N->getOperand(Num: 0);
17978 SDValue CSel1 = N->getOperand(Num: 1);
17979
17980 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
17981 CSel1.getOpcode() != AArch64ISD::CSEL)
17982 return SDValue();
17983
17984 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
17985 return SDValue();
17986
17987 if (!isNullConstant(V: CSel0.getOperand(i: 0)) ||
17988 !isOneConstant(V: CSel0.getOperand(i: 1)) ||
17989 !isNullConstant(V: CSel1.getOperand(i: 0)) ||
17990 !isOneConstant(V: CSel1.getOperand(i: 1)))
17991 return SDValue();
17992
17993 SDValue Cmp0 = CSel0.getOperand(i: 3);
17994 SDValue Cmp1 = CSel1.getOperand(i: 3);
17995 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(i: 2);
17996 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(i: 2);
17997 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
17998 return SDValue();
17999 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18000 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18001 std::swap(a&: Cmp0, b&: Cmp1);
18002 std::swap(a&: CC0, b&: CC1);
18003 }
18004
18005 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18006 return SDValue();
18007
18008 SDLoc DL(N);
18009 SDValue CCmp, Condition;
18010 unsigned NZCV;
18011
18012 if (N->getOpcode() == ISD::AND) {
18013 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(Code: CC0);
18014 Condition = DAG.getConstant(Val: InvCC0, DL, VT: MVT_CC);
18015 NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: CC1);
18016 } else {
18017 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(Code: CC1);
18018 Condition = DAG.getConstant(Val: CC0, DL, VT: MVT_CC);
18019 NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvCC1);
18020 }
18021
18022 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18023
18024 auto *Op1 = dyn_cast<ConstantSDNode>(Val: Cmp1.getOperand(i: 1));
18025 if (Op1 && Op1->getAPIntValue().isNegative() &&
18026 Op1->getAPIntValue().sgt(RHS: -32)) {
18027    // CCMP only accepts an immediate in the range [0, 31], so if Op1 is a
18028    // constant in the range [-31, -1] we can instead select CCMN with its
18029    // absolute value and avoid the extra mov needed to materialize Op1.
18030 SDValue AbsOp1 =
18031 DAG.getConstant(Val: Op1->getAPIntValue().abs(), DL, VT: Op1->getValueType(ResNo: 0));
18032 CCmp = DAG.getNode(Opcode: AArch64ISD::CCMN, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0), N2: AbsOp1,
18033 N3: NZCVOp, N4: Condition, N5: Cmp0);
18034 } else {
18035 CCmp = DAG.getNode(Opcode: AArch64ISD::CCMP, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0),
18036 N2: Cmp1.getOperand(i: 1), N3: NZCVOp, N4: Condition, N5: Cmp0);
18037 }
18038 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18039 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18040 CCmp);
18041}
18042
18043static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18044 const AArch64Subtarget *Subtarget,
18045 const AArch64TargetLowering &TLI) {
18046 SelectionDAG &DAG = DCI.DAG;
18047 EVT VT = N->getValueType(ResNo: 0);
18048
18049 if (SDValue R = performANDORCSELCombine(N, DAG))
18050 return R;
18051
18052 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18053 return SDValue();
18054
18055 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18056 return Res;
18057
18058 return SDValue();
18059}
18060
18061static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
18062 if (!MemVT.getVectorElementType().isSimple())
18063 return false;
18064
18065 uint64_t MaskForTy = 0ull;
18066 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18067 case MVT::i8:
18068 MaskForTy = 0xffull;
18069 break;
18070 case MVT::i16:
18071 MaskForTy = 0xffffull;
18072 break;
18073 case MVT::i32:
18074 MaskForTy = 0xffffffffull;
18075 break;
18076  default:
18077    return false;
18079 }
18080
18081 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18082 if (auto *Op0 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0)))
18083 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18084
18085 return false;
18086}
18087
18088static SDValue performReinterpretCastCombine(SDNode *N) {
18089 SDValue LeafOp = SDValue(N, 0);
18090 SDValue Op = N->getOperand(Num: 0);
18091 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18092 LeafOp.getValueType() != Op.getValueType())
18093 Op = Op->getOperand(Num: 0);
18094 if (LeafOp.getValueType() == Op.getValueType())
18095 return Op;
18096 return SDValue();
18097}
18098
18099static SDValue performSVEAndCombine(SDNode *N,
18100 TargetLowering::DAGCombinerInfo &DCI) {
18101 SelectionDAG &DAG = DCI.DAG;
18102 SDValue Src = N->getOperand(Num: 0);
18103 unsigned Opc = Src->getOpcode();
18104
18105 // Zero/any extend of an unsigned unpack
18106 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18107 SDValue UnpkOp = Src->getOperand(Num: 0);
18108 SDValue Dup = N->getOperand(Num: 1);
18109
18110 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18111 return SDValue();
18112
18113 SDLoc DL(N);
18114 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Dup->getOperand(Num: 0));
18115 if (!C)
18116 return SDValue();
18117
18118 uint64_t ExtVal = C->getZExtValue();
18119
18120 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18121 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18122 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18123 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18124 };
18125
18126 // If the mask is fully covered by the unpack, we don't need to push
18127 // a new AND onto the operand
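    //
    // For example (an illustrative case): in
    //   (and (uunpklo nxv16i8:x), (splat_vector 0xFF))
    // the unpack already zero-extends each i8 lane to i16, so the 0xFF mask is
    // redundant and we can return the unpack directly.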
18128 EVT EltTy = UnpkOp->getValueType(ResNo: 0).getVectorElementType();
18129 if (MaskAndTypeMatch(EltTy))
18130 return Src;
18131
18132 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18133 // to see if the mask is all-ones of size MemTy.
18134 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(Val&: UnpkOp);
18135 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18136 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18137 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18138 if (MaskAndTypeMatch(EltTy))
18139 return Src;
18140 }
18141
18142    // Truncate to prevent a DUP with an over-wide constant
18143 APInt Mask = C->getAPIntValue().trunc(width: EltTy.getSizeInBits());
18144
18145 // Otherwise, make sure we propagate the AND to the operand
18146 // of the unpack
18147 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18148 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18149
18150 SDValue And = DAG.getNode(Opcode: ISD::AND, DL,
18151 VT: UnpkOp->getValueType(ResNo: 0), N1: UnpkOp, N2: Dup);
18152
18153 return DAG.getNode(Opcode: Opc, DL, VT: N->getValueType(ResNo: 0), Operand: And);
18154 }
18155
18156 if (DCI.isBeforeLegalizeOps())
18157 return SDValue();
18158
18159  // If either operand of the AND is an all-active predicate, the AND is a
18160  // no-op and we can simply return the other operand.
18161 if (isAllActivePredicate(DAG, N: N->getOperand(Num: 0)))
18162 return N->getOperand(Num: 1);
18163 if (isAllActivePredicate(DAG, N: N->getOperand(Num: 1)))
18164 return N->getOperand(Num: 0);
18165
18166 if (!EnableCombineMGatherIntrinsics)
18167 return SDValue();
18168
18169 SDValue Mask = N->getOperand(Num: 1);
18170
18171 if (!Src.hasOneUse())
18172 return SDValue();
18173
18174 EVT MemVT;
18175
18176 // SVE load instructions perform an implicit zero-extend, which makes them
18177 // perfect candidates for combining.
18178 switch (Opc) {
18179 case AArch64ISD::LD1_MERGE_ZERO:
18180 case AArch64ISD::LDNF1_MERGE_ZERO:
18181 case AArch64ISD::LDFF1_MERGE_ZERO:
18182 MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 3))->getVT();
18183 break;
18184 case AArch64ISD::GLD1_MERGE_ZERO:
18185 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
18186 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
18187 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
18188 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
18189 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
18190 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
18191 case AArch64ISD::GLDFF1_MERGE_ZERO:
18192 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
18193 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
18194 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
18195 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
18196 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
18197 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
18198 case AArch64ISD::GLDNT1_MERGE_ZERO:
18199 MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 4))->getVT();
18200 break;
18201 default:
18202 return SDValue();
18203 }
18204
18205 if (isConstantSplatVectorMaskForType(N: Mask.getNode(), MemVT))
18206 return Src;
18207
18208 return SDValue();
18209}
18210
18211// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
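//
// Illustrative example (assumed, for exposition only): for code such as
//   (a < b) && (c < d)      // a, b, c, d are floats
// this allows a sequence along the lines of
//   fcmp  s0, s1
//   fccmp s2, s3, #<nzcv>, mi
//   cset  w0, mi
// rather than computing and ANDing two separate boolean results.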
18212static SDValue performANDSETCCCombine(SDNode *N,
18213 TargetLowering::DAGCombinerInfo &DCI) {
18214
18215 // This function performs an optimization on a specific pattern involving
18216 // an AND operation and SETCC (Set Condition Code) node.
18217
18218 SDValue SetCC = N->getOperand(Num: 0);
18219 EVT VT = N->getValueType(ResNo: 0);
18220 SelectionDAG &DAG = DCI.DAG;
18221
18222  // If the current node (N) is used by any SELECT instruction, bail out and
18223  // return an empty SDValue; this avoids applying the optimization where it
18224  // could produce incorrect results.
18225 for (auto U : N->uses())
18226 if (U->getOpcode() == ISD::SELECT)
18227 return SDValue();
18228
18229 // Check if the operand is a SETCC node with floating-point comparison
18230 if (SetCC.getOpcode() == ISD::SETCC &&
18231 SetCC.getOperand(0).getValueType() == MVT::f32) {
18232
18233 SDValue Cmp;
18234 AArch64CC::CondCode CC;
18235
18236 // Check if the DAG is after legalization and if we can emit the conjunction
18237 if (!DCI.isBeforeLegalize() &&
18238 (Cmp = emitConjunction(DAG, Val: SDValue(N, 0), OutCC&: CC))) {
18239
18240 AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(Code: CC);
18241
18242 SDLoc DL(N);
18243 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18244 DAG.getConstant(0, DL, VT),
18245 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18246 }
18247 }
18248 return SDValue();
18249}
18250
18251static SDValue performANDCombine(SDNode *N,
18252 TargetLowering::DAGCombinerInfo &DCI) {
18253 SelectionDAG &DAG = DCI.DAG;
18254 SDValue LHS = N->getOperand(Num: 0);
18255 SDValue RHS = N->getOperand(Num: 1);
18256 EVT VT = N->getValueType(ResNo: 0);
18257
18258 if (SDValue R = performANDORCSELCombine(N, DAG))
18259 return R;
18260
18261  if (SDValue R = performANDSETCCCombine(N, DCI))
18262 return R;
18263
18264 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18265 return SDValue();
18266
18267 if (VT.isScalableVector())
18268 return performSVEAndCombine(N, DCI);
18269
18270 // The combining code below works only for NEON vectors. In particular, it
18271 // does not work for SVE when dealing with vectors wider than 128 bits.
18272 if (!VT.is64BitVector() && !VT.is128BitVector())
18273 return SDValue();
18274
18275 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode());
18276 if (!BVN)
18277 return SDValue();
18278
18279 // AND does not accept an immediate, so check if we can use a BIC immediate
18280 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18281 // pattern in isel, because some immediates may be lowered to the preferred
18282 // (and x, (movi imm)) form, even though an mvni representation also exists.
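  //
  // For example (an illustrative sketch): (and v4i32:x, splat(0xffffff00))
  // only clears the low byte of each lane, so it can be selected as
  // "bic v0.4s, #0xff" rather than materialising the 0xffffff00 mask.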
18283 APInt DefBits(VT.getSizeInBits(), 0);
18284 APInt UndefBits(VT.getSizeInBits(), 0);
18285 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
18286 SDValue NewOp;
18287
18288 // Any bits known to already be 0 need not be cleared again, which can help
18289 // reduce the size of the immediate to one supported by the instruction.
18290 KnownBits Known = DAG.computeKnownBits(Op: LHS);
18291 APInt ZeroSplat(VT.getSizeInBits(), 0);
18292 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18293 ZeroSplat |= Known.Zero.zext(width: VT.getSizeInBits())
18294 << (Known.Zero.getBitWidth() * I);
18295
18296 DefBits = ~(DefBits | ZeroSplat);
18297 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
18298 Bits: DefBits, LHS: &LHS)) ||
18299 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
18300 Bits: DefBits, LHS: &LHS)))
18301 return NewOp;
18302
18303 UndefBits = ~(UndefBits | ZeroSplat);
18304 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
18305 Bits: UndefBits, LHS: &LHS)) ||
18306 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
18307 Bits: UndefBits, LHS: &LHS)))
18308 return NewOp;
18309 }
18310
18311 return SDValue();
18312}
18313
18314static SDValue performFADDCombine(SDNode *N,
18315 TargetLowering::DAGCombinerInfo &DCI) {
18316 SelectionDAG &DAG = DCI.DAG;
18317 SDValue LHS = N->getOperand(Num: 0);
18318 SDValue RHS = N->getOperand(Num: 1);
18319 EVT VT = N->getValueType(ResNo: 0);
18320 SDLoc DL(N);
18321
18322 if (!N->getFlags().hasAllowReassociation())
18323 return SDValue();
18324
18325  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
18326 auto ReassocComplex = [&](SDValue A, SDValue B) {
18327 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18328 return SDValue();
18329 unsigned Opc = A.getConstantOperandVal(i: 0);
18330 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18331 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18332 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18333 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18334 return SDValue();
18335 SDValue VCMLA = DAG.getNode(
18336 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: A.getOperand(i: 0),
18337 N2: DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: A.getOperand(i: 1), N2: B, Flags: N->getFlags()),
18338 N3: A.getOperand(i: 2), N4: A.getOperand(i: 3));
18339 VCMLA->setFlags(A->getFlags());
18340 return VCMLA;
18341 };
18342 if (SDValue R = ReassocComplex(LHS, RHS))
18343 return R;
18344 if (SDValue R = ReassocComplex(RHS, LHS))
18345 return R;
18346
18347 return SDValue();
18348}
18349
18350static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18351 switch (Opcode) {
18352 case ISD::STRICT_FADD:
18353 case ISD::FADD:
18354 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18355 case ISD::ADD:
18356 return VT == MVT::i64;
18357 default:
18358 return false;
18359 }
18360}
18361
18362static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18363 AArch64CC::CondCode Cond);
18364
18365static bool isPredicateCCSettingOp(SDValue N) {
18366 if ((N.getOpcode() == ISD::SETCC) ||
18367 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18368 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18369 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18370 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18371 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18372 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18373 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18374 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18375 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18376 // get_active_lane_mask is lowered to a whilelo instruction.
18377 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18378 return true;
18379
18380 return false;
18381}
18382
18383// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18384// ... into: "ptrue p, all" + PTEST
18385static SDValue
18386performFirstTrueTestVectorCombine(SDNode *N,
18387 TargetLowering::DAGCombinerInfo &DCI,
18388 const AArch64Subtarget *Subtarget) {
18389 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18390 // Make sure PTEST can be legalised with illegal types.
18391 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18392 return SDValue();
18393
18394 SDValue N0 = N->getOperand(Num: 0);
18395 EVT VT = N0.getValueType();
18396
18397 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18398 !isNullConstant(N->getOperand(1)))
18399 return SDValue();
18400
18401  // Restrict the DAG combine to only cases where we're extracting from a
18402 // flag-setting operation.
18403 if (!isPredicateCCSettingOp(N: N0))
18404 return SDValue();
18405
18406 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18407 SelectionDAG &DAG = DCI.DAG;
18408 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18409 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::FIRST_ACTIVE);
18410}
18411
18412// Materialize : Idx = (add (mul vscale, NumEls), -1)
18413// i1 = extract_vector_elt t37, Constant:i64<Idx>
18414// ... into: "ptrue p, all" + PTEST
18415static SDValue
18416performLastTrueTestVectorCombine(SDNode *N,
18417 TargetLowering::DAGCombinerInfo &DCI,
18418 const AArch64Subtarget *Subtarget) {
18419 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18420  // Make sure PTEST can be legalised with illegal types.
18421 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18422 return SDValue();
18423
18424 SDValue N0 = N->getOperand(Num: 0);
18425 EVT OpVT = N0.getValueType();
18426
18427 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18428 return SDValue();
18429
18430 // Idx == (add (mul vscale, NumEls), -1)
18431 SDValue Idx = N->getOperand(Num: 1);
18432 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(V: Idx.getOperand(i: 1)))
18433 return SDValue();
18434
18435 SDValue VS = Idx.getOperand(i: 0);
18436 if (VS.getOpcode() != ISD::VSCALE)
18437 return SDValue();
18438
18439 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18440 if (VS.getConstantOperandVal(i: 0) != NumEls)
18441 return SDValue();
18442
18443 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18444 SelectionDAG &DAG = DCI.DAG;
18445 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18446 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::LAST_ACTIVE);
18447}
18448
18449static SDValue
18450performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18451 const AArch64Subtarget *Subtarget) {
18452 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18453 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18454 return Res;
18455 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18456 return Res;
18457
18458 SelectionDAG &DAG = DCI.DAG;
18459 SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1);
18460
18461 EVT VT = N->getValueType(ResNo: 0);
18462 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18463 bool IsStrict = N0->isStrictFPOpcode();
18464
18465 // extract(dup x) -> x
18466 if (N0.getOpcode() == AArch64ISD::DUP)
18467 return VT.isInteger() ? DAG.getZExtOrTrunc(Op: N0.getOperand(i: 0), DL: SDLoc(N), VT)
18468 : N0.getOperand(i: 0);
18469
18470 // Rewrite for pairwise fadd pattern
18471 // (f32 (extract_vector_elt
18472 // (fadd (vXf32 Other)
18473 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18474 // ->
18475 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18476 // (extract_vector_elt (vXf32 Other) 1))
18477 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18478 // we can only do this when it's used only by the extract_vector_elt.
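  //
  // The rewritten form matches the pairwise-add patterns, so e.g. summing the
  // two lanes of a v2f32 can become a single "faddp s0, v0.2s" (an
  // illustrative outcome; the exact selection depends on surrounding code).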
18479 if (isNullConstant(V: N1) && hasPairwiseAdd(Opcode: N0->getOpcode(), VT, FullFP16) &&
18480 (!IsStrict || N0.hasOneUse())) {
18481 SDLoc DL(N0);
18482 SDValue N00 = N0->getOperand(Num: IsStrict ? 1 : 0);
18483 SDValue N01 = N0->getOperand(Num: IsStrict ? 2 : 1);
18484
18485 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N01);
18486 SDValue Other = N00;
18487
18488 // And handle the commutative case.
18489 if (!Shuffle) {
18490 Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N00);
18491 Other = N01;
18492 }
18493
18494 if (Shuffle && Shuffle->getMaskElt(Idx: 0) == 1 &&
18495 Other == Shuffle->getOperand(Num: 0)) {
18496 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18497 DAG.getConstant(0, DL, MVT::i64));
18498 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18499 DAG.getConstant(1, DL, MVT::i64));
18500 if (!IsStrict)
18501 return DAG.getNode(Opcode: N0->getOpcode(), DL, VT, N1: Extract1, N2: Extract2);
18502
18503 // For strict_fadd we need uses of the final extract_vector to be replaced
18504 // with the strict_fadd, but we also need uses of the chain output of the
18505 // original strict_fadd to use the chain output of the new strict_fadd as
18506 // otherwise it may not be deleted.
18507 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18508 {VT, MVT::Other},
18509 {N0->getOperand(0), Extract1, Extract2});
18510 DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Ret);
18511 DAG.ReplaceAllUsesOfValueWith(From: N0.getValue(R: 1), To: Ret.getValue(R: 1));
18512 return SDValue(N, 0);
18513 }
18514 }
18515
18516 return SDValue();
18517}
18518
18519static SDValue performConcatVectorsCombine(SDNode *N,
18520 TargetLowering::DAGCombinerInfo &DCI,
18521 SelectionDAG &DAG) {
18522 SDLoc dl(N);
18523 EVT VT = N->getValueType(ResNo: 0);
18524 SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1);
18525 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18526
18527 if (VT.isScalableVector())
18528 return SDValue();
18529
18530 // Optimize concat_vectors of truncated vectors, where the intermediate
18531 // type is illegal, to avoid said illegality, e.g.,
18532 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18533 // (v2i16 (truncate (v2i64)))))
18534 // ->
18535 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18536 // (v4i32 (bitcast (v2i64))),
18537 // <0, 2, 4, 6>)))
18538 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18539 // on both input and result type, so we might generate worse code.
18540 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18541 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18542 N1Opc == ISD::TRUNCATE) {
18543 SDValue N00 = N0->getOperand(Num: 0);
18544 SDValue N10 = N1->getOperand(Num: 0);
18545 EVT N00VT = N00.getValueType();
18546
18547 if (N00VT == N10.getValueType() &&
18548 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18549 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18550 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
18551 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
18552 for (size_t i = 0; i < Mask.size(); ++i)
18553 Mask[i] = i * 2;
18554 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT,
18555 Operand: DAG.getVectorShuffle(
18556 VT: MidVT, dl,
18557 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MidVT, Operand: N00),
18558 N2: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MidVT, Operand: N10), Mask));
18559 }
18560 }
18561
18562 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
18563 N->getOperand(0).getValueType() == MVT::v2i16 ||
18564 N->getOperand(0).getValueType() == MVT::v2i8) {
18565 EVT SrcVT = N->getOperand(Num: 0).getValueType();
18566 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18567 // loads to prevent having to go through the v4i8 load legalization that
18568 // needs to extend each element into a larger type.
18569 if (N->getNumOperands() % 2 == 0 &&
18570 all_of(Range: N->op_values(), P: [SrcVT](SDValue V) {
18571 if (V.getValueType() != SrcVT)
18572 return false;
18573 if (V.isUndef())
18574 return true;
18575 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val&: V);
18576 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18577 LD->getExtensionType() == ISD::NON_EXTLOAD;
18578 })) {
18579 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
18580 EVT NVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: FVT, NumElements: N->getNumOperands());
18581 SmallVector<SDValue> Ops;
18582
18583 for (unsigned i = 0; i < N->getNumOperands(); i++) {
18584 SDValue V = N->getOperand(Num: i);
18585 if (V.isUndef())
18586 Ops.push_back(Elt: DAG.getUNDEF(VT: FVT));
18587 else {
18588 LoadSDNode *LD = cast<LoadSDNode>(Val&: V);
18589 SDValue NewLoad = DAG.getLoad(VT: FVT, dl, Chain: LD->getChain(),
18590 Ptr: LD->getBasePtr(), MMO: LD->getMemOperand());
18591 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: NewLoad.getValue(R: 1));
18592 Ops.push_back(Elt: NewLoad);
18593 }
18594 }
18595 return DAG.getBitcast(VT: N->getValueType(ResNo: 0),
18596 V: DAG.getBuildVector(VT: NVT, DL: dl, Ops));
18597 }
18598 }
18599
18600 // Canonicalise concat_vectors to replace concatenations of truncated nots
18601 // with nots of concatenated truncates. This in some cases allows for multiple
18602 // redundant negations to be eliminated.
18603 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
18604 // (v4i16 (truncate (not (v4i32)))))
18605 // ->
18606 // (not (concat_vectors (v4i16 (truncate (v4i32))),
18607 // (v4i16 (truncate (v4i32)))))
18608 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18609 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N: N0.getNode()) &&
18610 N->isOnlyUserOf(N: N1.getNode())) {
18611 auto isBitwiseVectorNegate = [](SDValue V) {
18612 return V->getOpcode() == ISD::XOR &&
18613 ISD::isConstantSplatVectorAllOnes(N: V.getOperand(i: 1).getNode());
18614 };
18615 SDValue N00 = N0->getOperand(Num: 0);
18616 SDValue N10 = N1->getOperand(Num: 0);
18617 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N: N00.getNode()) &&
18618 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N: N10.getNode())) {
18619 return DAG.getNOT(
18620 DL: dl,
18621 Val: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT,
18622 N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: N0.getValueType(),
18623 Operand: N00->getOperand(Num: 0)),
18624 N2: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: N1.getValueType(),
18625 Operand: N10->getOperand(Num: 0))),
18626 VT);
18627 }
18628 }
18629
18630 // Wait till after everything is legalized to try this. That way we have
18631 // legal vector types and such.
18632 if (DCI.isBeforeLegalizeOps())
18633 return SDValue();
18634
18635 // Optimise concat_vectors of two [us]avgceils or [us]avgfloors with a 128-bit
18636  // destination size, combine into an avg of two concats of the source
18637  // vectors, e.g. concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c),
18638 // concat(b, d))
18639 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
18640 (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
18641 N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS) &&
18642 N0->hasOneUse() && N1->hasOneUse()) {
18643 SDValue N00 = N0->getOperand(Num: 0);
18644 SDValue N01 = N0->getOperand(Num: 1);
18645 SDValue N10 = N1->getOperand(Num: 0);
18646 SDValue N11 = N1->getOperand(Num: 1);
18647
18648 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
18649 SDValue Concat0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N00, N2: N10);
18650 SDValue Concat1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N01, N2: N11);
18651 return DAG.getNode(Opcode: N0Opc, DL: dl, VT, N1: Concat0, N2: Concat1);
18652 }
18653 }
18654
18655 auto IsRSHRN = [](SDValue Shr) {
18656 if (Shr.getOpcode() != AArch64ISD::VLSHR)
18657 return false;
18658 SDValue Op = Shr.getOperand(i: 0);
18659 EVT VT = Op.getValueType();
18660 unsigned ShtAmt = Shr.getConstantOperandVal(i: 1);
18661 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18662 return false;
18663
18664 APInt Imm;
18665 if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::MOVIshift)
18666 Imm = APInt(VT.getScalarSizeInBits(),
18667 Op.getOperand(i: 1).getConstantOperandVal(i: 0)
18668 << Op.getOperand(i: 1).getConstantOperandVal(i: 1));
18669 else if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::DUP &&
18670 isa<ConstantSDNode>(Val: Op.getOperand(i: 1).getOperand(i: 0)))
18671 Imm = APInt(VT.getScalarSizeInBits(),
18672 Op.getOperand(i: 1).getConstantOperandVal(i: 0));
18673 else
18674 return false;
18675
18676 if (Imm != 1ULL << (ShtAmt - 1))
18677 return false;
18678 return true;
18679 };
18680
18681 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
18682 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18683 ((IsRSHRN(N1) &&
18684 N0.getConstantOperandVal(i: 1) == N1.getConstantOperandVal(i: 1)) ||
18685 N1.isUndef())) {
18686 SDValue X = N0.getOperand(i: 0).getOperand(i: 0);
18687 SDValue Y = N1.isUndef() ? DAG.getUNDEF(VT: X.getValueType())
18688 : N1.getOperand(i: 0).getOperand(i: 0);
18689 EVT BVT =
18690 X.getValueType().getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext());
18691 SDValue CC = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: BVT, N1: X, N2: Y);
18692 SDValue Add = DAG.getNode(
18693 Opcode: ISD::ADD, DL: dl, VT: BVT, N1: CC,
18694 N2: DAG.getConstant(Val: 1ULL << (N0.getConstantOperandVal(i: 1) - 1), DL: dl, VT: BVT));
18695 SDValue Shr =
18696 DAG.getNode(Opcode: AArch64ISD::VLSHR, DL: dl, VT: BVT, N1: Add, N2: N0.getOperand(i: 1));
18697 return Shr;
18698 }
18699
18700 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
18701 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18702 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(i: 0) == N1.getOperand(i: 0) &&
18703 N0.getOperand(i: 1) == N1.getOperand(i: 1)) {
18704 SDValue E0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N0.getOperand(i: 0),
18705 N2: DAG.getUNDEF(VT: N0.getValueType()));
18706 SDValue E1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N0.getOperand(i: 1),
18707 N2: DAG.getUNDEF(VT: N0.getValueType()));
18708 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT, N1: E0, N2: E1);
18709 }
18710
18711 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18712 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18713 // canonicalise to that.
18714 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18715 assert(VT.getScalarSizeInBits() == 64);
18716 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18717 DAG.getConstant(0, dl, MVT::i64));
18718 }
18719
18720 // Canonicalise concat_vectors so that the right-hand vector has as few
18721 // bit-casts as possible before its real operation. The primary matching
18722 // destination for these operations will be the narrowing "2" instructions,
18723 // which depend on the operation being performed on this right-hand vector.
18724 // For example,
18725 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
18726 // becomes
18727 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18728
18729 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18730 return SDValue();
18731 SDValue RHS = N1->getOperand(Num: 0);
18732 MVT RHSTy = RHS.getValueType().getSimpleVT();
18733 // If the RHS is not a vector, this is not the pattern we're looking for.
18734 if (!RHSTy.isVector())
18735 return SDValue();
18736
18737 LLVM_DEBUG(
18738 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18739
18740 MVT ConcatTy = MVT::getVectorVT(VT: RHSTy.getVectorElementType(),
18741 NumElements: RHSTy.getVectorNumElements() * 2);
18742 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT,
18743 Operand: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: ConcatTy,
18744 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: RHSTy, Operand: N0),
18745 N2: RHS));
18746}
18747
18748static SDValue
18749performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18750 SelectionDAG &DAG) {
18751 if (DCI.isBeforeLegalizeOps())
18752 return SDValue();
18753
18754 EVT VT = N->getValueType(ResNo: 0);
18755 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18756 return SDValue();
18757
18758 SDValue V = N->getOperand(Num: 0);
18759
18760 // NOTE: This combine exists in DAGCombiner, but that version's legality check
18761 // blocks this combine because the non-const case requires custom lowering.
18762 //
18763 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18764 if (V.getOpcode() == ISD::SPLAT_VECTOR)
18765 if (isa<ConstantSDNode>(Val: V.getOperand(i: 0)))
18766 return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT, Operand: V.getOperand(i: 0));
18767
18768 return SDValue();
18769}
18770
18771static SDValue
18772performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18773 SelectionDAG &DAG) {
18774 SDLoc DL(N);
18775 SDValue Vec = N->getOperand(Num: 0);
18776 SDValue SubVec = N->getOperand(Num: 1);
18777 uint64_t IdxVal = N->getConstantOperandVal(Num: 2);
18778 EVT VecVT = Vec.getValueType();
18779 EVT SubVT = SubVec.getValueType();
18780
18781 // Only do this for legal fixed vector types.
18782 if (!VecVT.isFixedLengthVector() ||
18783 !DAG.getTargetLoweringInfo().isTypeLegal(VT: VecVT) ||
18784 !DAG.getTargetLoweringInfo().isTypeLegal(VT: SubVT))
18785 return SDValue();
18786
18787 // Ignore widening patterns.
18788 if (IdxVal == 0 && Vec.isUndef())
18789 return SDValue();
18790
18791 // Subvector must be half the width and an "aligned" insertion.
18792 unsigned NumSubElts = SubVT.getVectorNumElements();
18793 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18794 (IdxVal != 0 && IdxVal != NumSubElts))
18795 return SDValue();
18796
18797 // Fold insert_subvector -> concat_vectors
18798 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18799 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18800 SDValue Lo, Hi;
18801 if (IdxVal == 0) {
18802 Lo = SubVec;
18803 Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
18804 N2: DAG.getVectorIdxConstant(Val: NumSubElts, DL));
18805 } else {
18806 Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
18807 N2: DAG.getVectorIdxConstant(Val: 0, DL));
18808 Hi = SubVec;
18809 }
18810 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: VecVT, N1: Lo, N2: Hi);
18811}
18812
18813static SDValue tryCombineFixedPointConvert(SDNode *N,
18814 TargetLowering::DAGCombinerInfo &DCI,
18815 SelectionDAG &DAG) {
18816 // Wait until after everything is legalized to try this. That way we have
18817 // legal vector types and such.
18818 if (DCI.isBeforeLegalizeOps())
18819 return SDValue();
18820 // Transform a scalar conversion of a value from a lane extract into a
18821 // lane extract of a vector conversion. E.g., from foo1 to foo2:
18822 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18823 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18824 //
18825 // The second form interacts better with instruction selection and the
18826 // register allocator to avoid cross-class register copies that aren't
18827 // coalescable due to a lane reference.
18828
18829 // Check the operand and see if it originates from a lane extract.
18830 SDValue Op1 = N->getOperand(Num: 1);
18831 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
18832 return SDValue();
18833
18834 // Yep, no additional predication needed. Perform the transform.
18835 SDValue IID = N->getOperand(Num: 0);
18836 SDValue Shift = N->getOperand(Num: 2);
18837 SDValue Vec = Op1.getOperand(i: 0);
18838 SDValue Lane = Op1.getOperand(i: 1);
18839 EVT ResTy = N->getValueType(ResNo: 0);
18840 EVT VecResTy;
18841 SDLoc DL(N);
18842
18843 // The vector width should be 128 bits by the time we get here, even
18844  // if it started as 64 bits (the extract_vector handling will have widened
18845  // it to 128 bits). Bail out if it is not.
18846 if (Vec.getValueSizeInBits() != 128)
18847 return SDValue();
18848
18849 if (Vec.getValueType() == MVT::v4i32)
18850 VecResTy = MVT::v4f32;
18851 else if (Vec.getValueType() == MVT::v2i64)
18852 VecResTy = MVT::v2f64;
18853 else
18854 return SDValue();
18855
18856 SDValue Convert =
18857 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: VecResTy, N1: IID, N2: Vec, N3: Shift);
18858 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResTy, N1: Convert, N2: Lane);
18859}
18860
18861// AArch64 high-vector "long" operations are formed by performing the non-high
18862// version on an extract_subvector of each operand which gets the high half:
18863//
18864// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
18865//
18866// However, there are cases which don't have an extract_high explicitly, but
18867// have another operation that can be made compatible with one for free. For
18868// example:
18869//
18870// (dupv64 scalar) --> (extract_high (dup128 scalar))
18871//
18872// This routine does the actual conversion of such DUPs, once outer routines
18873// have determined that everything else is in order.
18874// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
18875// similarly here.
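//
// As an illustrative example (assumed): rewriting the 64-bit DUP this way lets
// a widening multiply such as (smull (extract_high v), (dupv64 s)) be selected
// with the "smull2" form, which reads the high halves of two 128-bit vectors.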
18876static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
18877 MVT VT = N.getSimpleValueType();
18878 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18879 N.getConstantOperandVal(i: 1) == 0)
18880 N = N.getOperand(i: 0);
18881
18882 switch (N.getOpcode()) {
18883 case AArch64ISD::DUP:
18884 case AArch64ISD::DUPLANE8:
18885 case AArch64ISD::DUPLANE16:
18886 case AArch64ISD::DUPLANE32:
18887 case AArch64ISD::DUPLANE64:
18888 case AArch64ISD::MOVI:
18889 case AArch64ISD::MOVIshift:
18890 case AArch64ISD::MOVIedit:
18891 case AArch64ISD::MOVImsl:
18892 case AArch64ISD::MVNIshift:
18893 case AArch64ISD::MVNImsl:
18894 break;
18895 default:
18896 // FMOV could be supported, but isn't very useful, as it would only occur
18897    // if you passed a bitcast floating-point immediate to an eligible long
18898 // integer op (addl, smull, ...).
18899 return SDValue();
18900 }
18901
18902 if (!VT.is64BitVector())
18903 return SDValue();
18904
18905 SDLoc DL(N);
18906 unsigned NumElems = VT.getVectorNumElements();
18907 if (N.getValueType().is64BitVector()) {
18908 MVT ElementTy = VT.getVectorElementType();
18909 MVT NewVT = MVT::getVectorVT(VT: ElementTy, NumElements: NumElems * 2);
18910 N = DAG.getNode(Opcode: N->getOpcode(), DL, VT: NewVT, Ops: N->ops());
18911 }
18912
18913 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
18914 DAG.getConstant(NumElems, DL, MVT::i64));
18915}
18916
18917static bool isEssentiallyExtractHighSubvector(SDValue N) {
18918 if (N.getOpcode() == ISD::BITCAST)
18919 N = N.getOperand(i: 0);
18920 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
18921 return false;
18922 if (N.getOperand(i: 0).getValueType().isScalableVector())
18923 return false;
18924 return N.getConstantOperandAPInt(i: 1) ==
18925 N.getOperand(i: 0).getValueType().getVectorNumElements() / 2;
18926}
18927
18928/// Helper structure to keep track of ISD::SET_CC operands.
18929struct GenericSetCCInfo {
18930 const SDValue *Opnd0;
18931 const SDValue *Opnd1;
18932 ISD::CondCode CC;
18933};
18934
18935/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
18936struct AArch64SetCCInfo {
18937 const SDValue *Cmp;
18938 AArch64CC::CondCode CC;
18939};
18940
18941/// Helper structure to keep track of SetCC information.
18942union SetCCInfo {
18943 GenericSetCCInfo Generic;
18944 AArch64SetCCInfo AArch64;
18945};
18946
18947/// Helper structure to be able to read SetCC information. If set to
18948/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
18949/// GenericSetCCInfo.
18950struct SetCCInfoAndKind {
18951 SetCCInfo Info;
18952 bool IsAArch64;
18953};
18954
18955/// Check whether or not \p Op is a SET_CC operation, either a generic one
18956/// or an AArch64-lowered one.
18957/// \p SetCCInfo is filled accordingly.
18958/// \post SetCCInfo is meaningful only when this function returns true.
18960/// \return True when Op is a kind of SET_CC operation.
18961static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
18962  // If this is a setcc, this is straightforward.
18963 if (Op.getOpcode() == ISD::SETCC) {
18964 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(i: 0);
18965 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(i: 1);
18966 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
18967 SetCCInfo.IsAArch64 = false;
18968 return true;
18969 }
18970 // Otherwise, check if this is a matching csel instruction.
18971 // In other words:
18972 // - csel 1, 0, cc
18973 // - csel 0, 1, !cc
18974 if (Op.getOpcode() != AArch64ISD::CSEL)
18975 return false;
18976 // Set the information about the operands.
18977 // TODO: we want the operands of the Cmp not the csel
18978 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(i: 3);
18979 SetCCInfo.IsAArch64 = true;
18980 SetCCInfo.Info.AArch64.CC =
18981 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2));
18982
18983  // Check that the operands match the constraints:
18984 // (1) Both operands must be constants.
18985 // (2) One must be 1 and the other must be 0.
18986 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 0));
18987 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
18988
18989 // Check (1).
18990 if (!TValue || !FValue)
18991 return false;
18992
18993 // Check (2).
18994 if (!TValue->isOne()) {
18995 // Update the comparison when we are interested in !cc.
18996 std::swap(a&: TValue, b&: FValue);
18997 SetCCInfo.Info.AArch64.CC =
18998 AArch64CC::getInvertedCondCode(Code: SetCCInfo.Info.AArch64.CC);
18999 }
19000 return TValue->isOne() && FValue->isZero();
19001}
19002
19003// Returns true if Op is setcc or zext of setcc.
19004static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19005 if (isSetCC(Op, SetCCInfo&: Info))
19006 return true;
19007 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19008 isSetCC(Op: Op->getOperand(Num: 0), SetCCInfo&: Info));
19009}
19010
19011// The folding we want to perform is:
19012// (add x, [zext] (setcc cc ...) )
19013// -->
19014// (csel x, (add x, 1), !cc ...)
19015//
19016// The latter will get matched to a CSINC instruction.
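//
// Illustrative example (assumed, for exposition only): for C source such as
//   return x + (a < b);
// this permits
//   cmp  w1, w2
//   cinc w0, w0, lt
// (cinc is an alias of csinc) rather than a cset followed by an add.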
19017static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19018 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19019 SDValue LHS = Op->getOperand(Num: 0);
19020 SDValue RHS = Op->getOperand(Num: 1);
19021 SetCCInfoAndKind InfoAndKind;
19022
19023 // If both operands are a SET_CC, then we don't want to perform this
19024 // folding and create another csel as this results in more instructions
19025 // (and higher register usage).
19026 if (isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind) &&
19027 isSetCCOrZExtSetCC(Op: RHS, Info&: InfoAndKind))
19028 return SDValue();
19029
19030 // If neither operand is a SET_CC, give up.
19031 if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind)) {
19032 std::swap(a&: LHS, b&: RHS);
19033 if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind))
19034 return SDValue();
19035 }
19036
19037  // FIXME: This could be generalized to work for FP comparisons.
19038 EVT CmpVT = InfoAndKind.IsAArch64
19039 ? InfoAndKind.Info.AArch64.Cmp->getOperand(i: 0).getValueType()
19040 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19041 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19042 return SDValue();
19043
19044 SDValue CCVal;
19045 SDValue Cmp;
19046 SDLoc dl(Op);
19047 if (InfoAndKind.IsAArch64) {
19048 CCVal = DAG.getConstant(
19049 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
19050 MVT::i32);
19051 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19052 } else
19053 Cmp = getAArch64Cmp(
19054 LHS: *InfoAndKind.Info.Generic.Opnd0, RHS: *InfoAndKind.Info.Generic.Opnd1,
19055 CC: ISD::getSetCCInverse(Operation: InfoAndKind.Info.Generic.CC, Type: CmpVT), AArch64cc&: CCVal, DAG,
19056 dl);
19057
19058 EVT VT = Op->getValueType(ResNo: 0);
19059 LHS = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: RHS, N2: DAG.getConstant(Val: 1, DL: dl, VT));
19060 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: RHS, N2: LHS, N3: CCVal, N4: Cmp);
19061}
19062
19063// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
19064static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19065 EVT VT = N->getValueType(ResNo: 0);
19066 // Only scalar integer and vector types.
19067 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19068 return SDValue();
19069
19070 SDValue LHS = N->getOperand(Num: 0);
19071 SDValue RHS = N->getOperand(Num: 1);
19072 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19073 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19074 return SDValue();
19075
19076 auto *LHSN1 = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1));
19077 auto *RHSN1 = dyn_cast<ConstantSDNode>(Val: RHS->getOperand(Num: 1));
19078 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19079 return SDValue();
19080
19081 SDValue Op1 = LHS->getOperand(Num: 0);
19082 SDValue Op2 = RHS->getOperand(Num: 0);
19083 EVT OpVT1 = Op1.getValueType();
19084 EVT OpVT2 = Op2.getValueType();
19085 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19086 Op2.getOpcode() != AArch64ISD::UADDV ||
19087 OpVT1.getVectorElementType() != VT)
19088 return SDValue();
19089
19090 SDValue Val1 = Op1.getOperand(i: 0);
19091 SDValue Val2 = Op2.getOperand(i: 0);
19092 EVT ValVT = Val1->getValueType(ResNo: 0);
19093 SDLoc DL(N);
19094 SDValue AddVal = DAG.getNode(Opcode: ISD::ADD, DL, VT: ValVT, N1: Val1, N2: Val2);
19095 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19096 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19097 DAG.getConstant(0, DL, MVT::i64));
19098}
19099
19100/// Perform the scalar expression combine in the form of:
19101/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19102/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
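///
/// For instance (simply instantiating the first form with c == 5):
/// CSEL(5, 1, cc) + b becomes CSINC(b+5, b, cc), i.e. "cc ? b+5 : b+1", which
/// avoids materialising both csel constants in registers.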
19103static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19104 EVT VT = N->getValueType(ResNo: 0);
19105 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19106 return SDValue();
19107
19108 SDValue LHS = N->getOperand(Num: 0);
19109 SDValue RHS = N->getOperand(Num: 1);
19110
19111  // Handle commutativity.
19112 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19113 LHS.getOpcode() != AArch64ISD::CSNEG) {
19114 std::swap(a&: LHS, b&: RHS);
19115 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19116 LHS.getOpcode() != AArch64ISD::CSNEG) {
19117 return SDValue();
19118 }
19119 }
19120
19121 if (!LHS.hasOneUse())
19122 return SDValue();
19123
19124 AArch64CC::CondCode AArch64CC =
19125 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2));
19126
19127  // The CSEL should include a constant one operand, and the CSNEG should
19128  // include a one or negative-one operand.
19129 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 0));
19130 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
19131 if (!CTVal || !CFVal)
19132 return SDValue();
19133
19134 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19135 (CTVal->isOne() || CFVal->isOne())) &&
19136 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19137 (CTVal->isOne() || CFVal->isAllOnes())))
19138 return SDValue();
19139
19140 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19141 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19142 !CFVal->isOne()) {
19143 std::swap(a&: CTVal, b&: CFVal);
19144 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
19145 }
19146
19147 SDLoc DL(N);
19148 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19149 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19150 !CFVal->isAllOnes()) {
19151 APInt C = -1 * CFVal->getAPIntValue();
19152 CTVal = cast<ConstantSDNode>(Val: DAG.getConstant(Val: C, DL, VT));
19153 CFVal = cast<ConstantSDNode>(Val: DAG.getAllOnesConstant(DL, VT));
19154 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
19155 }
19156
19157  // It might be neutral for larger constants, as the immediate needs to be
19158  // materialized in a register.
19159 APInt ADDC = CTVal->getAPIntValue();
19160 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19161 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19162 return SDValue();
19163
19164 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19165 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19166 "Unexpected constant value");
19167
19168 SDValue NewNode = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: SDValue(CTVal, 0));
19169 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19170 SDValue Cmp = LHS.getOperand(i: 3);
19171
19172 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: NewNode, N2: RHS, N3: CCVal, N4: Cmp);
19173}
19174
19175// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
19176static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19177 EVT VT = N->getValueType(ResNo: 0);
19178 if (N->getOpcode() != ISD::ADD)
19179 return SDValue();
19180
19181 SDValue Dot = N->getOperand(Num: 0);
19182 SDValue A = N->getOperand(Num: 1);
19183  // Handle commutativity
19184 auto isZeroDot = [](SDValue Dot) {
19185 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19186 Dot.getOpcode() == AArch64ISD::SDOT) &&
19187 isZerosVector(N: Dot.getOperand(i: 0).getNode());
19188 };
19189 if (!isZeroDot(Dot))
19190 std::swap(a&: Dot, b&: A);
19191 if (!isZeroDot(Dot))
19192 return SDValue();
19193
19194 return DAG.getNode(Opcode: Dot.getOpcode(), DL: SDLoc(N), VT, N1: A, N2: Dot.getOperand(i: 1),
19195 N3: Dot.getOperand(i: 2));
19196}
19197
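// Returns true if Op is an integer negation, i.e. (sub 0, x).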
19198static bool isNegatedInteger(SDValue Op) {
19199 return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0));
19200}
19201
19202static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19203 SDLoc DL(Op);
19204 EVT VT = Op.getValueType();
19205 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
19206 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: Op);
19207}
19208
19209// Try to fold
19210//
19211// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19212//
19213// The folding helps csel to be matched with csneg without generating a
19214// redundant neg instruction, which includes the negation of the csel
19215// expansion of an abs node lowered by lowerABS.
19216static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19217 if (!isNegatedInteger(Op: SDValue(N, 0)))
19218 return SDValue();
19219
19220 SDValue CSel = N->getOperand(Num: 1);
19221 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19222 return SDValue();
19223
19224 SDValue N0 = CSel.getOperand(i: 0);
19225 SDValue N1 = CSel.getOperand(i: 1);
19226
19227  // If neither of them is a negation, the fold is not worthwhile, as it
19228  // introduces two additional negations while removing only one.
19229 if (!isNegatedInteger(Op: N0) && !isNegatedInteger(Op: N1))
19230 return SDValue();
19231
19232 SDValue N0N = getNegatedInteger(Op: N0, DAG);
19233 SDValue N1N = getNegatedInteger(Op: N1, DAG);
19234
19235 SDLoc DL(N);
19236 EVT VT = CSel.getValueType();
19237 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: N0N, N2: N1N, N3: CSel.getOperand(i: 2),
19238 N4: CSel.getOperand(i: 3));
19239}
19240
19241// The basic add/sub long vector instructions have variants with "2" on the end
19242// which act on the high-half of their inputs. They are normally matched by
19243// patterns like:
19244//
19245// (add (zeroext (extract_high LHS)),
19246// (zeroext (extract_high RHS)))
19247// -> uaddl2 vD, vN, vM
19248//
19249// However, if one of the extracts is something like a duplicate, this
19250// instruction can still be used profitably. This function puts the DAG into a
19251// more appropriate form for those patterns to trigger.
19252static SDValue performAddSubLongCombine(SDNode *N,
19253 TargetLowering::DAGCombinerInfo &DCI) {
19254 SelectionDAG &DAG = DCI.DAG;
19255 if (DCI.isBeforeLegalizeOps())
19256 return SDValue();
19257
19258 MVT VT = N->getSimpleValueType(ResNo: 0);
19259 if (!VT.is128BitVector()) {
19260 if (N->getOpcode() == ISD::ADD)
19261 return performSetccAddFolding(Op: N, DAG);
19262 return SDValue();
19263 }
19264
19265 // Make sure both branches are extended in the same way.
19266 SDValue LHS = N->getOperand(Num: 0);
19267 SDValue RHS = N->getOperand(Num: 1);
19268 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19269 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19270 LHS.getOpcode() != RHS.getOpcode())
19271 return SDValue();
19272
19273 unsigned ExtType = LHS.getOpcode();
19274
19275 // It's not worth doing if at least one of the inputs isn't already an
19276 // extract, but we don't know which it'll be so we have to try both.
19277 if (isEssentiallyExtractHighSubvector(N: LHS.getOperand(i: 0))) {
19278 RHS = tryExtendDUPToExtractHigh(N: RHS.getOperand(i: 0), DAG);
19279 if (!RHS.getNode())
19280 return SDValue();
19281
19282 RHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: RHS);
19283 } else if (isEssentiallyExtractHighSubvector(N: RHS.getOperand(i: 0))) {
19284 LHS = tryExtendDUPToExtractHigh(N: LHS.getOperand(i: 0), DAG);
19285 if (!LHS.getNode())
19286 return SDValue();
19287
19288 LHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: LHS);
19289 }
19290
19291 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT, N1: LHS, N2: RHS);
19292}
19293
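// Returns true if Op is a SUBS node whose value result is unused, i.e. it is
// only used to set the NZCV flags (a compare).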
19294static bool isCMP(SDValue Op) {
19295 return Op.getOpcode() == AArch64ISD::SUBS &&
19296 !Op.getNode()->hasAnyUseOfValue(Value: 0);
19297}
19298
19299// (CSEL 1 0 CC Cond) => CC
19300// (CSEL 0 1 CC Cond) => !CC
19301static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19302 if (Op.getOpcode() != AArch64ISD::CSEL)
19303 return std::nullopt;
19304 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2));
19305 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19306 return std::nullopt;
19307 SDValue OpLHS = Op.getOperand(i: 0);
19308 SDValue OpRHS = Op.getOperand(i: 1);
19309 if (isOneConstant(V: OpLHS) && isNullConstant(V: OpRHS))
19310 return CC;
19311 if (isNullConstant(V: OpLHS) && isOneConstant(V: OpRHS))
19312 return getInvertedCondCode(Code: CC);
19313
19314 return std::nullopt;
19315}
19316
19317// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19318// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
19319static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19320 SDValue CmpOp = Op->getOperand(Num: 2);
19321 if (!isCMP(Op: CmpOp))
19322 return SDValue();
19323
19324 if (IsAdd) {
19325 if (!isOneConstant(V: CmpOp.getOperand(i: 1)))
19326 return SDValue();
19327 } else {
19328 if (!isNullConstant(V: CmpOp.getOperand(i: 0)))
19329 return SDValue();
19330 }
19331
19332 SDValue CsetOp = CmpOp->getOperand(Num: IsAdd ? 0 : 1);
19333 auto CC = getCSETCondCode(Op: CsetOp);
19334 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19335 return SDValue();
19336
19337 return DAG.getNode(Opcode: Op->getOpcode(), DL: SDLoc(Op), VTList: Op->getVTList(),
19338 N1: Op->getOperand(Num: 0), N2: Op->getOperand(Num: 1),
19339 N3: CsetOp.getOperand(i: 3));
19340}
19341
19342// (ADC x 0 cond) => (CINC x HS cond)
19343static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
19344 SDValue LHS = N->getOperand(Num: 0);
19345 SDValue RHS = N->getOperand(Num: 1);
19346 SDValue Cond = N->getOperand(Num: 2);
19347
19348 if (!isNullConstant(V: RHS))
19349 return SDValue();
19350
19351 EVT VT = N->getValueType(ResNo: 0);
19352 SDLoc DL(N);
19353
19354 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19355 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19356 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: LHS, N2: LHS, N3: CC, N4: Cond);
19357}
19358
19359// Transform vector add(zext i8 to i32, zext i8 to i32)
19360// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19361// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19362// extends.
19363static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
19364 EVT VT = N->getValueType(ResNo: 0);
19365 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19366 (N->getOperand(Num: 0).getOpcode() != ISD::ZERO_EXTEND &&
19367 N->getOperand(Num: 0).getOpcode() != ISD::SIGN_EXTEND) ||
19368 (N->getOperand(Num: 1).getOpcode() != ISD::ZERO_EXTEND &&
19369 N->getOperand(Num: 1).getOpcode() != ISD::SIGN_EXTEND) ||
19370 N->getOperand(Num: 0).getOperand(i: 0).getValueType() !=
19371 N->getOperand(Num: 1).getOperand(i: 0).getValueType())
19372 return SDValue();
19373
19374 SDValue N0 = N->getOperand(Num: 0).getOperand(i: 0);
19375 SDValue N1 = N->getOperand(Num: 1).getOperand(i: 0);
19376 EVT InVT = N0.getValueType();
19377
19378 EVT S1 = InVT.getScalarType();
19379 EVT S2 = VT.getScalarType();
19380 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19381 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19382 SDLoc DL(N);
19383 EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(),
19384 VT: S2.getHalfSizedIntegerVT(Context&: *DAG.getContext()),
19385 EC: VT.getVectorElementCount());
19386 SDValue NewN0 = DAG.getNode(Opcode: N->getOperand(Num: 0).getOpcode(), DL, VT: HalfVT, Operand: N0);
19387 SDValue NewN1 = DAG.getNode(Opcode: N->getOperand(Num: 1).getOpcode(), DL, VT: HalfVT, Operand: N1);
19388 SDValue NewOp = DAG.getNode(Opcode: N->getOpcode(), DL, VT: HalfVT, N1: NewN0, N2: NewN1);
19389 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: NewOp);
19390 }
19391 return SDValue();
19392}
19393
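// Combine BUILD_VECTOR nodes in a few target-specific ways:
//  - a v4f16/v4bf16 build of fp_rounds of lane-0/1 extracts from v2f64 sources
//    is lowered via FCVTXN node(s), a concat and a single fp_round,
//  - a v2f64 build of fp_extends of adjacent extracts from a v4f16/v4bf16
//    source is lowered via an fp_extend to f32, an extract_subvector and a
//    second fp_extend,
//  - a v2i32 build of two contiguous extracts becomes an extract_subvector of
//    an any-extended vector.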
19394static SDValue performBuildVectorCombine(SDNode *N,
19395 TargetLowering::DAGCombinerInfo &DCI,
19396 SelectionDAG &DAG) {
19397 SDLoc DL(N);
19398 EVT VT = N->getValueType(ResNo: 0);
19399
19400 if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
19401 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1),
19402 Elt2 = N->getOperand(Num: 2), Elt3 = N->getOperand(Num: 3);
19403 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19404 Elt1->getOpcode() == ISD::FP_ROUND &&
19405 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) &&
19406 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) &&
19407 Elt0->getConstantOperandVal(Num: 1) == Elt1->getConstantOperandVal(Num: 1) &&
19408 Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19409 Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19410 // Constant index.
19411 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) &&
19412 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) &&
19413 Elt0->getOperand(Num: 0)->getOperand(Num: 0) ==
19414 Elt1->getOperand(Num: 0)->getOperand(Num: 0) &&
19415 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 &&
19416 Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) {
19417 SDValue LowLanesSrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0);
19418 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19419 SDValue HighLanes;
19420 if (Elt2->getOpcode() == ISD::UNDEF &&
19421 Elt3->getOpcode() == ISD::UNDEF) {
19422 HighLanes = DAG.getUNDEF(MVT::v2f32);
19423 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19424 Elt3->getOpcode() == ISD::FP_ROUND &&
19425 isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 1)) &&
19426 isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 1)) &&
19427 Elt2->getConstantOperandVal(Num: 1) ==
19428 Elt3->getConstantOperandVal(Num: 1) &&
19429 Elt2->getOperand(Num: 0)->getOpcode() ==
19430 ISD::EXTRACT_VECTOR_ELT &&
19431 Elt3->getOperand(Num: 0)->getOpcode() ==
19432 ISD::EXTRACT_VECTOR_ELT &&
19433 // Constant index.
19434 isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 0)->getOperand(Num: 1)) &&
19435 isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 0)->getOperand(Num: 1)) &&
19436 Elt2->getOperand(Num: 0)->getOperand(Num: 0) ==
19437 Elt3->getOperand(Num: 0)->getOperand(Num: 0) &&
19438 Elt2->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 &&
19439 Elt3->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) {
19440 SDValue HighLanesSrcVec = Elt2->getOperand(Num: 0)->getOperand(Num: 0);
19441 HighLanes =
19442 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19443 }
19444 if (HighLanes) {
19445 SDValue DoubleToSingleSticky =
19446 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19447 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19448 DoubleToSingleSticky, HighLanes);
19449 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Concat,
19450 N2: Elt0->getOperand(Num: 1));
19451 }
19452 }
19453 }
19454 }
19455
19456 if (VT == MVT::v2f64) {
19457 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1);
19458 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19459 Elt1->getOpcode() == ISD::FP_EXTEND &&
19460 Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19461 Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19462 Elt0->getOperand(Num: 0)->getOperand(Num: 0) ==
19463 Elt1->getOperand(Num: 0)->getOperand(Num: 0) &&
19464 // Constant index.
19465 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) &&
19466 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) &&
19467 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) + 1 ==
19468 Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) &&
19469 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19470 // ResultType's known minimum vector length.
19471 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) %
19472 VT.getVectorMinNumElements() ==
19473 0) {
19474 SDValue SrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0);
19475 if (SrcVec.getValueType() == MVT::v4f16 ||
19476 SrcVec.getValueType() == MVT::v4bf16) {
19477 SDValue HalfToSingle =
19478 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19479 SDValue SubvectorIdx = Elt0->getOperand(Num: 0)->getOperand(Num: 1);
19480 SDValue Extract = DAG.getNode(
19481 ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
19482 HalfToSingle, SubvectorIdx);
19483 return DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Extract);
19484 }
19485 }
19486 }
19487
19488 // A build vector of two extracted elements is equivalent to an
19489 // extract subvector where the inner vector is any-extended to the
19490 // extract_vector_elt VT.
19491 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19492 // (extract_elt_iXX_to_i32 vec Idx+1))
19493 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19494
19495 // For now, only consider the v2i32 case, which arises as a result of
19496 // legalization.
19497 if (VT != MVT::v2i32)
19498 return SDValue();
19499
19500 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1);
19501 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19502 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19503 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19504 // Constant index.
19505 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) &&
19506 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) &&
19507 // Both EXTRACT_VECTOR_ELT from same vector...
19508 Elt0->getOperand(Num: 0) == Elt1->getOperand(Num: 0) &&
19509 // ... and contiguous. First element's index +1 == second element's index.
19510 Elt0->getConstantOperandVal(Num: 1) + 1 == Elt1->getConstantOperandVal(Num: 1) &&
19511 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19512 // ResultType's known minimum vector length.
19513 Elt0->getConstantOperandVal(Num: 1) % VT.getVectorMinNumElements() == 0) {
19514 SDValue VecToExtend = Elt0->getOperand(Num: 0);
19515 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19516 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ExtVT))
19517 return SDValue();
19518
19519 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Val: Elt0->getConstantOperandVal(Num: 1), DL);
19520
19521 SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ExtVT, Operand: VecToExtend);
19522 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19523 SubvectorIdx);
19524 }
19525
19526 return SDValue();
19527}
19528
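// Push a truncate through a DUP: trunc(dup scalar) -> dup(trunc scalar) for
// 64-bit fixed-length results, truncating an i64 scalar to i32 when needed.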
19529static SDValue performTruncateCombine(SDNode *N,
19530 SelectionDAG &DAG) {
19531 EVT VT = N->getValueType(ResNo: 0);
19532 SDValue N0 = N->getOperand(Num: 0);
19533 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19534 N0.getOpcode() == AArch64ISD::DUP) {
19535 SDValue Op = N0.getOperand(i: 0);
19536 if (VT.getScalarType() == MVT::i32 &&
19537 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19538 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19539 return DAG.getNode(Opcode: N0.getOpcode(), DL: SDLoc(N), VT, Operand: Op);
19540 }
19541
19542 return SDValue();
19543}
19544
19545// Check whether a node is an extend or shift operand.
19546static bool isExtendOrShiftOperand(SDValue N) {
19547 unsigned Opcode = N.getOpcode();
19548 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19549 EVT SrcVT;
19550 if (Opcode == ISD::SIGN_EXTEND_INREG)
19551 SrcVT = cast<VTSDNode>(Val: N.getOperand(i: 1))->getVT();
19552 else
19553 SrcVT = N.getOperand(i: 0).getValueType();
19554
19555 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19556 } else if (Opcode == ISD::AND) {
19557 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1));
19558 if (!CSD)
19559 return false;
19560 uint64_t AndMask = CSD->getZExtValue();
19561 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19562 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19563 return isa<ConstantSDNode>(Val: N.getOperand(i: 1));
19564 }
19565
19566 return false;
19567}
19568
19569// (N - Y) + Z --> (Z - Y) + N
19570// when N is an extend or shift operand
19571static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
19572 SelectionDAG &DAG) {
19573 auto IsOneUseExtend = [](SDValue N) {
19574 return N.hasOneUse() && isExtendOrShiftOperand(N);
19575 };
19576
19577  // DAGCombiner will revert the combination when Z is constant, causing an
19578  // infinite loop, so don't enable the combination when Z is constant.
19579  // If Z is a one-use extend or shift, we also can't do the optimization,
19580  // as it would fall into the same infinite loop.
19581 if (isa<ConstantSDNode>(Val: Z) || IsOneUseExtend(Z))
19582 return SDValue();
19583
19584 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19585 return SDValue();
19586
19587 SDValue Shift = SUB.getOperand(i: 0);
19588 if (!IsOneUseExtend(Shift))
19589 return SDValue();
19590
19591 SDLoc DL(N);
19592 EVT VT = N->getValueType(ResNo: 0);
19593
19594 SDValue Y = SUB.getOperand(i: 1);
19595 SDValue NewSub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Z, N2: Y);
19596 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: NewSub, N2: Shift);
19597}
19598
19599static SDValue performAddCombineForShiftedOperands(SDNode *N,
19600 SelectionDAG &DAG) {
19601 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19602 // commutative.
19603 if (N->getOpcode() != ISD::ADD)
19604 return SDValue();
19605
19606 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19607 // shifted register is only available for i32 and i64.
19608 EVT VT = N->getValueType(ResNo: 0);
19609 if (VT != MVT::i32 && VT != MVT::i64)
19610 return SDValue();
19611
19612 SDLoc DL(N);
19613 SDValue LHS = N->getOperand(Num: 0);
19614 SDValue RHS = N->getOperand(Num: 1);
19615
19616 if (SDValue Val = performAddCombineSubShift(N, SUB: LHS, Z: RHS, DAG))
19617 return Val;
19618 if (SDValue Val = performAddCombineSubShift(N, SUB: RHS, Z: LHS, DAG))
19619 return Val;
19620
19621 uint64_t LHSImm = 0, RHSImm = 0;
19622  // If both operands are shifted by an immediate and the shift amount is not
19623  // greater than 4 for one operand, swap LHS and RHS to put the operand with
19624  // the smaller shift amount on the RHS.
19625  //
19626  // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19627  // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19628  // with LSL (shift > 4). For other processors, this transform is a no-op for
19629  // both performance and correctness.
19630 if (isOpcWithIntImmediate(N: LHS.getNode(), Opc: ISD::SHL, Imm&: LHSImm) &&
19631 isOpcWithIntImmediate(N: RHS.getNode(), Opc: ISD::SHL, Imm&: RHSImm) && LHSImm <= 4 &&
19632 RHSImm > 4 && LHS.hasOneUse())
19633 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: LHS);
19634
19635 return SDValue();
19636}
19637
19638// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
19639// This combine reassociates it back to allow the creation of more mls instructions.
19640static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
19641 if (N->getOpcode() != ISD::SUB)
19642 return SDValue();
19643
19644 SDValue Add = N->getOperand(Num: 1);
19645 SDValue X = N->getOperand(Num: 0);
19646 if (Add.getOpcode() != ISD::ADD)
19647 return SDValue();
19648
19649 if (!Add.hasOneUse())
19650 return SDValue();
19651 if (DAG.isConstantIntBuildVectorOrConstantInt(N: peekThroughBitcasts(V: X)))
19652 return SDValue();
19653
19654 SDValue M1 = Add.getOperand(i: 0);
19655 SDValue M2 = Add.getOperand(i: 1);
19656 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19657 M1.getOpcode() != AArch64ISD::UMULL)
19658 return SDValue();
19659 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
19660 M2.getOpcode() != AArch64ISD::UMULL)
19661 return SDValue();
19662
19663 EVT VT = N->getValueType(ResNo: 0);
19664 SDValue Sub = DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: X, N2: M1);
19665 return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: Sub, N2: M2);
19666}
19667
19668// Combine into mla/mls.
19669// This works on the patterns of:
19670// add v1, (mul v2, v3)
19671// sub v1, (mul v2, v3)
19672// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19673// It will transform the add/sub to a scalable version, so that we can
19674// make use of SVE's MLA/MLS that will be generated for that pattern
19675static SDValue
19676performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
19677 SelectionDAG &DAG = DCI.DAG;
19678 // Make sure that the types are legal
19679 if (!DCI.isAfterLegalizeDAG())
19680 return SDValue();
19681 // Before using SVE's features, check first if it's available.
19682 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19683 return SDValue();
19684
19685 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19686 return SDValue();
19687
19688 if (!N->getValueType(ResNo: 0).isFixedLengthVector())
19689 return SDValue();
19690
19691 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19692 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19693 return SDValue();
19694
19695 if (!cast<ConstantSDNode>(Val: Op1->getOperand(Num: 1))->isZero())
19696 return SDValue();
19697
19698 SDValue MulValue = Op1->getOperand(Num: 0);
19699 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19700 return SDValue();
19701
19702 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19703 return SDValue();
19704
19705 EVT ScalableVT = MulValue.getValueType();
19706 if (!ScalableVT.isScalableVector())
19707 return SDValue();
19708
19709 SDValue ScaledOp = convertToScalableVector(DAG, VT: ScalableVT, V: Op0);
19710 SDValue NewValue =
19711 DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: ScalableVT, Ops: {ScaledOp, MulValue});
19712 return convertFromScalableVector(DAG, VT: N->getValueType(ResNo: 0), V: NewValue);
19713 };
19714
19715 if (SDValue res = performOpt(N->getOperand(Num: 0), N->getOperand(Num: 1)))
19716 return res;
19717 else if (N->getOpcode() == ISD::ADD)
19718 return performOpt(N->getOperand(Num: 1), N->getOperand(Num: 0));
19719
19720 return SDValue();
19721}
19722
19723// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19724// help, for example, to produce ssra from sshr+add.
19725static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
19726 EVT VT = N->getValueType(ResNo: 0);
19727 if (VT != MVT::i64)
19728 return SDValue();
19729 SDValue Op0 = N->getOperand(Num: 0);
19730 SDValue Op1 = N->getOperand(Num: 1);
19731
19732 // At least one of the operands should be an extract, and the other should be
19733 // something that is easy to convert to v1i64 type (in this case a load).
19734 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19735 Op0.getOpcode() != ISD::LOAD)
19736 return SDValue();
19737 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19738 Op1.getOpcode() != ISD::LOAD)
19739 return SDValue();
19740
19741 SDLoc DL(N);
19742 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19743 Op0.getOperand(0).getValueType() == MVT::v1i64) {
19744 Op0 = Op0.getOperand(i: 0);
19745 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19746 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19747 Op1.getOperand(0).getValueType() == MVT::v1i64) {
19748 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19749 Op1 = Op1.getOperand(i: 0);
19750 } else
19751 return SDValue();
19752
19753 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19754 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19755 DAG.getConstant(0, DL, MVT::i64));
19756}
19757
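// Collect into Loads the simple load(s) feeding B: either a single load, the
// loads of a one-use build_vector/concat_vectors, or the loads at the leaves
// of the specific shuffle-of-concats pattern shown below. Returns false if B
// does not match any of these forms.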
19758static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
19759 SDValue BV = peekThroughOneUseBitcasts(V: B);
19760 if (!BV->hasOneUse())
19761 return false;
19762 if (auto *Ld = dyn_cast<LoadSDNode>(Val&: BV)) {
19763 if (!Ld || !Ld->isSimple())
19764 return false;
19765 Loads.push_back(Elt: Ld);
19766 return true;
19767 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19768 BV.getOpcode() == ISD::CONCAT_VECTORS) {
19769 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19770 auto *Ld = dyn_cast<LoadSDNode>(Val: BV.getOperand(i: Op));
19771 if (!Ld || !Ld->isSimple() || !BV.getOperand(i: Op).hasOneUse())
19772 return false;
19773 Loads.push_back(Elt: Ld);
19774 }
19775 return true;
19776 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19777 // Try to find a tree of shuffles and concats from how IR shuffles of loads
19778 // are lowered. Note that this only comes up because we do not always visit
19779 // operands before uses. After that is fixed this can be removed and in the
19780 // meantime this is fairly specific to the lowering we expect from IR.
19781 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19782 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19783 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19784 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19785 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19786 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19787 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19788 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19789 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19790 if (B.getOperand(i: 0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19791 B.getOperand(i: 0).getOperand(i: 0).getOpcode() != ISD::CONCAT_VECTORS ||
19792 B.getOperand(i: 0).getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS ||
19793 B.getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS ||
19794 B.getOperand(i: 1).getNumOperands() != 4)
19795 return false;
19796 auto SV1 = cast<ShuffleVectorSDNode>(Val&: B);
19797 auto SV2 = cast<ShuffleVectorSDNode>(Val: B.getOperand(i: 0));
19798 int NumElts = B.getValueType().getVectorNumElements();
19799 int NumSubElts = NumElts / 4;
19800 for (int I = 0; I < NumSubElts; I++) {
19801 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19802 if (SV1->getMaskElt(Idx: I) != I ||
19803 SV1->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts ||
19804 SV1->getMaskElt(Idx: I + NumSubElts * 2) != I + NumSubElts * 2 ||
19805 SV1->getMaskElt(Idx: I + NumSubElts * 3) != I + NumElts)
19806 return false;
19807 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19808 if (SV2->getMaskElt(Idx: I) != I ||
19809 SV2->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts ||
19810 SV2->getMaskElt(Idx: I + NumSubElts * 2) != I + NumElts)
19811 return false;
19812 }
19813 auto *Ld0 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 0));
19814 auto *Ld1 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 1));
19815 auto *Ld2 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 1).getOperand(i: 0));
19816 auto *Ld3 = dyn_cast<LoadSDNode>(Val: B.getOperand(i: 1).getOperand(i: 0));
19817 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19818 !Ld2->isSimple() || !Ld3->isSimple())
19819 return false;
19820 Loads.push_back(Elt: Ld0);
19821 Loads.push_back(Elt: Ld1);
19822 Loads.push_back(Elt: Ld2);
19823 Loads.push_back(Elt: Ld3);
19824 return true;
19825 }
19826 return false;
19827}
19828
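// Returns true if Op0 and Op1 are identical trees of add/sub/extend nodes
// whose leaf loads are the same size and consecutive in memory (each of Op1's
// loads directly follows the corresponding load in Op0). NumSubLoads records
// how many loads make up each leaf.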
19829static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
19830 SelectionDAG &DAG,
19831 unsigned &NumSubLoads) {
19832 if (!Op0.hasOneUse() || !Op1.hasOneUse())
19833 return false;
19834
19835 SmallVector<LoadSDNode *> Loads0, Loads1;
19836 if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) &&
19837 isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) {
19838 if (NumSubLoads && Loads0.size() != NumSubLoads)
19839 return false;
19840 NumSubLoads = Loads0.size();
19841 return Loads0.size() == Loads1.size() &&
19842 all_of(Range: zip(t&: Loads0, u&: Loads1), P: [&DAG](auto L) {
19843 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19844 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19845 DAG.areNonVolatileConsecutiveLoads(LD: get<1>(L), Base: get<0>(L),
19846 Bytes: Size / 8, Dist: 1);
19847 });
19848 }
19849
19850 if (Op0.getOpcode() != Op1.getOpcode())
19851 return false;
19852
19853 switch (Op0.getOpcode()) {
19854 case ISD::ADD:
19855 case ISD::SUB:
19856 return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0),
19857 DAG, NumSubLoads) &&
19858 areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 1), Op1: Op1.getOperand(i: 1),
19859 DAG, NumSubLoads);
19860 case ISD::SIGN_EXTEND:
19861 case ISD::ANY_EXTEND:
19862 case ISD::ZERO_EXTEND:
19863 EVT XVT = Op0.getOperand(i: 0).getValueType();
19864 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
19865 XVT.getScalarSizeInBits() != 32)
19866 return false;
19867 return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0),
19868 DAG, NumSubLoads);
19869 }
19870 return false;
19871}
19872
19873// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
19874// into a single load of twice the size, from which we extract the bottom and
19875// top parts so that the shl can use a shll2 instruction. The two loads in that
19876// example can also be larger trees of instructions, which are identical except
19877// for the leaves, which are all loads offset from the LHS, including
19878// buildvectors of multiple loads. For example, the RHS tree could be
19879// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
19880// Whilst it can be common for the larger loads to replace LDP instructions
19881// (which doesn't gain anything on its own), the larger loads can help create
19882// more efficient code, and in buildvectors prevent the need for ld1 lane
19883// inserts, which can be slower than normal loads.
19884static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
19885 EVT VT = N->getValueType(ResNo: 0);
19886 if (!VT.isFixedLengthVector() ||
19887 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
19888 VT.getScalarSizeInBits() != 64))
19889 return SDValue();
19890
19891 SDValue Other = N->getOperand(Num: 0);
19892 SDValue Shift = N->getOperand(Num: 1);
19893 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
19894 std::swap(a&: Shift, b&: Other);
19895 APInt ShiftAmt;
19896 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
19897 !ISD::isConstantSplatVector(N: Shift.getOperand(i: 1).getNode(), SplatValue&: ShiftAmt))
19898 return SDValue();
19899
19900 if (!ISD::isExtOpcode(Opcode: Shift.getOperand(i: 0).getOpcode()) ||
19901 !ISD::isExtOpcode(Opcode: Other.getOpcode()) ||
19902 Shift.getOperand(i: 0).getOperand(i: 0).getValueType() !=
19903 Other.getOperand(i: 0).getValueType() ||
19904 !Other.hasOneUse() || !Shift.getOperand(i: 0).hasOneUse())
19905 return SDValue();
19906
19907 SDValue Op0 = Other.getOperand(i: 0);
19908 SDValue Op1 = Shift.getOperand(i: 0).getOperand(i: 0);
19909
19910 unsigned NumSubLoads = 0;
19911 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
19912 return SDValue();
19913
19914  // Attempt to rule out some unprofitable cases using heuristics (some working
19915  // around suboptimal code generation), notably if the extend would not be able
19916  // to use ushll2 instructions because the types are not large enough. Otherwise
19917  // zips will need to be created, which can increase the instruction count.
19918 unsigned NumElts = Op0.getValueType().getVectorNumElements();
19919 unsigned NumSubElts = NumElts / NumSubLoads;
19920 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
19921 (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode() &&
19922 Op0.getValueType().getSizeInBits() < 128 &&
19923 !DAG.getTargetLoweringInfo().isTypeLegal(VT: Op0.getValueType())))
19924 return SDValue();
19925
19926 // Recreate the tree with the new combined loads.
19927 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
19928 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
19929 EVT DVT =
19930 Op0.getValueType().getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
19931
19932 SmallVector<LoadSDNode *> Loads0, Loads1;
19933 if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) &&
19934 isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) {
19935 EVT LoadVT = EVT::getVectorVT(
19936 Context&: *DAG.getContext(), VT: Op0.getValueType().getScalarType(),
19937 NumElements: Op0.getValueType().getVectorNumElements() / Loads0.size());
19938 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
19939
19940 SmallVector<SDValue> NewLoads;
19941 for (const auto &[L0, L1] : zip(t&: Loads0, u&: Loads1)) {
19942 SDValue Load = DAG.getLoad(VT: DLoadVT, dl: SDLoc(L0), Chain: L0->getChain(),
19943 Ptr: L0->getBasePtr(), PtrInfo: L0->getPointerInfo(),
19944 Alignment: L0->getOriginalAlign());
19945 DAG.makeEquivalentMemoryOrdering(OldLoad: L0, NewMemOp: Load.getValue(R: 1));
19946 DAG.makeEquivalentMemoryOrdering(OldLoad: L1, NewMemOp: Load.getValue(R: 1));
19947 NewLoads.push_back(Elt: Load);
19948 }
19949 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op0), VT: DVT, Ops: NewLoads);
19950 }
19951
19952 SmallVector<SDValue> Ops;
19953 for (const auto &[O0, O1] : zip(t: Op0->op_values(), u: Op1->op_values()))
19954 Ops.push_back(Elt: GenCombinedTree(O0, O1, DAG));
19955 return DAG.getNode(Opcode: Op0.getOpcode(), DL: SDLoc(Op0), VT: DVT, Ops);
19956 };
19957 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
19958
19959 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
19960 int Hi = NumSubElts, Lo = 0;
19961 for (unsigned i = 0; i < NumSubLoads; i++) {
19962 for (unsigned j = 0; j < NumSubElts; j++) {
19963 LowMask[i * NumSubElts + j] = Lo++;
19964 HighMask[i * NumSubElts + j] = Hi++;
19965 }
19966 Lo += NumSubElts;
19967 Hi += NumSubElts;
19968 }
19969 SDLoc DL(N);
19970 SDValue Ext0, Ext1;
19971  // Extract the top and bottom lanes, then extend the result. Alternatively,
19972  // extend the result and then extract the lanes if the two operands match, as
19973  // that produces slightly smaller code.
19974 if (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode()) {
19975 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
19976 NewOp, DAG.getConstant(0, DL, MVT::i64));
19977 SDValue SubH =
19978 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
19979 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
19980 SDValue Extr0 =
19981 DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: LowMask);
19982 SDValue Extr1 =
19983 DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: HighMask);
19984 Ext0 = DAG.getNode(Opcode: Other.getOpcode(), DL, VT, Operand: Extr0);
19985 Ext1 = DAG.getNode(Opcode: Shift.getOperand(i: 0).getOpcode(), DL, VT, Operand: Extr1);
19986 } else {
19987 EVT DVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
19988 SDValue Ext = DAG.getNode(Opcode: Other.getOpcode(), DL, VT: DVT, Operand: NewOp);
19989 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
19990 DAG.getConstant(0, DL, MVT::i64));
19991 SDValue SubH =
19992 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
19993 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
19994 Ext0 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: LowMask);
19995 Ext1 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: HighMask);
19996 }
19997 SDValue NShift =
19998 DAG.getNode(Opcode: Shift.getOpcode(), DL, VT, N1: Ext1, N2: Shift.getOperand(i: 1));
19999 return DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: Ext0, N2: NShift);
20000}
20001
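// Try each of the add/sub specific combines above before falling back to the
// generic long-operation combine.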
20002static SDValue performAddSubCombine(SDNode *N,
20003 TargetLowering::DAGCombinerInfo &DCI) {
20004 // Try to change sum of two reductions.
20005 if (SDValue Val = performAddUADDVCombine(N, DAG&: DCI.DAG))
20006 return Val;
20007 if (SDValue Val = performAddDotCombine(N, DAG&: DCI.DAG))
20008 return Val;
20009 if (SDValue Val = performAddCSelIntoCSinc(N, DAG&: DCI.DAG))
20010 return Val;
20011 if (SDValue Val = performNegCSelCombine(N, DAG&: DCI.DAG))
20012 return Val;
20013 if (SDValue Val = performVectorAddSubExtCombine(N, DAG&: DCI.DAG))
20014 return Val;
20015 if (SDValue Val = performAddCombineForShiftedOperands(N, DAG&: DCI.DAG))
20016 return Val;
20017 if (SDValue Val = performSubAddMULCombine(N, DAG&: DCI.DAG))
20018 return Val;
20019 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20020 return Val;
20021 if (SDValue Val = performAddSubIntoVectorOp(N, DAG&: DCI.DAG))
20022 return Val;
20023
20024 if (SDValue Val = performExtBinopLoadFold(N, DAG&: DCI.DAG))
20025 return Val;
20026
20027 return performAddSubLongCombine(N, DCI);
20028}
20029
20030// Massage DAGs which we can use the high-half "long" operations on into
20031// something isel will recognize better. E.g.
20032//
20033// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20034//     (aarch64_neon_umull (extract_high (v2i64 vec))
20035//                         (extract_high (v2i64 (dup128 scalar))))
20036//
20037static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20038 TargetLowering::DAGCombinerInfo &DCI,
20039 SelectionDAG &DAG) {
20040 if (DCI.isBeforeLegalizeOps())
20041 return SDValue();
20042
20043 SDValue LHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 0 : 1);
20044 SDValue RHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 1 : 2);
20045 assert(LHS.getValueType().is64BitVector() &&
20046 RHS.getValueType().is64BitVector() &&
20047 "unexpected shape for long operation");
20048
20049 // Either node could be a DUP, but it's not worth doing both of them (you'd
20050 // just as well use the non-high version) so look for a corresponding extract
20051 // operation on the other "wing".
20052 if (isEssentiallyExtractHighSubvector(N: LHS)) {
20053 RHS = tryExtendDUPToExtractHigh(N: RHS, DAG);
20054 if (!RHS.getNode())
20055 return SDValue();
20056 } else if (isEssentiallyExtractHighSubvector(N: RHS)) {
20057 LHS = tryExtendDUPToExtractHigh(N: LHS, DAG);
20058 if (!LHS.getNode())
20059 return SDValue();
20060 } else
20061 return SDValue();
20062
20063 if (IID == Intrinsic::not_intrinsic)
20064 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: LHS, N2: RHS);
20065
20066 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20067 N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);
20068}
20069
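// Fold a vector shift intrinsic with a constant shift amount into the
// corresponding AArch64ISD immediate-shift node, or drop the intrinsic
// entirely when the shift amount is zero (except for sqshlu). i64 scalar
// operands are handled via v1i64.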
20070static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20071 MVT ElemTy = N->getSimpleValueType(ResNo: 0).getScalarType();
20072 unsigned ElemBits = ElemTy.getSizeInBits();
20073
20074 int64_t ShiftAmount;
20075 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 2))) {
20076 APInt SplatValue, SplatUndef;
20077 unsigned SplatBitSize;
20078 bool HasAnyUndefs;
20079 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20080 HasAnyUndefs, MinSplatBits: ElemBits) ||
20081 SplatBitSize != ElemBits)
20082 return SDValue();
20083
20084 ShiftAmount = SplatValue.getSExtValue();
20085 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2))) {
20086 ShiftAmount = CVN->getSExtValue();
20087 } else
20088 return SDValue();
20089
20090 // If the shift amount is zero, remove the shift intrinsic.
20091 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20092 return N->getOperand(Num: 1);
20093
20094 unsigned Opcode;
20095 bool IsRightShift;
20096 switch (IID) {
20097 default:
20098 llvm_unreachable("Unknown shift intrinsic");
20099 case Intrinsic::aarch64_neon_sqshl:
20100 Opcode = AArch64ISD::SQSHL_I;
20101 IsRightShift = false;
20102 break;
20103 case Intrinsic::aarch64_neon_uqshl:
20104 Opcode = AArch64ISD::UQSHL_I;
20105 IsRightShift = false;
20106 break;
20107 case Intrinsic::aarch64_neon_srshl:
20108 Opcode = AArch64ISD::SRSHR_I;
20109 IsRightShift = true;
20110 break;
20111 case Intrinsic::aarch64_neon_urshl:
20112 Opcode = AArch64ISD::URSHR_I;
20113 IsRightShift = true;
20114 break;
20115 case Intrinsic::aarch64_neon_sqshlu:
20116 Opcode = AArch64ISD::SQSHLU_I;
20117 IsRightShift = false;
20118 break;
20119 case Intrinsic::aarch64_neon_sshl:
20120 case Intrinsic::aarch64_neon_ushl:
20121    // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20122    // left shift for positive shift amounts. For negative shifts we can use
20123    // VASHR/VLSHR as appropriate.
20124 if (ShiftAmount < 0) {
20125 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20126 : AArch64ISD::VLSHR;
20127 ShiftAmount = -ShiftAmount;
20128 } else
20129 Opcode = AArch64ISD::VSHL;
20130 IsRightShift = false;
20131 break;
20132 }
20133
20134 EVT VT = N->getValueType(ResNo: 0);
20135 SDValue Op = N->getOperand(Num: 1);
20136 SDLoc dl(N);
20137 if (VT == MVT::i64) {
20138 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20139 VT = MVT::v1i64;
20140 }
20141
20142 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20143 Op = DAG.getNode(Opcode, dl, VT, Op,
20144 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20145 if (N->getValueType(0) == MVT::i64)
20146 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20147 DAG.getConstant(0, dl, MVT::i64));
20148 return Op;
20149 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20150 Op = DAG.getNode(Opcode, dl, VT, Op,
20151 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20152 if (N->getValueType(0) == MVT::i64)
20153 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20154 DAG.getConstant(0, dl, MVT::i64));
20155 return Op;
20156 }
20157
20158 return SDValue();
20159}
20160
20161// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20162// the intrinsics must be legal and take an i32, this means there's almost
20163// certainly going to be a zext in the DAG which we can eliminate.
20164static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20165 SDValue AndN = N->getOperand(Num: 2);
20166 if (AndN.getOpcode() != ISD::AND)
20167 return SDValue();
20168
20169 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: AndN.getOperand(i: 1));
20170 if (!CMask || CMask->getZExtValue() != Mask)
20171 return SDValue();
20172
20173 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20174 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20175}
20176
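// Lower an across-vector-lanes reduction intrinsic to the corresponding
// AArch64ISD node and extract lane 0 of the result.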
20177static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
20178 SelectionDAG &DAG) {
20179 SDLoc dl(N);
20180 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20181 DAG.getNode(Opc, dl,
20182 N->getOperand(1).getSimpleValueType(),
20183 N->getOperand(1)),
20184 DAG.getConstant(0, dl, MVT::i64));
20185}
20186
20187static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
20188 SDLoc DL(N);
20189 SDValue Op1 = N->getOperand(Num: 1);
20190 SDValue Op2 = N->getOperand(Num: 2);
20191 EVT ScalarTy = Op2.getValueType();
20192 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20193 ScalarTy = MVT::i32;
20194
20195  // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
20196 SDValue StepVector = DAG.getStepVector(DL, ResVT: N->getValueType(ResNo: 0));
20197 SDValue Step = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op2);
20198 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: N->getValueType(ResNo: 0), N1: StepVector, N2: Step);
20199 SDValue Base = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op1);
20200 return DAG.getNode(Opcode: ISD::ADD, DL, VT: N->getValueType(ResNo: 0), N1: Mul, N2: Base);
20201}
20202
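// Lower a predicated SVE dup intrinsic to DUP_MERGE_PASSTHRU, promoting
// i8/i16 scalars to i32 first.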
20203static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
20204 SDLoc dl(N);
20205 SDValue Scalar = N->getOperand(Num: 3);
20206 EVT ScalarTy = Scalar.getValueType();
20207
20208 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20209 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20210
20211 SDValue Passthru = N->getOperand(Num: 1);
20212 SDValue Pred = N->getOperand(Num: 2);
20213 return DAG.getNode(Opcode: AArch64ISD::DUP_MERGE_PASSTHRU, DL: dl, VT: N->getValueType(ResNo: 0),
20214 N1: Pred, N2: Scalar, N3: Passthru);
20215}
20216
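// Lower an SVE EXT intrinsic by bitcasting the operands to bytes, scaling the
// index by the element size and emitting AArch64ISD::EXT on the byte vectors.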
20217static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
20218 SDLoc dl(N);
20219 LLVMContext &Ctx = *DAG.getContext();
20220 EVT VT = N->getValueType(ResNo: 0);
20221
20222 assert(VT.isScalableVector() && "Expected a scalable vector.");
20223
20224 // Current lowering only supports the SVE-ACLE types.
20225 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
20226 return SDValue();
20227
20228 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20229 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20230 EVT ByteVT =
20231 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20232
20233  // Convert everything to the domain of EXT (i.e. bytes).
20234 SDValue Op0 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ByteVT, Operand: N->getOperand(Num: 1));
20235 SDValue Op1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ByteVT, Operand: N->getOperand(Num: 2));
20236 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20237 DAG.getConstant(ElemSize, dl, MVT::i32));
20238
20239 SDValue EXT = DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT: ByteVT, N1: Op0, N2: Op1, N3: Op2);
20240 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: EXT);
20241}
20242
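// If the comparator of an SVE wide-compare intrinsic is a splatted constant
// that fits the instruction's immediate range, emit a SETCC_MERGE_ZERO against
// a splat of that immediate instead.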
20243static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
20244 TargetLowering::DAGCombinerInfo &DCI,
20245 SelectionDAG &DAG) {
20246 if (DCI.isBeforeLegalize())
20247 return SDValue();
20248
20249 SDValue Comparator = N->getOperand(Num: 3);
20250 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20251 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20252 unsigned IID = getIntrinsicID(N);
20253 EVT VT = N->getValueType(ResNo: 0);
20254 EVT CmpVT = N->getOperand(Num: 2).getValueType();
20255 SDValue Pred = N->getOperand(Num: 1);
20256 SDValue Imm;
20257 SDLoc DL(N);
20258
20259 switch (IID) {
20260 default:
20261 llvm_unreachable("Called with wrong intrinsic!");
20262 break;
20263
20264 // Signed comparisons
20265 case Intrinsic::aarch64_sve_cmpeq_wide:
20266 case Intrinsic::aarch64_sve_cmpne_wide:
20267 case Intrinsic::aarch64_sve_cmpge_wide:
20268 case Intrinsic::aarch64_sve_cmpgt_wide:
20269 case Intrinsic::aarch64_sve_cmplt_wide:
20270 case Intrinsic::aarch64_sve_cmple_wide: {
20271 if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) {
20272 int64_t ImmVal = CN->getSExtValue();
20273 if (ImmVal >= -16 && ImmVal <= 15)
20274 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20275 else
20276 return SDValue();
20277 }
20278 break;
20279 }
20280 // Unsigned comparisons
20281 case Intrinsic::aarch64_sve_cmphs_wide:
20282 case Intrinsic::aarch64_sve_cmphi_wide:
20283 case Intrinsic::aarch64_sve_cmplo_wide:
20284 case Intrinsic::aarch64_sve_cmpls_wide: {
20285 if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) {
20286 uint64_t ImmVal = CN->getZExtValue();
20287 if (ImmVal <= 127)
20288 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20289 else
20290 return SDValue();
20291 }
20292 break;
20293 }
20294 }
20295
20296 if (!Imm)
20297 return SDValue();
20298
20299 SDValue Splat = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: CmpVT, Operand: Imm);
20300 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT, N1: Pred,
20301 N2: N->getOperand(Num: 2), N3: Splat, N4: DAG.getCondCode(Cond: CC));
20302 }
20303
20304 return SDValue();
20305}
20306
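// Emit a PTEST of Op under the predicate Pg and materialise the requested
// condition as a zero/one value of type VT using a CSEL on the flags.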
20307static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20308 AArch64CC::CondCode Cond) {
20309 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20310
20311 SDLoc DL(Op);
20312 assert(Op.getValueType().isScalableVector() &&
20313 TLI.isTypeLegal(Op.getValueType()) &&
20314 "Expected legal scalable vector type!");
20315 assert(Op.getValueType() == Pg.getValueType() &&
20316 "Expected same type for PTEST operands");
20317
20318 // Ensure target specific opcodes are using legal type.
20319 EVT OutVT = TLI.getTypeToTransformTo(Context&: *DAG.getContext(), VT);
20320 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OutVT);
20321 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OutVT);
20322
20323 // Ensure operands have type nxv16i1.
20324 if (Op.getValueType() != MVT::nxv16i1) {
20325 if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
20326 isZeroingInactiveLanes(Op))
20327 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20328 else
20329 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20330 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20331 }
20332
20333 // Set condition code (CC) flags.
20334 SDValue Test = DAG.getNode(
20335 Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
20336 DL, MVT::Other, Pg, Op);
20337
20338 // Convert CC to integer based on requested condition.
20339 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20340 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20341 SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OutVT, N1: FVal, N2: TVal, N3: CC, N4: Test);
20342 return DAG.getZExtOrTrunc(Op: Res, DL, VT);
20343}
20344
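// Lower an SVE integer reduction intrinsic to the predicated reduction node
// Opc and extract element 0 of the resulting vector.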
20345static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
20346 SelectionDAG &DAG) {
20347 SDLoc DL(N);
20348
20349 SDValue Pred = N->getOperand(Num: 1);
20350 SDValue VecToReduce = N->getOperand(Num: 2);
20351
20352 // NOTE: The integer reduction's result type is not always linked to the
20353 // operand's element type so we construct it from the intrinsic's result type.
20354 EVT ReduceVT = getPackedSVEVectorVT(VT: N->getValueType(ResNo: 0));
20355 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce);
20356
20357 // SVE reductions set the whole vector register with the first element
20358 // containing the reduction result, which we'll now extract.
20359 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20360 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
20361 N2: Zero);
20362}
20363
20364static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
20365 SelectionDAG &DAG) {
20366 SDLoc DL(N);
20367
20368 SDValue Pred = N->getOperand(Num: 1);
20369 SDValue VecToReduce = N->getOperand(Num: 2);
20370
20371 EVT ReduceVT = VecToReduce.getValueType();
20372 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce);
20373
20374 // SVE reductions set the whole vector register with the first element
20375 // containing the reduction result, which we'll now extract.
20376 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20377 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
20378 N2: Zero);
20379}
20380
20381static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
20382 SelectionDAG &DAG) {
20383 SDLoc DL(N);
20384
20385 SDValue Pred = N->getOperand(Num: 1);
20386 SDValue InitVal = N->getOperand(Num: 2);
20387 SDValue VecToReduce = N->getOperand(Num: 3);
20388 EVT ReduceVT = VecToReduce.getValueType();
20389
20390 // Ordered reductions use the first lane of the result vector as the
20391 // reduction's initial value.
20392 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20393 InitVal = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ReduceVT,
20394 N1: DAG.getUNDEF(VT: ReduceVT), N2: InitVal, N3: Zero);
20395
20396 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: InitVal, N3: VecToReduce);
20397
20398 // SVE reductions set the whole vector register with the first element
20399 // containing the reduction result, which we'll now extract.
20400 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
20401 N2: Zero);
20402}
20403
20404// If a merged operation has no inactive lanes we can relax it to a predicated
20405// or unpredicated operation, which potentially allows better isel (perhaps
20406// using immediate forms) or relaxing register reuse requirements.
20407static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
20408 SelectionDAG &DAG, bool UnpredOp = false,
20409 bool SwapOperands = false) {
20410 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20411 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20412 SDValue Pg = N->getOperand(Num: 1);
20413 SDValue Op1 = N->getOperand(Num: SwapOperands ? 3 : 2);
20414 SDValue Op2 = N->getOperand(Num: SwapOperands ? 2 : 3);
20415
20416 // ISD way to specify an all active predicate.
20417 if (isAllActivePredicate(DAG, N: Pg)) {
20418 if (UnpredOp)
20419 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Op1, N2: Op2);
20420
20421 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Pg, N2: Op1, N3: Op2);
20422 }
20423
20424 // FUTURE: SplatVector(true)
20425 return SDValue();
20426}
20427
20428static SDValue performIntrinsicCombine(SDNode *N,
20429 TargetLowering::DAGCombinerInfo &DCI,
20430 const AArch64Subtarget *Subtarget) {
20431 SelectionDAG &DAG = DCI.DAG;
20432 unsigned IID = getIntrinsicID(N);
20433 switch (IID) {
20434 default:
20435 break;
20436 case Intrinsic::get_active_lane_mask: {
20437 SDValue Res = SDValue();
20438 EVT VT = N->getValueType(ResNo: 0);
20439 if (VT.isFixedLengthVector()) {
20440 // We can use the SVE whilelo instruction to lower this intrinsic by
20441 // creating the appropriate sequence of scalable vector operations and
20442 // then extracting a fixed-width subvector from the scalable vector.
20443
20444 SDLoc DL(N);
20445 SDValue ID =
20446 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
20447
20448 EVT WhileVT = EVT::getVectorVT(
20449 *DAG.getContext(), MVT::i1,
20450 ElementCount::getScalable(VT.getVectorNumElements()));
20451
20452 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
20453 EVT PromVT = getPromotedVTForPredicate(VT: WhileVT);
20454
20455 // Get the fixed-width equivalent of PromVT for extraction.
20456 EVT ExtVT =
20457 EVT::getVectorVT(Context&: *DAG.getContext(), VT: PromVT.getVectorElementType(),
20458 EC: VT.getVectorElementCount());
20459
20460 Res = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: WhileVT, N1: ID,
20461 N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2));
20462 Res = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromVT, Operand: Res);
20463 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
20464 DAG.getConstant(0, DL, MVT::i64));
20465 Res = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Res);
20466 }
20467 return Res;
20468 }
20469 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20470 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20471 return tryCombineFixedPointConvert(N, DCI, DAG);
20472 case Intrinsic::aarch64_neon_saddv:
20473 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SADDV, N, DAG);
20474 case Intrinsic::aarch64_neon_uaddv:
20475 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UADDV, N, DAG);
20476 case Intrinsic::aarch64_neon_sminv:
20477 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMINV, N, DAG);
20478 case Intrinsic::aarch64_neon_uminv:
20479 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMINV, N, DAG);
20480 case Intrinsic::aarch64_neon_smaxv:
20481 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMAXV, N, DAG);
20482 case Intrinsic::aarch64_neon_umaxv:
20483 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMAXV, N, DAG);
20484 case Intrinsic::aarch64_neon_fmax:
20485 return DAG.getNode(Opcode: ISD::FMAXIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20486 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20487 case Intrinsic::aarch64_neon_fmin:
20488 return DAG.getNode(Opcode: ISD::FMINIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20489 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20490 case Intrinsic::aarch64_neon_fmaxnm:
20491 return DAG.getNode(Opcode: ISD::FMAXNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20492 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20493 case Intrinsic::aarch64_neon_fminnm:
20494 return DAG.getNode(Opcode: ISD::FMINNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20495 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20496 case Intrinsic::aarch64_neon_smull:
20497 return DAG.getNode(Opcode: AArch64ISD::SMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20498 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20499 case Intrinsic::aarch64_neon_umull:
20500 return DAG.getNode(Opcode: AArch64ISD::UMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20501 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20502 case Intrinsic::aarch64_neon_pmull:
20503 return DAG.getNode(Opcode: AArch64ISD::PMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20504 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20505 case Intrinsic::aarch64_neon_sqdmull:
20506 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20507 case Intrinsic::aarch64_neon_sqshl:
20508 case Intrinsic::aarch64_neon_uqshl:
20509 case Intrinsic::aarch64_neon_sqshlu:
20510 case Intrinsic::aarch64_neon_srshl:
20511 case Intrinsic::aarch64_neon_urshl:
20512 case Intrinsic::aarch64_neon_sshl:
20513 case Intrinsic::aarch64_neon_ushl:
20514 return tryCombineShiftImm(IID, N, DAG);
20515 case Intrinsic::aarch64_neon_sabd:
20516 return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20517 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20518 case Intrinsic::aarch64_neon_uabd:
20519 return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20520 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20521 case Intrinsic::aarch64_crc32b:
20522 case Intrinsic::aarch64_crc32cb:
20523 return tryCombineCRC32(Mask: 0xff, N, DAG);
20524 case Intrinsic::aarch64_crc32h:
20525 case Intrinsic::aarch64_crc32ch:
20526 return tryCombineCRC32(Mask: 0xffff, N, DAG);
20527 case Intrinsic::aarch64_sve_saddv:
20528 // There is no i64 version of SADDV because the sign is irrelevant.
20529 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20530 return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG);
20531 else
20532 return combineSVEReductionInt(N, Opc: AArch64ISD::SADDV_PRED, DAG);
20533 case Intrinsic::aarch64_sve_uaddv:
20534 return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG);
20535 case Intrinsic::aarch64_sve_smaxv:
20536 return combineSVEReductionInt(N, Opc: AArch64ISD::SMAXV_PRED, DAG);
20537 case Intrinsic::aarch64_sve_umaxv:
20538 return combineSVEReductionInt(N, Opc: AArch64ISD::UMAXV_PRED, DAG);
20539 case Intrinsic::aarch64_sve_sminv:
20540 return combineSVEReductionInt(N, Opc: AArch64ISD::SMINV_PRED, DAG);
20541 case Intrinsic::aarch64_sve_uminv:
20542 return combineSVEReductionInt(N, Opc: AArch64ISD::UMINV_PRED, DAG);
20543 case Intrinsic::aarch64_sve_orv:
20544 return combineSVEReductionInt(N, Opc: AArch64ISD::ORV_PRED, DAG);
20545 case Intrinsic::aarch64_sve_eorv:
20546 return combineSVEReductionInt(N, Opc: AArch64ISD::EORV_PRED, DAG);
20547 case Intrinsic::aarch64_sve_andv:
20548 return combineSVEReductionInt(N, Opc: AArch64ISD::ANDV_PRED, DAG);
20549 case Intrinsic::aarch64_sve_index:
20550 return LowerSVEIntrinsicIndex(N, DAG);
20551 case Intrinsic::aarch64_sve_dup:
20552 return LowerSVEIntrinsicDUP(N, DAG);
20553 case Intrinsic::aarch64_sve_dup_x:
20554 return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20555 Operand: N->getOperand(Num: 1));
20556 case Intrinsic::aarch64_sve_ext:
20557 return LowerSVEIntrinsicEXT(N, DAG);
20558 case Intrinsic::aarch64_sve_mul_u:
20559 return DAG.getNode(Opcode: AArch64ISD::MUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20560 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20561 case Intrinsic::aarch64_sve_smulh_u:
20562 return DAG.getNode(Opcode: AArch64ISD::MULHS_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20563 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20564 case Intrinsic::aarch64_sve_umulh_u:
20565 return DAG.getNode(Opcode: AArch64ISD::MULHU_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20566 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20567 case Intrinsic::aarch64_sve_smin_u:
20568 return DAG.getNode(Opcode: AArch64ISD::SMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20569 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20570 case Intrinsic::aarch64_sve_umin_u:
20571 return DAG.getNode(Opcode: AArch64ISD::UMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20572 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20573 case Intrinsic::aarch64_sve_smax_u:
20574 return DAG.getNode(Opcode: AArch64ISD::SMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20575 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20576 case Intrinsic::aarch64_sve_umax_u:
20577 return DAG.getNode(Opcode: AArch64ISD::UMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20578 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20579 case Intrinsic::aarch64_sve_lsl_u:
20580 return DAG.getNode(Opcode: AArch64ISD::SHL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20581 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20582 case Intrinsic::aarch64_sve_lsr_u:
20583 return DAG.getNode(Opcode: AArch64ISD::SRL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20584 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20585 case Intrinsic::aarch64_sve_asr_u:
20586 return DAG.getNode(Opcode: AArch64ISD::SRA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20587 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20588 case Intrinsic::aarch64_sve_fadd_u:
20589 return DAG.getNode(Opcode: AArch64ISD::FADD_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20590 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20591 case Intrinsic::aarch64_sve_fdiv_u:
20592 return DAG.getNode(Opcode: AArch64ISD::FDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20593 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20594 case Intrinsic::aarch64_sve_fmax_u:
20595 return DAG.getNode(Opcode: AArch64ISD::FMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20596 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20597 case Intrinsic::aarch64_sve_fmaxnm_u:
20598 return DAG.getNode(Opcode: AArch64ISD::FMAXNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20599 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20600 case Intrinsic::aarch64_sve_fmla_u:
20601 return DAG.getNode(Opcode: AArch64ISD::FMA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20602 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 3), N3: N->getOperand(Num: 4),
20603 N4: N->getOperand(Num: 2));
20604 case Intrinsic::aarch64_sve_fmin_u:
20605 return DAG.getNode(Opcode: AArch64ISD::FMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20606 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20607 case Intrinsic::aarch64_sve_fminnm_u:
20608 return DAG.getNode(Opcode: AArch64ISD::FMINNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20609 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20610 case Intrinsic::aarch64_sve_fmul_u:
20611 return DAG.getNode(Opcode: AArch64ISD::FMUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20612 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20613 case Intrinsic::aarch64_sve_fsub_u:
20614 return DAG.getNode(Opcode: AArch64ISD::FSUB_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20615 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20616 case Intrinsic::aarch64_sve_add_u:
20617 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
20618 N2: N->getOperand(Num: 3));
20619 case Intrinsic::aarch64_sve_sub_u:
20620 return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
20621 N2: N->getOperand(Num: 3));
20622 case Intrinsic::aarch64_sve_subr:
20623 return convertMergedOpToPredOp(N, Opc: ISD::SUB, DAG, UnpredOp: true, SwapOperands: true);
20624 case Intrinsic::aarch64_sve_and_u:
20625 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
20626 N2: N->getOperand(Num: 3));
20627 case Intrinsic::aarch64_sve_bic_u:
20628 return DAG.getNode(Opcode: AArch64ISD::BIC, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20629 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
20630 case Intrinsic::aarch64_sve_eor_u:
20631 return DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
20632 N2: N->getOperand(Num: 3));
20633 case Intrinsic::aarch64_sve_orr_u:
20634 return DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
20635 N2: N->getOperand(Num: 3));
20636 case Intrinsic::aarch64_sve_sabd_u:
20637 return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20638 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
20639 case Intrinsic::aarch64_sve_uabd_u:
20640 return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20641 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
20642 case Intrinsic::aarch64_sve_sdiv_u:
20643 return DAG.getNode(Opcode: AArch64ISD::SDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20644 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20645 case Intrinsic::aarch64_sve_udiv_u:
20646 return DAG.getNode(Opcode: AArch64ISD::UDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20647 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20648 case Intrinsic::aarch64_sve_sqadd:
20649 return convertMergedOpToPredOp(N, Opc: ISD::SADDSAT, DAG, UnpredOp: true);
20650 case Intrinsic::aarch64_sve_sqsub_u:
20651 return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20652 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
20653 case Intrinsic::aarch64_sve_uqadd:
20654 return convertMergedOpToPredOp(N, Opc: ISD::UADDSAT, DAG, UnpredOp: true);
20655 case Intrinsic::aarch64_sve_uqsub_u:
20656 return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20657 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
20658 case Intrinsic::aarch64_sve_sqadd_x:
20659 return DAG.getNode(Opcode: ISD::SADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20660 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20661 case Intrinsic::aarch64_sve_sqsub_x:
20662 return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20663 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20664 case Intrinsic::aarch64_sve_uqadd_x:
20665 return DAG.getNode(Opcode: ISD::UADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20666 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20667 case Intrinsic::aarch64_sve_uqsub_x:
20668 return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20669 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20670 case Intrinsic::aarch64_sve_asrd:
20671 return DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20672 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20673 case Intrinsic::aarch64_sve_cmphs:
20674 if (!N->getOperand(Num: 2).getValueType().isFloatingPoint())
20675 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20676 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20677 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGE));
20678 break;
20679 case Intrinsic::aarch64_sve_cmphi:
20680 if (!N->getOperand(Num: 2).getValueType().isFloatingPoint())
20681 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20682 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20683 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGT));
20684 break;
20685 case Intrinsic::aarch64_sve_fcmpge:
20686 case Intrinsic::aarch64_sve_cmpge:
20687 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20688 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20689 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGE));
20690 break;
20691 case Intrinsic::aarch64_sve_fcmpgt:
20692 case Intrinsic::aarch64_sve_cmpgt:
20693 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20694 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20695 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGT));
20696 break;
20697 case Intrinsic::aarch64_sve_fcmpeq:
20698 case Intrinsic::aarch64_sve_cmpeq:
20699 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20700 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20701 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETEQ));
20702 break;
20703 case Intrinsic::aarch64_sve_fcmpne:
20704 case Intrinsic::aarch64_sve_cmpne:
20705 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20706 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20707 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETNE));
20708 break;
20709 case Intrinsic::aarch64_sve_fcmpuo:
20710 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20711 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20712 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUO));
20713 break;
20714 case Intrinsic::aarch64_sve_fadda:
20715 return combineSVEReductionOrderedFP(N, Opc: AArch64ISD::FADDA_PRED, DAG);
20716 case Intrinsic::aarch64_sve_faddv:
20717 return combineSVEReductionFP(N, Opc: AArch64ISD::FADDV_PRED, DAG);
20718 case Intrinsic::aarch64_sve_fmaxnmv:
20719 return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXNMV_PRED, DAG);
20720 case Intrinsic::aarch64_sve_fmaxv:
20721 return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXV_PRED, DAG);
20722 case Intrinsic::aarch64_sve_fminnmv:
20723 return combineSVEReductionFP(N, Opc: AArch64ISD::FMINNMV_PRED, DAG);
20724 case Intrinsic::aarch64_sve_fminv:
20725 return combineSVEReductionFP(N, Opc: AArch64ISD::FMINV_PRED, DAG);
20726 case Intrinsic::aarch64_sve_sel:
20727 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20728 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20729 case Intrinsic::aarch64_sve_cmpeq_wide:
20730 return tryConvertSVEWideCompare(N, CC: ISD::SETEQ, DCI, DAG);
20731 case Intrinsic::aarch64_sve_cmpne_wide:
20732 return tryConvertSVEWideCompare(N, CC: ISD::SETNE, DCI, DAG);
20733 case Intrinsic::aarch64_sve_cmpge_wide:
20734 return tryConvertSVEWideCompare(N, CC: ISD::SETGE, DCI, DAG);
20735 case Intrinsic::aarch64_sve_cmpgt_wide:
20736 return tryConvertSVEWideCompare(N, CC: ISD::SETGT, DCI, DAG);
20737 case Intrinsic::aarch64_sve_cmplt_wide:
20738 return tryConvertSVEWideCompare(N, CC: ISD::SETLT, DCI, DAG);
20739 case Intrinsic::aarch64_sve_cmple_wide:
20740 return tryConvertSVEWideCompare(N, CC: ISD::SETLE, DCI, DAG);
20741 case Intrinsic::aarch64_sve_cmphs_wide:
20742 return tryConvertSVEWideCompare(N, CC: ISD::SETUGE, DCI, DAG);
20743 case Intrinsic::aarch64_sve_cmphi_wide:
20744 return tryConvertSVEWideCompare(N, CC: ISD::SETUGT, DCI, DAG);
20745 case Intrinsic::aarch64_sve_cmplo_wide:
20746 return tryConvertSVEWideCompare(N, CC: ISD::SETULT, DCI, DAG);
20747 case Intrinsic::aarch64_sve_cmpls_wide:
20748 return tryConvertSVEWideCompare(N, CC: ISD::SETULE, DCI, DAG);
20749 case Intrinsic::aarch64_sve_ptest_any:
20750 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
20751 Cond: AArch64CC::ANY_ACTIVE);
20752 case Intrinsic::aarch64_sve_ptest_first:
20753 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
20754 Cond: AArch64CC::FIRST_ACTIVE);
20755 case Intrinsic::aarch64_sve_ptest_last:
20756 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
20757 Cond: AArch64CC::LAST_ACTIVE);
20758 }
20759 return SDValue();
20760}
20761
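// Returns true if extending N is expected to be cheap, i.e. it is a (masked)
// load that can be turned into an extending load, or a splat of zeros.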
20762static bool isCheapToExtend(const SDValue &N) {
20763 unsigned OC = N->getOpcode();
20764 return OC == ISD::LOAD || OC == ISD::MLOAD ||
20765 ISD::isConstantSplatVectorAllZeros(N: N.getNode());
20766}
20767
20768static SDValue
20769performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20770 SelectionDAG &DAG) {
20771 // If we have (sext (setcc A B)) and A and B are cheap to extend,
20772 // we can move the sext into the arguments and have the same result. For
20773 // example, if A and B are both loads, we can make those extending loads and
20774 // avoid an extra instruction. This pattern appears often in VLS code
20775 // generation where the inputs to the setcc have a different size to the
20776 // instruction that wants to use the result of the setcc.
20777 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20778 N->getOperand(0)->getOpcode() == ISD::SETCC);
20779 const SDValue SetCC = N->getOperand(Num: 0);
20780
20781 const SDValue CCOp0 = SetCC.getOperand(i: 0);
20782 const SDValue CCOp1 = SetCC.getOperand(i: 1);
20783 if (!CCOp0->getValueType(ResNo: 0).isInteger() ||
20784 !CCOp1->getValueType(ResNo: 0).isInteger())
20785 return SDValue();
20786
20787 ISD::CondCode Code =
20788 cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get();
20789
20790 ISD::NodeType ExtType =
20791 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20792
20793 if (isCheapToExtend(N: SetCC.getOperand(i: 0)) &&
20794 isCheapToExtend(N: SetCC.getOperand(i: 1))) {
20795 const SDValue Ext1 =
20796 DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp0);
20797 const SDValue Ext2 =
20798 DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp1);
20799
20800 return DAG.getSetCC(
20801 DL: SDLoc(SetCC), VT: N->getValueType(ResNo: 0), LHS: Ext1, RHS: Ext2,
20802 Cond: cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get());
20803 }
20804
20805 return SDValue();
20806}
20807
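// Combine vector extend nodes: push a zero_extend of an absolute difference
// towards the extract_high/DUP operands (enabling sabdl2/uabdl2 selection) and
// sink sign extends of fixed-length setcc results into the setcc operands.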
20808static SDValue performExtendCombine(SDNode *N,
20809 TargetLowering::DAGCombinerInfo &DCI,
20810 SelectionDAG &DAG) {
20811 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20812 // we can convert that DUP into another extract_high (of a bigger DUP), which
20813 // helps the backend to decide that an sabdl2 would be useful, saving a real
20814 // extract_high operation.
20815 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20816 (N->getOperand(Num: 0).getOpcode() == ISD::ABDU ||
20817 N->getOperand(Num: 0).getOpcode() == ISD::ABDS)) {
20818 SDNode *ABDNode = N->getOperand(Num: 0).getNode();
20819 SDValue NewABD =
20820 tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N: ABDNode, DCI, DAG);
20821 if (!NewABD.getNode())
20822 return SDValue();
20823
20824 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: NewABD);
20825 }
20826
20827 if (N->getValueType(ResNo: 0).isFixedLengthVector() &&
20828 N->getOpcode() == ISD::SIGN_EXTEND &&
20829 N->getOperand(Num: 0)->getOpcode() == ISD::SETCC)
20830 return performSignExtendSetCCCombine(N, DCI, DAG);
20831
20832 return SDValue();
20833}
20834
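// Replace a store of a splatted vector value with NumVecElts scalar stores of
// SplatVal, which the load/store optimizer can later merge into store pairs.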
20835static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
20836 SDValue SplatVal, unsigned NumVecElts) {
20837 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
20838 Align OrigAlignment = St.getAlign();
20839 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
20840
20841 // Create scalar stores. This is at least as good as the code sequence for a
20842 // split unaligned store which is a dup.s, ext.b, and two stores.
20843 // Most of the time the three stores should be replaced by store pair
20844 // instructions (stp).
20845 SDLoc DL(&St);
20846 SDValue BasePtr = St.getBasePtr();
20847 uint64_t BaseOffset = 0;
20848
20849 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
20850 SDValue NewST1 =
20851 DAG.getStore(Chain: St.getChain(), dl: DL, Val: SplatVal, Ptr: BasePtr, PtrInfo,
20852 Alignment: OrigAlignment, MMOFlags: St.getMemOperand()->getFlags());
20853
20854  // As this is in ISel, we will not merge this add, which may degrade results.
20855 if (BasePtr->getOpcode() == ISD::ADD &&
20856 isa<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))) {
20857 BaseOffset = cast<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))->getSExtValue();
20858 BasePtr = BasePtr->getOperand(Num: 0);
20859 }
20860
20861 unsigned Offset = EltOffset;
20862 while (--NumVecElts) {
20863 Align Alignment = commonAlignment(A: OrigAlignment, Offset);
20864 SDValue OffsetPtr =
20865 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20866 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
20867 NewST1 = DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SplatVal, Ptr: OffsetPtr,
20868 PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
20869 MMOFlags: St.getMemOperand()->getFlags());
20870 Offset += EltOffset;
20871 }
20872 return NewST1;
20873}
20874
20875// Returns an SVE type that ContentTy can be trivially sign or zero extended
20876// into.
20877static MVT getSVEContainerType(EVT ContentTy) {
20878 assert(ContentTy.isSimple() && "No SVE containers for extended types");
20879
20880 switch (ContentTy.getSimpleVT().SimpleTy) {
20881 default:
20882 llvm_unreachable("No known SVE container for this MVT type");
20883 case MVT::nxv2i8:
20884 case MVT::nxv2i16:
20885 case MVT::nxv2i32:
20886 case MVT::nxv2i64:
20887 case MVT::nxv2f32:
20888 case MVT::nxv2f64:
20889 return MVT::nxv2i64;
20890 case MVT::nxv4i8:
20891 case MVT::nxv4i16:
20892 case MVT::nxv4i32:
20893 case MVT::nxv4f32:
20894 return MVT::nxv4i32;
20895 case MVT::nxv8i8:
20896 case MVT::nxv8i16:
20897 case MVT::nxv8f16:
20898 case MVT::nxv8bf16:
20899 return MVT::nxv8i16;
20900 case MVT::nxv16i8:
20901 return MVT::nxv16i8;
20902 }
20903}
20904
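// Lower an SVE predicated load intrinsic to the given Opc node, performing the
// load in a legal SVE container type and truncating the result if necessary.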
20905static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
20906 SDLoc DL(N);
20907 EVT VT = N->getValueType(ResNo: 0);
20908
20909 if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
20910 return SDValue();
20911
20912 EVT ContainerVT = VT;
20913 if (ContainerVT.isInteger())
20914 ContainerVT = getSVEContainerType(ContentTy: ContainerVT);
20915
20916 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
20917 SDValue Ops[] = { N->getOperand(Num: 0), // Chain
20918 N->getOperand(Num: 2), // Pg
20919 N->getOperand(Num: 3), // Base
20920 DAG.getValueType(VT) };
20921
20922 SDValue Load = DAG.getNode(Opcode: Opc, DL, VTList: VTs, Ops);
20923 SDValue LoadChain = SDValue(Load.getNode(), 1);
20924
20925 if (ContainerVT.isInteger() && (VT != ContainerVT))
20926 Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Load.getValue(R: 0));
20927
20928 return DAG.getMergeValues(Ops: { Load, LoadChain }, dl: DL);
20929}
20930
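// Lower an SVE non-temporal load intrinsic to a masked load, bitcasting
// floating-point results through the equivalent integer type.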
20931static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
20932 SDLoc DL(N);
20933 EVT VT = N->getValueType(ResNo: 0);
20934 EVT PtrTy = N->getOperand(Num: 3).getValueType();
20935
20936 EVT LoadVT = VT;
20937 if (VT.isFloatingPoint())
20938 LoadVT = VT.changeTypeToInteger();
20939
20940 auto *MINode = cast<MemIntrinsicSDNode>(Val: N);
20941 SDValue PassThru = DAG.getConstant(Val: 0, DL, VT: LoadVT);
20942 SDValue L = DAG.getMaskedLoad(VT: LoadVT, dl: DL, Chain: MINode->getChain(),
20943 Base: MINode->getOperand(Num: 3), Offset: DAG.getUNDEF(VT: PtrTy),
20944 Mask: MINode->getOperand(Num: 2), Src0: PassThru,
20945 MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(),
20946 AM: ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding: false);
20947
20948 if (VT.isFloatingPoint()) {
20949 SDValue Ops[] = { DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: L), L.getValue(R: 1) };
20950 return DAG.getMergeValues(Ops, dl: DL);
20951 }
20952
20953 return L;
20954}
20955
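// Lower the SVE ld1rq/ld1ro replicating load intrinsics to the corresponding
// AArch64ISD node, loading as integer and bitcasting for floating-point types.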
20956template <unsigned Opcode>
20957static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
20958 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
20959 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
20960 "Unsupported opcode.");
20961 SDLoc DL(N);
20962 EVT VT = N->getValueType(ResNo: 0);
20963
20964 EVT LoadVT = VT;
20965 if (VT.isFloatingPoint())
20966 LoadVT = VT.changeTypeToInteger();
20967
20968 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 2), N->getOperand(Num: 3)};
20969 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
20970 SDValue LoadChain = SDValue(Load.getNode(), 1);
20971
20972 if (VT.isFloatingPoint())
20973 Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Load.getValue(R: 0));
20974
20975 return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL);
20976}
20977
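// Lower an SVE predicated store intrinsic to AArch64ISD::ST1_PRED, bitcasting
// or any-extending the data into the legal SVE container type first.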
20978static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
20979 SDLoc DL(N);
20980 SDValue Data = N->getOperand(Num: 2);
20981 EVT DataVT = Data.getValueType();
20982 EVT HwSrcVt = getSVEContainerType(ContentTy: DataVT);
20983 SDValue InputVT = DAG.getValueType(DataVT);
20984
20985 if (DataVT.isFloatingPoint())
20986 InputVT = DAG.getValueType(HwSrcVt);
20987
20988 SDValue SrcNew;
20989 if (Data.getValueType().isFloatingPoint())
20990 SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Data);
20991 else
20992 SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Data);
20993
20994 SDValue Ops[] = { N->getOperand(Num: 0), // Chain
20995 SrcNew,
20996 N->getOperand(Num: 4), // Base
20997 N->getOperand(Num: 3), // Pg
20998 InputVT
20999 };
21000
21001 return DAG.getNode(Opcode: AArch64ISD::ST1_PRED, DL, VT: N->getValueType(ResNo: 0), Ops);
21002}
21003
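// Lower an SVE non-temporal store intrinsic to a masked store, bitcasting
// floating-point data to the equivalent integer type.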
21004static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
21005 SDLoc DL(N);
21006
21007 SDValue Data = N->getOperand(Num: 2);
21008 EVT DataVT = Data.getValueType();
21009 EVT PtrTy = N->getOperand(Num: 4).getValueType();
21010
21011 if (DataVT.isFloatingPoint())
21012 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DataVT.changeTypeToInteger(), Operand: Data);
21013
21014 auto *MINode = cast<MemIntrinsicSDNode>(Val: N);
21015 return DAG.getMaskedStore(Chain: MINode->getChain(), dl: DL, Val: Data, Base: MINode->getOperand(Num: 4),
21016 Offset: DAG.getUNDEF(VT: PtrTy), Mask: MINode->getOperand(Num: 3),
21017 MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(),
21018 AM: ISD::UNINDEXED, IsTruncating: false, IsCompressing: false);
21019}
21020
21021/// Replace a vector store of a splat of zeros with scalar stores of WZR/XZR.
21022/// The load store optimizer pass will merge them into store pairs. This should
21023/// be better than a movi to create the vector zero followed by a vector store
21024/// if the zero constant is not re-used, since one instruction and one register
21025/// live range will be removed.
21026///
21027/// For example, the final generated code should be:
21028///
21029/// stp xzr, xzr, [x0]
21030///
21031/// instead of:
21032///
21033/// movi v0.2d, #0
21034/// str q0, [x0]
21035///
21036static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21037 SDValue StVal = St.getValue();
21038 EVT VT = StVal.getValueType();
21039
21040 // Avoid scalarizing zero splat stores for scalable vectors.
21041 if (VT.isScalableVector())
21042 return SDValue();
21043
21044 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21045 // 2, 3 or 4 i32 elements.
21046 int NumVecElts = VT.getVectorNumElements();
21047 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21048 VT.getVectorElementType().getSizeInBits() == 64) ||
21049 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21050 VT.getVectorElementType().getSizeInBits() == 32)))
21051 return SDValue();
21052
21053 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21054 return SDValue();
21055
21056 // If the zero constant has more than one use then the vector store could be
21057 // better since the constant mov will be amortized and stp q instructions
21058 // should be able to be formed.
21059 if (!StVal.hasOneUse())
21060 return SDValue();
21061
21062 // If the store is truncating then it's going down to i16 or smaller, which
21063 // means it can be implemented in a single store anyway.
21064 if (St.isTruncatingStore())
21065 return SDValue();
21066
21067 // If the immediate offset of the address operand is too large for the stp
21068 // instruction, then bail out.
21069 if (DAG.isBaseWithConstantOffset(Op: St.getBasePtr())) {
21070 int64_t Offset = St.getBasePtr()->getConstantOperandVal(Num: 1);
21071 if (Offset < -512 || Offset > 504)
21072 return SDValue();
21073 }
21074
21075 for (int I = 0; I < NumVecElts; ++I) {
21076 SDValue EltVal = StVal.getOperand(i: I);
21077 if (!isNullConstant(V: EltVal) && !isNullFPConstant(V: EltVal))
21078 return SDValue();
21079 }
21080
21081 // Use a CopyFromReg WZR/XZR here to prevent
21082 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21083 SDLoc DL(&St);
21084 unsigned ZeroReg;
21085 EVT ZeroVT;
21086 if (VT.getVectorElementType().getSizeInBits() == 32) {
21087 ZeroReg = AArch64::WZR;
21088 ZeroVT = MVT::i32;
21089 } else {
21090 ZeroReg = AArch64::XZR;
21091 ZeroVT = MVT::i64;
21092 }
21093 SDValue SplatVal =
21094 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: ZeroReg, VT: ZeroVT);
21095 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21096}
21097
21098/// Replace a vector store of a splatted scalar with scalar stores of that
21099/// value. The load store optimizer pass will merge them into store pairs.
21100/// This has better performance than a splat of the scalar followed by a split
21101/// vector store. Even if the stores are not merged, it is four stores vs. a
21102/// dup followed by an ext.b and two stores.
21103static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21104 SDValue StVal = St.getValue();
21105 EVT VT = StVal.getValueType();
21106
21107  // Don't replace floating point stores; they possibly won't be transformed to
21108  // stp because of the store pair suppress pass.
21109 if (VT.isFloatingPoint())
21110 return SDValue();
21111
21112 // We can express a splat as store pair(s) for 2 or 4 elements.
21113 unsigned NumVecElts = VT.getVectorNumElements();
21114 if (NumVecElts != 4 && NumVecElts != 2)
21115 return SDValue();
21116
21117 // If the store is truncating then it's going down to i16 or smaller, which
21118 // means it can be implemented in a single store anyway.
21119 if (St.isTruncatingStore())
21120 return SDValue();
21121
21122 // Check that this is a splat.
21123  // Make sure that each of the relevant vector element locations is inserted
21124  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21125 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21126 SDValue SplatVal;
21127 for (unsigned I = 0; I < NumVecElts; ++I) {
21128 // Check for insert vector elements.
21129 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21130 return SDValue();
21131
21132 // Check that same value is inserted at each vector element.
21133 if (I == 0)
21134 SplatVal = StVal.getOperand(i: 1);
21135 else if (StVal.getOperand(i: 1) != SplatVal)
21136 return SDValue();
21137
21138 // Check insert element index.
21139 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(Val: StVal.getOperand(i: 2));
21140 if (!CIndex)
21141 return SDValue();
21142 uint64_t IndexVal = CIndex->getZExtValue();
21143 if (IndexVal >= NumVecElts)
21144 return SDValue();
21145 IndexNotInserted.reset(position: IndexVal);
21146
21147 StVal = StVal.getOperand(i: 0);
21148 }
21149 // Check that all vector element locations were inserted to.
21150 if (IndexNotInserted.any())
21151 return SDValue();
21152
21153 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21154}
21155
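// Split or scalarize fixed-length vector stores where doing so is expected to
// be faster, e.g. zero/scalar splat stores and slow unaligned 128-bit stores.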
21156static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21157 SelectionDAG &DAG,
21158 const AArch64Subtarget *Subtarget) {
21159
21160 StoreSDNode *S = cast<StoreSDNode>(Val: N);
21161 if (S->isVolatile() || S->isIndexed())
21162 return SDValue();
21163
21164 SDValue StVal = S->getValue();
21165 EVT VT = StVal.getValueType();
21166
21167 if (!VT.isFixedLengthVector())
21168 return SDValue();
21169
21170 // If we get a splat of zeros, convert this vector store to a store of
21171 // scalars. They will be merged into store pairs of xzr thereby removing one
21172 // instruction and one register.
21173 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, St&: *S))
21174 return ReplacedZeroSplat;
21175
21176 // FIXME: The logic for deciding if an unaligned store should be split should
21177 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21178 // a call to that function here.
21179
21180 if (!Subtarget->isMisaligned128StoreSlow())
21181 return SDValue();
21182
21183 // Don't split at -Oz.
21184 if (DAG.getMachineFunction().getFunction().hasMinSize())
21185 return SDValue();
21186
21187 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21188 // those up regresses performance on micro-benchmarks and olden/bh.
21189 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21190 return SDValue();
21191
21192 // Split unaligned 16B stores. They are terrible for performance.
21193 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21194 // extensions can use this to mark that it does not want splitting to happen
21195 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21196 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21197 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21198 S->getAlign() <= Align(2))
21199 return SDValue();
21200
21201 // If we get a splat of a scalar convert this vector store to a store of
21202 // scalars. They will be merged into store pairs thereby removing two
21203 // instructions.
21204 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, St&: *S))
21205 return ReplacedSplat;
21206
21207 SDLoc DL(S);
21208
21209 // Split VT into two.
21210 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
21211 unsigned NumElts = HalfVT.getVectorNumElements();
21212 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21213 DAG.getConstant(0, DL, MVT::i64));
21214 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21215 DAG.getConstant(NumElts, DL, MVT::i64));
21216 SDValue BasePtr = S->getBasePtr();
21217 SDValue NewST1 =
21218 DAG.getStore(Chain: S->getChain(), dl: DL, Val: SubVector0, Ptr: BasePtr, PtrInfo: S->getPointerInfo(),
21219 Alignment: S->getAlign(), MMOFlags: S->getMemOperand()->getFlags());
21220 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21221 DAG.getConstant(8, DL, MVT::i64));
21222 return DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SubVector1, Ptr: OffsetPtr,
21223 PtrInfo: S->getPointerInfo(), Alignment: S->getAlign(),
21224 MMOFlags: S->getMemOperand()->getFlags());
21225}
21226
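// Fold away degenerate AArch64ISD::SPLICE nodes, e.g. when the second vector
// operand is undef.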
21227static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
21228  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
21229
21230 // splice(pg, op1, undef) -> op1
21231 if (N->getOperand(Num: 2).isUndef())
21232 return N->getOperand(Num: 1);
21233
21234 return SDValue();
21235}
21236
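// Simplify UUNPKLO/UUNPKHI nodes, e.g. folding an unpack of a masked load into
// a masked zero-extending load when the predicate pattern allows it.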
21237static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
21238 const AArch64Subtarget *Subtarget) {
21239 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21240 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21241 "Unexpected Opcode!");
21242
21243 // uunpklo/hi undef -> undef
21244 if (N->getOperand(Num: 0).isUndef())
21245 return DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
21246
21247 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21248 // extending load. We can do this even if this is already a masked
21249 // {z,}extload.
21250 if (N->getOperand(Num: 0).getOpcode() == ISD::MLOAD &&
21251 N->getOpcode() == AArch64ISD::UUNPKLO) {
21252 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(Val: N->getOperand(Num: 0));
21253 SDValue Mask = MLD->getMask();
21254 SDLoc DL(N);
21255
21256 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21257 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21258 (MLD->getPassThru()->isUndef() ||
21259 isZerosVector(N: MLD->getPassThru().getNode()))) {
21260 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21261 unsigned PgPattern = Mask->getConstantOperandVal(Num: 0);
21262 EVT VT = N->getValueType(ResNo: 0);
21263
21264 // Ensure we can double the size of the predicate pattern
21265 unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern);
21266 if (NumElts &&
21267 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21268 Mask =
21269 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21270 SDValue PassThru = DAG.getConstant(Val: 0, DL, VT);
21271 SDValue NewLoad = DAG.getMaskedLoad(
21272 VT, dl: DL, Chain: MLD->getChain(), Base: MLD->getBasePtr(), Offset: MLD->getOffset(), Mask,
21273 Src0: PassThru, MemVT: MLD->getMemoryVT(), MMO: MLD->getMemOperand(),
21274 AM: MLD->getAddressingMode(), ISD::ZEXTLOAD);
21275
21276 DAG.ReplaceAllUsesOfValueWith(From: SDValue(MLD, 1), To: NewLoad.getValue(R: 1));
21277
21278 return NewLoad;
21279 }
21280 }
21281 }
21282
21283 return SDValue();
21284}
21285
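// Returns true if N is a UZP1 that behaves as a halving truncate-and-concat of
// legal integer scalable types, e.g. nxv4i32 operands producing nxv8i16.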
21286static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
21287 if (N->getOpcode() != AArch64ISD::UZP1)
21288 return false;
21289 SDValue Op0 = N->getOperand(Num: 0);
21290 EVT SrcVT = Op0->getValueType(ResNo: 0);
21291 EVT DstVT = N->getValueType(ResNo: 0);
21292 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21293 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21294 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21295}
21296
21297// Try to combine rounding shifts where the operands come from an extend, and
21298// the result is truncated and combined into one vector.
21299// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
21300static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
21301 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21302 SDValue Op0 = N->getOperand(Num: 0);
21303 SDValue Op1 = N->getOperand(Num: 1);
21304 EVT ResVT = N->getValueType(ResNo: 0);
21305
21306 unsigned RshOpc = Op0.getOpcode();
21307 if (RshOpc != AArch64ISD::RSHRNB_I)
21308 return SDValue();
21309
21310 // Same op code and imm value?
21311 SDValue ShiftValue = Op0.getOperand(i: 1);
21312 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(i: 1))
21313 return SDValue();
21314
21315 // Same unextended operand value?
21316 SDValue Lo = Op0.getOperand(i: 0);
21317 SDValue Hi = Op1.getOperand(i: 0);
21318 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21319 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21320 return SDValue();
21321 SDValue OrigArg = Lo.getOperand(i: 0);
21322 if (OrigArg != Hi.getOperand(i: 0))
21323 return SDValue();
21324
21325 SDLoc DL(N);
21326 return DAG.getNode(Opcode: AArch64ISD::URSHR_I_PRED, DL, VT: ResVT,
21327 N1: getPredicateForVector(DAG, DL, VT: ResVT), N2: OrigArg,
21328 N3: ShiftValue);
21329}
21330
21331// Try to simplify:
21332// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21333// t2 = nxv8i16 srl(t1, ShiftValue)
21334// to
21335// t1 = nxv8i16 rshrnb(X, shiftvalue).
21336// rshrnb will zero the top half bits of each element. Therefore, this combine
21337// should only be performed when a following instruction with the rshrnb
21338// as an operand does not care about the top half of each element. For example,
21339// a uzp1 or a truncating store.
21340static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
21341 const AArch64Subtarget *Subtarget) {
21342 EVT VT = Srl->getValueType(ResNo: 0);
21343 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21344 return SDValue();
21345
21346 EVT ResVT;
21347 if (VT == MVT::nxv8i16)
21348 ResVT = MVT::nxv16i8;
21349 else if (VT == MVT::nxv4i32)
21350 ResVT = MVT::nxv8i16;
21351 else if (VT == MVT::nxv2i64)
21352 ResVT = MVT::nxv4i32;
21353 else
21354 return SDValue();
21355
21356 SDLoc DL(Srl);
21357 unsigned ShiftValue;
21358 SDValue RShOperand;
21359 if (!canLowerSRLToRoundingShiftForVT(Shift: Srl, ResVT, DAG, ShiftValue, RShOperand))
21360 return SDValue();
21361 SDValue Rshrnb = DAG.getNode(
21362 AArch64ISD::RSHRNB_I, DL, ResVT,
21363 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21364 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Rshrnb);
21365}
21366
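// Simplify UZP1 nodes: fold away redundant unpack/truncate/bitcast operands
// and recognize rounding narrowing-shift patterns.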
21367static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
21368 const AArch64Subtarget *Subtarget) {
21369 SDLoc DL(N);
21370 SDValue Op0 = N->getOperand(Num: 0);
21371 SDValue Op1 = N->getOperand(Num: 1);
21372 EVT ResVT = N->getValueType(ResNo: 0);
21373
21374 // uzp1(x, undef) -> concat(truncate(x), undef)
21375 if (Op1.getOpcode() == ISD::UNDEF) {
21376 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21377 switch (ResVT.getSimpleVT().SimpleTy) {
21378 default:
21379 break;
21380 case MVT::v16i8:
21381 BCVT = MVT::v8i16;
21382 HalfVT = MVT::v8i8;
21383 break;
21384 case MVT::v8i16:
21385 BCVT = MVT::v4i32;
21386 HalfVT = MVT::v4i16;
21387 break;
21388 case MVT::v4i32:
21389 BCVT = MVT::v2i64;
21390 HalfVT = MVT::v2i32;
21391 break;
21392 }
21393 if (BCVT != MVT::Other) {
21394 SDValue BC = DAG.getBitcast(VT: BCVT, V: Op0);
21395 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: HalfVT, Operand: BC);
21396 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ResVT, N1: Trunc,
21397 N2: DAG.getUNDEF(VT: HalfVT));
21398 }
21399 }
21400
21401 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21402 return Urshr;
21403
21404 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Op0, DAG, Subtarget))
21405 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Rshrnb, N2: Op1);
21406
21407 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Op1, DAG, Subtarget))
21408 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Rshrnb);
21409
21410 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21411 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21412 if (Op0.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) {
21413 SDValue X = Op0.getOperand(i: 0).getOperand(i: 0);
21414 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: X, N2: Op1);
21415 }
21416 }
21417
21418 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21419 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21420 if (Op1.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) {
21421 SDValue Z = Op1.getOperand(i: 0).getOperand(i: 1);
21422 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Z);
21423 }
21424 }
21425
21426  // These optimizations only work on little-endian targets.
21427 if (!DAG.getDataLayout().isLittleEndian())
21428 return SDValue();
21429
21430 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21431 // Example:
21432 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21433 // to
21434 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21435 if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
21436 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21437 if (Op0.getOperand(i: 0).getValueType() == Op1.getOperand(i: 0).getValueType()) {
21438 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0.getOperand(i: 0),
21439 N2: Op1.getOperand(i: 0));
21440 }
21441 }
21442
21443 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21444 return SDValue();
21445
21446 SDValue SourceOp0 = peekThroughBitcasts(V: Op0);
21447 SDValue SourceOp1 = peekThroughBitcasts(V: Op1);
21448
21449 // truncating uzp1(x, y) -> xtn(concat (x, y))
21450 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21451 EVT Op0Ty = SourceOp0.getValueType();
21452 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21453 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21454 SDValue Concat =
21455 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL,
21456 VT: Op0Ty.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()),
21457 N1: SourceOp0, N2: SourceOp1);
21458 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT, Operand: Concat);
21459 }
21460 }
21461
21462 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21463 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21464 SourceOp1.getOpcode() != ISD::TRUNCATE)
21465 return SDValue();
21466 SourceOp0 = SourceOp0.getOperand(i: 0);
21467 SourceOp1 = SourceOp1.getOperand(i: 0);
21468
21469 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
21470 !SourceOp0.getValueType().isSimple())
21471 return SDValue();
21472
21473 EVT ResultTy;
21474
21475 switch (SourceOp0.getSimpleValueType().SimpleTy) {
21476 case MVT::v2i64:
21477 ResultTy = MVT::v4i32;
21478 break;
21479 case MVT::v4i32:
21480 ResultTy = MVT::v8i16;
21481 break;
21482 case MVT::v8i16:
21483 ResultTy = MVT::v16i8;
21484 break;
21485 default:
21486 return SDValue();
21487 }
21488
21489 SDValue UzpOp0 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp0);
21490 SDValue UzpOp1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp1);
21491 SDValue UzpResult =
21492 DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UzpOp0.getValueType(), N1: UzpOp0, N2: UzpOp1);
21493
21494 EVT BitcastResultTy;
21495
21496 switch (ResVT.getSimpleVT().SimpleTy) {
21497 case MVT::v2i32:
21498 BitcastResultTy = MVT::v2i64;
21499 break;
21500 case MVT::v4i16:
21501 BitcastResultTy = MVT::v4i32;
21502 break;
21503 case MVT::v8i8:
21504 BitcastResultTy = MVT::v8i16;
21505 break;
21506 default:
21507 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
21508 }
21509
21510 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT,
21511 Operand: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: BitcastResultTy, Operand: UzpResult));
21512}
21513
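// Simplify SVE gather load nodes, e.g. folding a sign/zero extension of the
// vector of offsets into the gather itself.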
21514static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
21515 unsigned Opc = N->getOpcode();
21516
21517 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21518 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
21519 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21520 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
21521 "Invalid opcode.");
21522
21523 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21524 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21525 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21526 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21527 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21528 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
21529 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
21530 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
21531
21532 SDLoc DL(N);
21533 SDValue Chain = N->getOperand(Num: 0);
21534 SDValue Pg = N->getOperand(Num: 1);
21535 SDValue Base = N->getOperand(Num: 2);
21536 SDValue Offset = N->getOperand(Num: 3);
21537 SDValue Ty = N->getOperand(Num: 4);
21538
21539 EVT ResVT = N->getValueType(ResNo: 0);
21540
21541 const auto OffsetOpc = Offset.getOpcode();
21542 const bool OffsetIsZExt =
21543 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
21544 const bool OffsetIsSExt =
21545 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
21546
21547 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
21548 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21549 SDValue ExtPg = Offset.getOperand(i: 0);
21550 VTSDNode *ExtFrom = cast<VTSDNode>(Val: Offset.getOperand(i: 2).getNode());
21551 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21552
21553    // If the predicate for the sign- or zero-extended offset is the
21554    // same as the predicate used for this load and the sign-/zero-extension
21555    // was from 32 bits, fold the extension into the gather.
21556 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21557 SDValue UnextendedOffset = Offset.getOperand(i: 1);
21558
21559 unsigned NewOpc = getGatherVecOpcode(IsScaled: Scaled, IsSigned: OffsetIsSExt, NeedsExtend: true);
21560 if (Signed)
21561 NewOpc = getSignExtendedGatherOpcode(Opcode: NewOpc);
21562
21563 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21564 {Chain, Pg, Base, UnextendedOffset, Ty});
21565 }
21566 }
21567
21568 return SDValue();
21569}
21570
21571/// Optimize a vector shift instruction and its operand if shifted out
21572/// bits are not used.
21573static SDValue performVectorShiftCombine(SDNode *N,
21574 const AArch64TargetLowering &TLI,
21575 TargetLowering::DAGCombinerInfo &DCI) {
21576 assert(N->getOpcode() == AArch64ISD::VASHR ||
21577 N->getOpcode() == AArch64ISD::VLSHR);
21578
21579 SDValue Op = N->getOperand(Num: 0);
21580 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21581
21582 unsigned ShiftImm = N->getConstantOperandVal(Num: 1);
21583 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21584
21585  // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
21586 if (N->getOpcode() == AArch64ISD::VASHR &&
21587 Op.getOpcode() == AArch64ISD::VSHL &&
21588 N->getOperand(Num: 1) == Op.getOperand(i: 1))
21589 if (DCI.DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0)) > ShiftImm)
21590 return Op.getOperand(i: 0);
21591
21592 APInt ShiftedOutBits = APInt::getLowBitsSet(numBits: OpScalarSize, loBitsSet: ShiftImm);
21593 APInt DemandedMask = ~ShiftedOutBits;
21594
21595 if (TLI.SimplifyDemandedBits(Op, DemandedBits: DemandedMask, DCI))
21596 return SDValue(N, 0);
21597
21598 return SDValue();
21599}
21600
21601static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
21602 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21603 // This transform works in partnership with performSetCCPunpkCombine to
21604 // remove unnecessary transfer of predicates into standard registers and back
21605 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21606 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21607 MVT::i1) {
21608 SDValue CC = N->getOperand(Num: 0)->getOperand(Num: 0);
21609 auto VT = CC->getValueType(ResNo: 0).getHalfNumVectorElementsVT(Context&: *DAG.getContext());
21610 SDValue Unpk = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SDLoc(N), VT, N1: CC,
21611 N2: DAG.getVectorIdxConstant(Val: 0, DL: SDLoc(N)));
21612 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: Unpk);
21613 }
21614
21615 return SDValue();
21616}
21617
21618/// Target-specific DAG combine function for post-increment LD1 (lane) and
21619/// post-increment LD1R.
21620static SDValue performPostLD1Combine(SDNode *N,
21621 TargetLowering::DAGCombinerInfo &DCI,
21622 bool IsLaneOp) {
21623 if (DCI.isBeforeLegalizeOps())
21624 return SDValue();
21625
21626 SelectionDAG &DAG = DCI.DAG;
21627 EVT VT = N->getValueType(ResNo: 0);
21628
21629 if (!VT.is128BitVector() && !VT.is64BitVector())
21630 return SDValue();
21631
21632 unsigned LoadIdx = IsLaneOp ? 1 : 0;
21633 SDNode *LD = N->getOperand(Num: LoadIdx).getNode();
21634  // If it is not a LOAD, we cannot do this combine.
21635 if (LD->getOpcode() != ISD::LOAD)
21636 return SDValue();
21637
21638 // The vector lane must be a constant in the LD1LANE opcode.
21639 SDValue Lane;
21640 if (IsLaneOp) {
21641 Lane = N->getOperand(Num: 2);
21642 auto *LaneC = dyn_cast<ConstantSDNode>(Val&: Lane);
21643 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21644 return SDValue();
21645 }
21646
21647 LoadSDNode *LoadSDN = cast<LoadSDNode>(Val: LD);
21648 EVT MemVT = LoadSDN->getMemoryVT();
21649 // Check if memory operand is the same type as the vector element.
21650 if (MemVT != VT.getVectorElementType())
21651 return SDValue();
21652
21653 // Check if there are other uses. If so, do not combine as it will introduce
21654 // an extra load.
21655 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21656 ++UI) {
21657 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21658 continue;
21659 if (*UI != N)
21660 return SDValue();
21661 }
21662
21663 // If there is one use and it can splat the value, prefer that operation.
21664 // TODO: This could be expanded to more operations if they reliably use the
21665 // index variants.
21666 if (N->hasOneUse()) {
21667 unsigned UseOpc = N->use_begin()->getOpcode();
21668 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21669 return SDValue();
21670 }
21671
21672 SDValue Addr = LD->getOperand(Num: 1);
21673 SDValue Vector = N->getOperand(Num: 0);
21674 // Search for a use of the address operand that is an increment.
21675 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21676 Addr.getNode()->use_end(); UI != UE; ++UI) {
21677 SDNode *User = *UI;
21678 if (User->getOpcode() != ISD::ADD
21679 || UI.getUse().getResNo() != Addr.getResNo())
21680 continue;
21681
21682 // If the increment is a constant, it must match the memory ref size.
21683 SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0);
21684 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) {
21685 uint32_t IncVal = CInc->getZExtValue();
21686 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21687 if (IncVal != NumBytes)
21688 continue;
21689 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21690 }
21691
21692    // To avoid cycle construction, make sure that neither the load nor the add
21693    // is a predecessor of the other or of the Vector.
21694 SmallPtrSet<const SDNode *, 32> Visited;
21695 SmallVector<const SDNode *, 16> Worklist;
21696 Visited.insert(Ptr: Addr.getNode());
21697 Worklist.push_back(Elt: User);
21698 Worklist.push_back(Elt: LD);
21699 Worklist.push_back(Elt: Vector.getNode());
21700 if (SDNode::hasPredecessorHelper(N: LD, Visited, Worklist) ||
21701 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
21702 continue;
21703
21704 SmallVector<SDValue, 8> Ops;
21705 Ops.push_back(Elt: LD->getOperand(Num: 0)); // Chain
21706 if (IsLaneOp) {
21707 Ops.push_back(Elt: Vector); // The vector to be inserted
21708 Ops.push_back(Elt: Lane); // The lane to be inserted in the vector
21709 }
21710 Ops.push_back(Elt: Addr);
21711 Ops.push_back(Elt: Inc);
21712
21713 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21714 SDVTList SDTys = DAG.getVTList(VTs: Tys);
21715 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21716 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOp, dl: SDLoc(N), VTList: SDTys, Ops,
21717 MemVT,
21718 MMO: LoadSDN->getMemOperand());
21719
21720 // Update the uses.
21721 SDValue NewResults[] = {
21722 SDValue(LD, 0), // The result of load
21723 SDValue(UpdN.getNode(), 2) // Chain
21724 };
21725 DCI.CombineTo(N: LD, To: NewResults);
21726 DCI.CombineTo(N, Res: SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
21727 DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), 1)); // Write back register
21728
21729 break;
21730 }
21731 return SDValue();
21732}
21733
21734/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21735/// address translation.
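///
/// For example (illustrative), when TBI is in effect an explicit tag-clearing
/// mask ahead of a memory access is redundant and can be removed:
///   (load (and x, 0x00ffffffffffffff))  -->  (load x)
/// since only bits [55:0] of the address take part in translation.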
21736static bool performTBISimplification(SDValue Addr,
21737 TargetLowering::DAGCombinerInfo &DCI,
21738 SelectionDAG &DAG) {
21739 APInt DemandedMask = APInt::getLowBitsSet(numBits: 64, loBitsSet: 56);
21740 KnownBits Known;
21741 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
21742 !DCI.isBeforeLegalizeOps());
21743 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21744 if (TLI.SimplifyDemandedBits(Op: Addr, DemandedBits: DemandedMask, Known, TLO)) {
21745 DCI.CommitTargetLoweringOpt(TLO);
21746 return true;
21747 }
21748 return false;
21749}
21750
21751static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
21752 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21753 "Expected STORE dag node in input!");
21754
21755 if (auto Store = dyn_cast<StoreSDNode>(Val: N)) {
21756 if (!Store->isTruncatingStore() || Store->isIndexed())
21757 return SDValue();
21758 SDValue Ext = Store->getValue();
21759 auto ExtOpCode = Ext.getOpcode();
21760 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21761 ExtOpCode != ISD::ANY_EXTEND)
21762 return SDValue();
21763 SDValue Orig = Ext->getOperand(Num: 0);
21764 if (Store->getMemoryVT() != Orig.getValueType())
21765 return SDValue();
21766 return DAG.getStore(Chain: Store->getChain(), dl: SDLoc(Store), Val: Orig,
21767 Ptr: Store->getBasePtr(), MMO: Store->getMemOperand());
21768 }
21769
21770 return SDValue();
21771}
21772
21773 // A custom combine to lower a load of <3 x i8> to the more efficient sequence
21774 // below:
21775// ldrb wX, [x0, #2]
21776// ldrh wY, [x0]
21777// orr wX, wY, wX, lsl #16
21778// fmov s0, wX
21779//
21780// Note that an alternative sequence with even fewer (although usually more
21781// complex/expensive) instructions would be:
21782// ld1r.4h { v0 }, [x0], #2
21783// ld1.b { v0 }[2], [x0]
21784//
21785// Generating this sequence unfortunately results in noticeably worse codegen
21786// for code that extends the loaded v3i8, due to legalization breaking vector
21787// shuffle detection in a way that is very difficult to work around.
21788// TODO: Revisit once v3i8 legalization has been improved in general.
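//
// At the DAG level the combine emits roughly the following (illustrative):
//   t16 = (zext (load i16 [ptr]))          ; the low two bytes
//   t8  = (zext (load i8  [ptr + 2]))      ; the third byte
//   t32 = (or t16, (shl t8, 16))
//   res = (extract_subvector (bitcast t32 to v4i8), 0)   ; back to v3i8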
21789static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
21790 EVT MemVT = LD->getMemoryVT();
21791 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21792 LD->getOriginalAlign() >= 4)
21793 return SDValue();
21794
21795 SDLoc DL(LD);
21796 MachineFunction &MF = DAG.getMachineFunction();
21797 SDValue Chain = LD->getChain();
21798 SDValue BasePtr = LD->getBasePtr();
21799 MachineMemOperand *MMO = LD->getMemOperand();
21800 assert(LD->getOffset().isUndef() && "undef offset expected");
21801
21802 // Load 2 x i8, then 1 x i8.
21803 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
21804 TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2);
21805 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
21806 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
21807 MF.getMachineMemOperand(MMO, 2, 1));
21808
21809 // Extend to i32.
21810 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21811 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21812
21813 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21814 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21815 DAG.getConstant(16, DL, MVT::i32));
21816 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
21817 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21818
21819 // Extract v3i8 again.
21820 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21821 DAG.getConstant(0, DL, MVT::i64));
21822 SDValue TokenFactor = DAG.getNode(
21823 ISD::TokenFactor, DL, MVT::Other,
21824 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21825 return DAG.getMergeValues(Ops: {Extract, TokenFactor}, dl: DL);
21826}
21827
21828 // Perform TBI simplification if supported by the target, and try to break up
21829 // non-temporal loads of odd sizes larger than 256 bits so that 256-bit LDNP
21830 // (Q-register) load instructions can be selected.
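// For example (illustrative), a non-temporal load of v12i32 (384 bits) is
// rebuilt below as one 256-bit v8i32 load at offset 0 plus a v4i32 load at
// offset 32 bytes; the pieces are concatenated and the original v12i32 is
// extracted from the result, so the wide part can select to an LDNP of two
// Q registers.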
21831static SDValue performLOADCombine(SDNode *N,
21832 TargetLowering::DAGCombinerInfo &DCI,
21833 SelectionDAG &DAG,
21834 const AArch64Subtarget *Subtarget) {
21835 if (Subtarget->supportsAddressTopByteIgnored())
21836 performTBISimplification(Addr: N->getOperand(Num: 1), DCI, DAG);
21837
21838 LoadSDNode *LD = cast<LoadSDNode>(Val: N);
21839 if (LD->isVolatile() || !Subtarget->isLittleEndian())
21840 return SDValue(N, 0);
21841
21842 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21843 return Res;
21844
21845 if (!LD->isNonTemporal())
21846 return SDValue(N, 0);
21847
21848 EVT MemVT = LD->getMemoryVT();
21849 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
21850 MemVT.getSizeInBits() % 256 == 0 ||
21851 256 % MemVT.getScalarSizeInBits() != 0)
21852 return SDValue(N, 0);
21853
21854 SDLoc DL(LD);
21855 SDValue Chain = LD->getChain();
21856 SDValue BasePtr = LD->getBasePtr();
21857 SDNodeFlags Flags = LD->getFlags();
21858 SmallVector<SDValue, 4> LoadOps;
21859 SmallVector<SDValue, 4> LoadOpsChain;
21860 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
21861 // plus a final vector load of fewer than 256 bits. This way we can utilize the
21862 // 256-bit loads and reduce the number of load instructions generated.
21863 MVT NewVT =
21864 MVT::getVectorVT(VT: MemVT.getVectorElementType().getSimpleVT(),
21865 NumElements: 256 / MemVT.getVectorElementType().getSizeInBits());
21866 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
21867 // Create all 256-bit loads, starting from offset 0 up to offset (Num256Loads - 1) * 32.
21868 for (unsigned I = 0; I < Num256Loads; I++) {
21869 unsigned PtrOffset = I * 32;
21870 SDValue NewPtr = DAG.getMemBasePlusOffset(
21871 Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags);
21872 Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset);
21873 SDValue NewLoad = DAG.getLoad(
21874 VT: NewVT, dl: DL, Chain, Ptr: NewPtr, PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset),
21875 Alignment: NewAlign, MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo());
21876 LoadOps.push_back(Elt: NewLoad);
21877 LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: NewLoad), 1));
21878 }
21879
21880 // Process the remaining bits of the load operation.
21881 // This is done by creating an UNDEF vector that matches the size of the
21882 // 256-bit loads and inserting the remaining load into it. We extract the
21883 // original load type at the end using an EXTRACT_SUBVECTOR.
21884 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
21885 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
21886 MVT RemainingVT = MVT::getVectorVT(
21887 VT: MemVT.getVectorElementType().getSimpleVT(),
21888 NumElements: BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
21889 SDValue NewPtr = DAG.getMemBasePlusOffset(
21890 Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags);
21891 Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset);
21892 SDValue RemainingLoad =
21893 DAG.getLoad(VT: RemainingVT, dl: DL, Chain, Ptr: NewPtr,
21894 PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset), Alignment: NewAlign,
21895 MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo());
21896 SDValue UndefVector = DAG.getUNDEF(VT: NewVT);
21897 SDValue InsertIdx = DAG.getVectorIdxConstant(Val: 0, DL);
21898 SDValue ExtendedRemainingLoad =
21899 DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewVT,
21900 Ops: {UndefVector, RemainingLoad, InsertIdx});
21901 LoadOps.push_back(Elt: ExtendedRemainingLoad);
21902 LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: RemainingLoad), 1));
21903 EVT ConcatVT =
21904 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getScalarType(),
21905 NumElements: LoadOps.size() * NewVT.getVectorNumElements());
21906 SDValue ConcatVectors =
21907 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ConcatVT, Ops: LoadOps);
21908 // Extract the original vector type size.
21909 SDValue ExtractSubVector =
21910 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MemVT,
21911 Ops: {ConcatVectors, DAG.getVectorIdxConstant(Val: 0, DL)});
21912 SDValue TokenFactor =
21913 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
21914 return DAG.getMergeValues(Ops: {ExtractSubVector, TokenFactor}, dl: DL);
21915}
21916
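// Starting from a boolean vector value, walk a few nodes up the DAG trying to
// recover the pre-truncation vector type it was computed from (via a SETCC or
// TRUNCATE). Returns INVALID_SIMPLE_VALUE_TYPE if no single consistent source
// type is found within the depth limit.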
21917static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
21918 EVT VecVT = Op.getValueType();
21919 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
21920 "Need boolean vector type.");
21921
21922 if (Depth > 3)
21923 return MVT::INVALID_SIMPLE_VALUE_TYPE;
21924
21925 // We can get the base type from a vector compare or truncate.
21926 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
21927 return Op.getOperand(i: 0).getValueType();
21928
21929 // If an operand is a bool vector, continue looking.
21930 EVT BaseVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
21931 for (SDValue Operand : Op->op_values()) {
21932 if (Operand.getValueType() != VecVT)
21933 continue;
21934
21935 EVT OperandVT = tryGetOriginalBoolVectorType(Op: Operand, Depth: Depth + 1);
21936 if (!BaseVT.isSimple())
21937 BaseVT = OperandVT;
21938 else if (OperandVT != BaseVT)
21939 return MVT::INVALID_SIMPLE_VALUE_TYPE;
21940 }
21941
21942 return BaseVT;
21943}
21944
21945// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
21946// iN, we can use a trick that extracts the i^th bit from the i^th element and
21947 // then performs a vector add reduction to get a scalar bitmask. This requires that each
21948// element's bits are either all 1 or all 0.
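// For example (illustrative), for a v4i32 input whose lanes are all-ones or
// all-zeros, the mask built below is <1, 2, 4, 8>; AND-ing and reducing with
// VECREDUCE_ADD gives a 4-bit bitmask, e.g.
//   <-1, 0, -1, -1> & <1, 2, 4, 8> = <1, 0, 4, 8>,  1 + 0 + 4 + 8 = 0b1101.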
21949static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
21950 SDLoc DL(N);
21951 SDValue ComparisonResult(N, 0);
21952 EVT VecVT = ComparisonResult.getValueType();
21953 assert(VecVT.isVector() && "Must be a vector type");
21954
21955 unsigned NumElts = VecVT.getVectorNumElements();
21956 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
21957 return SDValue();
21958
21959 if (VecVT.getVectorElementType() != MVT::i1 &&
21960 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
21961 return SDValue();
21962
21963 // If we can find the original types to work on instead of a vector of i1,
21964 // we can avoid extend/extract conversion instructions.
21965 if (VecVT.getVectorElementType() == MVT::i1) {
21966 VecVT = tryGetOriginalBoolVectorType(Op: ComparisonResult);
21967 if (!VecVT.isSimple()) {
21968 unsigned BitsPerElement = std::max(a: 64 / NumElts, b: 8u); // >= 64-bit vector
21969 VecVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: BitsPerElement), NumElements: NumElts);
21970 }
21971 }
21972 VecVT = VecVT.changeVectorElementTypeToInteger();
21973
21974 // Large vectors don't map directly to this conversion, so to avoid too many
21975 // edge cases, we don't apply it here. The conversion will likely still be
21976 // applied later via multiple smaller vectors, whose results are concatenated.
21977 if (VecVT.getSizeInBits() > 128)
21978 return SDValue();
21979
21980 // Ensure that all elements' bits are either 0s or 1s.
21981 ComparisonResult = DAG.getSExtOrTrunc(Op: ComparisonResult, DL, VT: VecVT);
21982
21983 SmallVector<SDValue, 16> MaskConstants;
21984 if (VecVT == MVT::v16i8) {
21985 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
21986 // per entry. We split it into two halves, apply the mask, zip the halves to
21987 // create 8x 16-bit values, and then perform the vector reduce.
21988 for (unsigned Half = 0; Half < 2; ++Half) {
21989 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
21990 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
21991 }
21992 }
21993 SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants);
21994 SDValue RepresentativeBits =
21995 DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask);
21996
21997 SDValue UpperRepresentativeBits =
21998 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
21999 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22000 SDValue Zipped = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: VecVT,
22001 N1: RepresentativeBits, N2: UpperRepresentativeBits);
22002 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22003 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22004 }
22005
22006 // All other vector sizes.
22007 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22008 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22009 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22010 }
22011
22012 SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants);
22013 SDValue RepresentativeBits =
22014 DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask);
22015 EVT ResultVT = MVT::getIntegerVT(BitWidth: std::max<unsigned>(
22016 a: NumElts, b: VecVT.getVectorElementType().getSizeInBits()));
22017 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ResultVT, Operand: RepresentativeBits);
22018}
22019
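// Fold a truncating store of a boolean vector into a scalar store of the
// equivalent bitmask, computed via vectorToScalarBitmask() above.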
22020static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22021 StoreSDNode *Store) {
22022 if (!Store->isTruncatingStore())
22023 return SDValue();
22024
22025 SDLoc DL(Store);
22026 SDValue VecOp = Store->getValue();
22027 EVT VT = VecOp.getValueType();
22028 EVT MemVT = Store->getMemoryVT();
22029
22030 if (!MemVT.isVector() || !VT.isVector() ||
22031 MemVT.getVectorElementType() != MVT::i1)
22032 return SDValue();
22033
22034 // If we are storing a vector that we are currently building, let
22035 // `scalarizeVectorStore()` handle this more efficiently.
22036 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22037 return SDValue();
22038
22039 VecOp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: VecOp);
22040 SDValue VectorBits = vectorToScalarBitmask(N: VecOp.getNode(), DAG);
22041 if (!VectorBits)
22042 return SDValue();
22043
22044 EVT StoreVT =
22045 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getStoreSizeInBits());
22046 SDValue ExtendedBits = DAG.getZExtOrTrunc(Op: VectorBits, DL, VT: StoreVT);
22047 return DAG.getStore(Chain: Store->getChain(), dl: DL, Val: ExtendedBits, Ptr: Store->getBasePtr(),
22048 MMO: Store->getMemOperand());
22049}
22050
22051bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22052 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22053 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22054 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22055}
22056
22057 // Combine a store of (trunc X to <3 x i8>) into a sequence of ST1.b stores.
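// As a rough illustration (assuming i32 source elements, so WideVT below is
// v4i32 and IdxScale is 4): the truncation source is widened to v4i32, bitcast
// to v16i8, and the three live bytes are extracted from lanes 8, 4 and 0 and
// stored individually at offsets 2, 1 and 0, which can select to three ST1.b
// (or STRB) stores.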
22058static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22059 const AArch64Subtarget *Subtarget) {
22060 SDValue Value = ST->getValue();
22061 EVT ValueVT = Value.getValueType();
22062
22063 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22064 Value.getOpcode() != ISD::TRUNCATE ||
22065 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22066 return SDValue();
22067
22068 assert(ST->getOffset().isUndef() && "undef offset expected");
22069 SDLoc DL(ST);
22070 auto WideVT = EVT::getVectorVT(
22071 Context&: *DAG.getContext(),
22072 VT: Value->getOperand(Num: 0).getValueType().getVectorElementType(), NumElements: 4);
22073 SDValue UndefVector = DAG.getUNDEF(VT: WideVT);
22074 SDValue WideTrunc = DAG.getNode(
22075 Opcode: ISD::INSERT_SUBVECTOR, DL, VT: WideVT,
22076 Ops: {UndefVector, Value->getOperand(Num: 0), DAG.getVectorIdxConstant(Val: 0, DL)});
22077 SDValue Cast = DAG.getNode(
22078 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22079 WideTrunc);
22080
22081 MachineFunction &MF = DAG.getMachineFunction();
22082 SDValue Chain = ST->getChain();
22083 MachineMemOperand *MMO = ST->getMemOperand();
22084 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22085 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22086 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22087 TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2);
22088 SDValue Ptr2 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset2, DL);
22089 Chain = DAG.getStore(Chain, dl: DL, Val: E2, Ptr: Ptr2, MMO: MF.getMachineMemOperand(MMO, Offset: 2, Size: 1));
22090
22091 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22092 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22093 TypeSize Offset1 = TypeSize::getFixed(ExactSize: 1);
22094 SDValue Ptr1 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset1, DL);
22095 Chain = DAG.getStore(Chain, dl: DL, Val: E1, Ptr: Ptr1, MMO: MF.getMachineMemOperand(MMO, Offset: 1, Size: 1));
22096
22097 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22098 DAG.getConstant(0, DL, MVT::i64));
22099 Chain = DAG.getStore(Chain, dl: DL, Val: E0, Ptr: ST->getBasePtr(),
22100 MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: 1));
22101 return Chain;
22102}
22103
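// Combine stores: lower <3 x i8> truncating stores, fold FP_ROUND values into
// truncating stores, split stores where profitable, simplify addresses when
// TBI is available, fold extends away from truncating stores, turn
// boolean-vector truncating stores into scalar bitmask stores, and try the
// RSHRNB combine for halving truncating stores.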
22104static SDValue performSTORECombine(SDNode *N,
22105 TargetLowering::DAGCombinerInfo &DCI,
22106 SelectionDAG &DAG,
22107 const AArch64Subtarget *Subtarget) {
22108 StoreSDNode *ST = cast<StoreSDNode>(Val: N);
22109 SDValue Chain = ST->getChain();
22110 SDValue Value = ST->getValue();
22111 SDValue Ptr = ST->getBasePtr();
22112 EVT ValueVT = Value.getValueType();
22113
22114 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22115 EVT EltVT = VT.getVectorElementType();
22116 return EltVT == MVT::f32 || EltVT == MVT::f64;
22117 };
22118
22119 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22120 return Res;
22121
22122 // If this is an FP_ROUND followed by a store, fold this into a truncating
22123 // store. We can do this even if this is already a truncstore.
22124 // We purposefully don't care about legality of the nodes here as we know
22125 // they can be split down into something legal.
22126 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22127 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22128 Subtarget->useSVEForFixedLengthVectors() &&
22129 ValueVT.isFixedLengthVector() &&
22130 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22131 hasValidElementTypeForFPTruncStore(Value.getOperand(i: 0).getValueType()))
22132 return DAG.getTruncStore(Chain, dl: SDLoc(N), Val: Value.getOperand(i: 0), Ptr,
22133 SVT: ST->getMemoryVT(), MMO: ST->getMemOperand());
22134
22135 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22136 return Split;
22137
22138 if (Subtarget->supportsAddressTopByteIgnored() &&
22139 performTBISimplification(Addr: N->getOperand(Num: 2), DCI, DAG))
22140 return SDValue(N, 0);
22141
22142 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22143 return Store;
22144
22145 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, Store: ST))
22146 return Store;
22147
22148 if (ST->isTruncatingStore()) {
22149 EVT StoreVT = ST->getMemoryVT();
22150 if (!isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: StoreVT))
22151 return SDValue();
22152 if (SDValue Rshrnb =
22153 trySimplifySrlAddToRshrnb(Srl: ST->getOperand(Num: 1), DAG, Subtarget)) {
22154 return DAG.getTruncStore(Chain: ST->getChain(), dl: ST, Val: Rshrnb, Ptr: ST->getBasePtr(),
22155 SVT: StoreVT, MMO: ST->getMemOperand());
22156 }
22157 }
22158
22159 return SDValue();
22160}
22161
22162static SDValue performMSTORECombine(SDNode *N,
22163 TargetLowering::DAGCombinerInfo &DCI,
22164 SelectionDAG &DAG,
22165 const AArch64Subtarget *Subtarget) {
22166 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(Val: N);
22167 SDValue Value = MST->getValue();
22168 SDValue Mask = MST->getMask();
22169 SDLoc DL(N);
22170
22171 // If this is a UZP1 followed by a masked store, fold this into a masked
22172 // truncating store. We can do this even if this is already a masked
22173 // truncstore.
22174 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22175 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22176 Value.getValueType().isInteger()) {
22177 Value = Value.getOperand(i: 0);
22178 if (Value.getOpcode() == ISD::BITCAST) {
22179 EVT HalfVT =
22180 Value.getValueType().getHalfNumVectorElementsVT(Context&: *DAG.getContext());
22181 EVT InVT = Value.getOperand(i: 0).getValueType();
22182
22183 if (HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext()) == InVT) {
22184 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22185 unsigned PgPattern = Mask->getConstantOperandVal(Num: 0);
22186
22187 // Ensure we can double the size of the predicate pattern
22188 unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern);
22189 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22190 MinSVESize) {
22191 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22192 PgPattern);
22193 return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Value.getOperand(i: 0),
22194 Base: MST->getBasePtr(), Offset: MST->getOffset(), Mask,
22195 MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(),
22196 AM: MST->getAddressingMode(),
22197 /*IsTruncating=*/true);
22198 }
22199 }
22200 }
22201 }
22202
22203 if (MST->isTruncatingStore()) {
22204 EVT ValueVT = Value->getValueType(ResNo: 0);
22205 EVT MemVT = MST->getMemoryVT();
22206 if (!isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: MemVT))
22207 return SDValue();
22208 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Value, DAG, Subtarget)) {
22209 return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Rshrnb, Base: MST->getBasePtr(),
22210 Offset: MST->getOffset(), Mask: MST->getMask(),
22211 MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(),
22212 AM: MST->getAddressingMode(), IsTruncating: true);
22213 }
22214 }
22215
22216 return SDValue();
22217}
22218
22219/// \return true if part of the index was folded into the Base.
22220static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22221 SDLoc DL, SelectionDAG &DAG) {
22222 // This function assumes a vector of i64 indices.
22223 EVT IndexVT = Index.getValueType();
22224 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22225 return false;
22226
22227 // Simplify:
22228 // BasePtr = Ptr
22229 // Index = X + splat(Offset)
22230 // ->
22231 // BasePtr = Ptr + Offset * scale.
22232 // Index = X
22233 if (Index.getOpcode() == ISD::ADD) {
22234 if (auto Offset = DAG.getSplatValue(V: Index.getOperand(i: 1))) {
22235 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22236 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22237 Index = Index.getOperand(i: 0);
22238 return true;
22239 }
22240 }
22241
22242 // Simplify:
22243 // BasePtr = Ptr
22244 // Index = (X + splat(Offset)) << splat(Shift)
22245 // ->
22246 // BasePtr = Ptr + (Offset << Shift) * scale
22247 // Index = X << splat(shift)
22248 if (Index.getOpcode() == ISD::SHL &&
22249 Index.getOperand(i: 0).getOpcode() == ISD::ADD) {
22250 SDValue Add = Index.getOperand(i: 0);
22251 SDValue ShiftOp = Index.getOperand(i: 1);
22252 SDValue OffsetOp = Add.getOperand(i: 1);
22253 if (auto Shift = DAG.getSplatValue(V: ShiftOp))
22254 if (auto Offset = DAG.getSplatValue(V: OffsetOp)) {
22255 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22256 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22257 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22258 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: Index.getValueType(),
22259 N1: Add.getOperand(i: 0), N2: ShiftOp);
22260 return true;
22261 }
22262 }
22263
22264 return false;
22265}
22266
22267// Analyse the specified address returning true if a more optimal addressing
22268// mode is available. When returning true all parameters are updated to reflect
22269// their recommended values.
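//
// For example (illustrative), a gather or scatter whose index is
//   X + splat(Offset)
// is rewritten so that Offset * Scale is folded into BasePtr and only X is
// left as the index; if the remaining index values are then provably
// representable in 32 bits, the index vector is also shrunk to i32 elements.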
22270static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
22271 SDValue &BasePtr, SDValue &Index,
22272 SelectionDAG &DAG) {
22273 // Try to iteratively fold parts of the index into the base pointer to
22274 // simplify the index as much as possible.
22275 bool Changed = false;
22276 while (foldIndexIntoBase(BasePtr, Index, Scale: N->getScale(), DL: SDLoc(N), DAG))
22277 Changed = true;
22278
22279 // Only consider element types that are pointer sized as smaller types can
22280 // be easily promoted.
22281 EVT IndexVT = Index.getValueType();
22282 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22283 return Changed;
22284
22285 // Can indices be trivially shrunk?
22286 EVT DataVT = N->getOperand(Num: 1).getValueType();
22287 // Don't attempt to shrink the index for fixed vectors of 64-bit data, since it
22288 // will later be re-extended to 64 bits during legalization.
22289 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22290 return Changed;
22291 if (ISD::isVectorShrinkable(N: Index.getNode(), NewEltSize: 32, Signed: N->isIndexSigned())) {
22292 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22293 Index = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: NewIndexVT, Operand: Index);
22294 return true;
22295 }
22296
22297 // Match:
22298 // Index = step(const)
22299 int64_t Stride = 0;
22300 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22301 Stride = cast<ConstantSDNode>(Val: Index.getOperand(i: 0))->getSExtValue();
22302 }
22303 // Match:
22304 // Index = step(const) << shift(const)
22305 else if (Index.getOpcode() == ISD::SHL &&
22306 Index.getOperand(i: 0).getOpcode() == ISD::STEP_VECTOR) {
22307 SDValue RHS = Index.getOperand(i: 1);
22308 if (auto *Shift =
22309 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: RHS))) {
22310 int64_t Step = (int64_t)Index.getOperand(i: 0).getConstantOperandVal(i: 1);
22311 Stride = Step << Shift->getZExtValue();
22312 }
22313 }
22314
22315 // Return early because no supported pattern is found.
22316 if (Stride == 0)
22317 return Changed;
22318
22319 if (Stride < std::numeric_limits<int32_t>::min() ||
22320 Stride > std::numeric_limits<int32_t>::max())
22321 return Changed;
22322
22323 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22324 unsigned MaxVScale =
22325 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
22326 int64_t LastElementOffset =
22327 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22328
22329 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22330 LastElementOffset > std::numeric_limits<int32_t>::max())
22331 return Changed;
22332
22333 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22334 // Stride does not scale explicitly by 'Scale', because it happens in
22335 // the gather/scatter addressing mode.
22336 Index = DAG.getStepVector(DL: SDLoc(N), ResVT: NewIndexVT, StepVal: APInt(32, Stride));
22337 return true;
22338}
22339
22340static SDValue performMaskedGatherScatterCombine(
22341 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
22342 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(Val: N);
22343 assert(MGS && "Can only combine gather load or scatter store nodes");
22344
22345 if (!DCI.isBeforeLegalize())
22346 return SDValue();
22347
22348 SDLoc DL(MGS);
22349 SDValue Chain = MGS->getChain();
22350 SDValue Scale = MGS->getScale();
22351 SDValue Index = MGS->getIndex();
22352 SDValue Mask = MGS->getMask();
22353 SDValue BasePtr = MGS->getBasePtr();
22354 ISD::MemIndexType IndexType = MGS->getIndexType();
22355
22356 if (!findMoreOptimalIndexType(N: MGS, BasePtr, Index, DAG))
22357 return SDValue();
22358
22359 // A more profitable BasePtr/Index pair was found above, so rebuild the gather
22360 // or scatter node with an Index that is more legalisation friendly.
22361 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(Val: MGS)) {
22362 SDValue PassThru = MGT->getPassThru();
22363 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22364 return DAG.getMaskedGather(
22365 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22366 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22367 }
22368 auto *MSC = cast<MaskedScatterSDNode>(Val: MGS);
22369 SDValue Data = MSC->getValue();
22370 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22371 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22372 Ops, MSC->getMemOperand(), IndexType,
22373 MSC->isTruncatingStore());
22374}
22375
22376/// Target-specific DAG combine function for NEON load/store intrinsics
22377/// to merge base address updates.
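///
/// For example (illustrative), an aarch64.neon.ld2 intrinsic that loads two
/// <4 x i32> values from %p, followed by a separate increment of %p by 32
/// bytes (the size of the data loaded), becomes a single AArch64ISD::LD2post
/// node that also produces the updated pointer, selecting to something like
///   ld2 { v0.4s, v1.4s }, [x0], #32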
22378static SDValue performNEONPostLDSTCombine(SDNode *N,
22379 TargetLowering::DAGCombinerInfo &DCI,
22380 SelectionDAG &DAG) {
22381 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22382 return SDValue();
22383
22384 unsigned AddrOpIdx = N->getNumOperands() - 1;
22385 SDValue Addr = N->getOperand(Num: AddrOpIdx);
22386
22387 // Search for a use of the address operand that is an increment.
22388 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22389 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22390 SDNode *User = *UI;
22391 if (User->getOpcode() != ISD::ADD ||
22392 UI.getUse().getResNo() != Addr.getResNo())
22393 continue;
22394
22395 // Check that the add is independent of the load/store. Otherwise, folding
22396 // it would create a cycle.
22397 SmallPtrSet<const SDNode *, 32> Visited;
22398 SmallVector<const SDNode *, 16> Worklist;
22399 Visited.insert(Ptr: Addr.getNode());
22400 Worklist.push_back(Elt: N);
22401 Worklist.push_back(Elt: User);
22402 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22403 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
22404 continue;
22405
22406 // Find the new opcode for the updating load/store.
22407 bool IsStore = false;
22408 bool IsLaneOp = false;
22409 bool IsDupOp = false;
22410 unsigned NewOpc = 0;
22411 unsigned NumVecs = 0;
22412 unsigned IntNo = N->getConstantOperandVal(Num: 1);
22413 switch (IntNo) {
22414 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22415 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22416 NumVecs = 2; break;
22417 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22418 NumVecs = 3; break;
22419 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22420 NumVecs = 4; break;
22421 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22422 NumVecs = 2; IsStore = true; break;
22423 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22424 NumVecs = 3; IsStore = true; break;
22425 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22426 NumVecs = 4; IsStore = true; break;
22427 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22428 NumVecs = 2; break;
22429 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22430 NumVecs = 3; break;
22431 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22432 NumVecs = 4; break;
22433 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22434 NumVecs = 2; IsStore = true; break;
22435 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22436 NumVecs = 3; IsStore = true; break;
22437 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22438 NumVecs = 4; IsStore = true; break;
22439 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22440 NumVecs = 2; IsDupOp = true; break;
22441 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22442 NumVecs = 3; IsDupOp = true; break;
22443 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22444 NumVecs = 4; IsDupOp = true; break;
22445 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22446 NumVecs = 2; IsLaneOp = true; break;
22447 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22448 NumVecs = 3; IsLaneOp = true; break;
22449 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22450 NumVecs = 4; IsLaneOp = true; break;
22451 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22452 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22453 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22454 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22455 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22456 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22457 }
22458
22459 EVT VecTy;
22460 if (IsStore)
22461 VecTy = N->getOperand(Num: 2).getValueType();
22462 else
22463 VecTy = N->getValueType(ResNo: 0);
22464
22465 // If the increment is a constant, it must match the memory ref size.
22466 SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0);
22467 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) {
22468 uint32_t IncVal = CInc->getZExtValue();
22469 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
22470 if (IsLaneOp || IsDupOp)
22471 NumBytes /= VecTy.getVectorNumElements();
22472 if (IncVal != NumBytes)
22473 continue;
22474 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22475 }
22476 SmallVector<SDValue, 8> Ops;
22477 Ops.push_back(Elt: N->getOperand(Num: 0)); // Incoming chain
22478 // Lane loads and stores take a vector list as input.
22479 if (IsLaneOp || IsStore)
22480 for (unsigned i = 2; i < AddrOpIdx; ++i)
22481 Ops.push_back(Elt: N->getOperand(Num: i));
22482 Ops.push_back(Elt: Addr); // Base register
22483 Ops.push_back(Elt: Inc);
22484
22485 // Return Types.
22486 EVT Tys[6];
22487 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
22488 unsigned n;
22489 for (n = 0; n < NumResultVecs; ++n)
22490 Tys[n] = VecTy;
22491 Tys[n++] = MVT::i64; // Type of write back register
22492 Tys[n] = MVT::Other; // Type of the chain
22493 SDVTList SDTys = DAG.getVTList(VTs: ArrayRef(Tys, NumResultVecs + 2));
22494
22495 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(Val: N);
22496 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOpc, dl: SDLoc(N), VTList: SDTys, Ops,
22497 MemVT: MemInt->getMemoryVT(),
22498 MMO: MemInt->getMemOperand());
22499
22500 // Update the uses.
22501 std::vector<SDValue> NewResults;
22502 for (unsigned i = 0; i < NumResultVecs; ++i) {
22503 NewResults.push_back(x: SDValue(UpdN.getNode(), i));
22504 }
22505 NewResults.push_back(x: SDValue(UpdN.getNode(), NumResultVecs + 1));
22506 DCI.CombineTo(N, To: NewResults);
22507 DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), NumResultVecs));
22508
22509 break;
22510 }
22511 return SDValue();
22512}
22513
22514// Checks to see if the value is the prescribed width and returns information
22515// about its extension mode.
22516static
22517bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
22518 ExtType = ISD::NON_EXTLOAD;
22519 switch(V.getNode()->getOpcode()) {
22520 default:
22521 return false;
22522 case ISD::LOAD: {
22523 LoadSDNode *LoadNode = cast<LoadSDNode>(Val: V.getNode());
22524 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
22525 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
22526 ExtType = LoadNode->getExtensionType();
22527 return true;
22528 }
22529 return false;
22530 }
22531 case ISD::AssertSext: {
22532 VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1));
22533 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22534 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22535 ExtType = ISD::SEXTLOAD;
22536 return true;
22537 }
22538 return false;
22539 }
22540 case ISD::AssertZext: {
22541 VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1));
22542 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22543 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22544 ExtType = ISD::ZEXTLOAD;
22545 return true;
22546 }
22547 return false;
22548 }
22549 case ISD::Constant:
22550 case ISD::TargetConstant: {
22551 return std::abs(i: cast<ConstantSDNode>(Val: V.getNode())->getSExtValue()) <
22552 1LL << (width - 1);
22553 }
22554 }
22555
22556 return true;
22557}
22558
22559// This function does a whole lot of voodoo to determine if the tests are
22560// equivalent without and with a mask. Essentially what happens is that given a
22561// DAG resembling:
22562//
22563// +-------------+ +-------------+ +-------------+ +-------------+
22564// | Input | | AddConstant | | CompConstant| | CC |
22565// +-------------+ +-------------+ +-------------+ +-------------+
22566// | | | |
22567// V V | +----------+
22568// +-------------+ +----+ | |
22569// | ADD | |0xff| | |
22570// +-------------+ +----+ | |
22571// | | | |
22572// V V | |
22573// +-------------+ | |
22574// | AND | | |
22575// +-------------+ | |
22576// | | |
22577// +-----+ | |
22578// | | |
22579// V V V
22580// +-------------+
22581// | CMP |
22582// +-------------+
22583//
22584// The AND node may be safely removed for some combinations of inputs. In
22585// particular we need to take into account the extension type of the Input,
22586// the exact values of AddConstant, CompConstant, and CC, along with the nominal
22587 // width of the input (this can work for inputs of any width; the above graph is
22588 // specific to 8 bits).
22589//
22590// The specific equations were worked out by generating output tables for each
22591 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
22592 // problem was simplified by working with 4 bit inputs, which means we only
22593 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
22594 // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
22595 // patterns present in both extensions (0,7). For every distinct set of
22596 // AddConstant and CompConstant bit patterns we can consider the masked and
22597 // unmasked versions to be equivalent if the result of this function is true for
22598 // all 16 distinct bit patterns for the current extension type of Input (w0).
22599//
22600// sub w8, w0, w1
22601// and w10, w8, #0x0f
22602// cmp w8, w2
22603// cset w9, AArch64CC
22604// cmp w10, w2
22605// cset w11, AArch64CC
22606// cmp w9, w11
22607// cset w0, eq
22608// ret
22609//
22610 // Since the above function shows when the outputs are equivalent, it defines
22611 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
22612 // would be expensive to run during compiles. The equations below were written
22613 // in a test harness that confirmed they give outputs equivalent to the above
22614 // function for all inputs, so they can be used instead to determine whether
22615 // the removal is legal.
22616 //
22617 // isEquivalentMaskless() is the test for whether the AND can be removed,
22618 // factored out of the DAG recognition because the DAG can take several forms.
22619
22620static bool isEquivalentMaskless(unsigned CC, unsigned width,
22621 ISD::LoadExtType ExtType, int AddConstant,
22622 int CompConstant) {
22623 // By being careful about our equations and writing them only in terms of
22624 // symbolic values and well-known constants (0, 1, -1, MaxUInt), we can
22625 // make them generally applicable to all bit widths.
22626 int MaxUInt = (1 << width);
22627
22628 // For the purposes of these comparisons sign extending the type is
22629 // equivalent to zero extending the add and displacing it by half the integer
22630 // width. Provided we are careful and make sure our equations are valid over
22631 // the whole range we can just adjust the input and avoid writing equations
22632 // for sign extended inputs.
22633 if (ExtType == ISD::SEXTLOAD)
22634 AddConstant -= (1 << (width-1));
22635
22636 switch(CC) {
22637 case AArch64CC::LE:
22638 case AArch64CC::GT:
22639 if ((AddConstant == 0) ||
22640 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
22641 (AddConstant >= 0 && CompConstant < 0) ||
22642 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
22643 return true;
22644 break;
22645 case AArch64CC::LT:
22646 case AArch64CC::GE:
22647 if ((AddConstant == 0) ||
22648 (AddConstant >= 0 && CompConstant <= 0) ||
22649 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
22650 return true;
22651 break;
22652 case AArch64CC::HI:
22653 case AArch64CC::LS:
22654 if ((AddConstant >= 0 && CompConstant < 0) ||
22655 (AddConstant <= 0 && CompConstant >= -1 &&
22656 CompConstant < AddConstant + MaxUInt))
22657 return true;
22658 break;
22659 case AArch64CC::PL:
22660 case AArch64CC::MI:
22661 if ((AddConstant == 0) ||
22662 (AddConstant > 0 && CompConstant <= 0) ||
22663 (AddConstant < 0 && CompConstant <= AddConstant))
22664 return true;
22665 break;
22666 case AArch64CC::LO:
22667 case AArch64CC::HS:
22668 if ((AddConstant >= 0 && CompConstant <= 0) ||
22669 (AddConstant <= 0 && CompConstant >= 0 &&
22670 CompConstant <= AddConstant + MaxUInt))
22671 return true;
22672 break;
22673 case AArch64CC::EQ:
22674 case AArch64CC::NE:
22675 if ((AddConstant > 0 && CompConstant < 0) ||
22676 (AddConstant < 0 && CompConstant >= 0 &&
22677 CompConstant < AddConstant + MaxUInt) ||
22678 (AddConstant >= 0 && CompConstant >= 0 &&
22679 CompConstant >= AddConstant) ||
22680 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
22681 return true;
22682 break;
22683 case AArch64CC::VS:
22684 case AArch64CC::VC:
22685 case AArch64CC::AL:
22686 case AArch64CC::NV:
22687 return true;
22688 case AArch64CC::Invalid:
22689 break;
22690 }
22691
22692 return false;
22693}
22694
22695 // (X & C) >u Mask --> (X & (C & ~Mask)) != 0
22696 // (X & C) <u Pow2 --> (X & (C & ~(Pow2-1))) == 0
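// For example (illustrative), with C = 0xff and Mask = 0x0f (so CC is HI):
//   SUBS (AND X, 0xff), 0x0f ; branch/select on HI
// becomes
//   ANDS X, 0xf0             ; branch/select on NE
// since (X & 0xff) >u 0x0f exactly when one of bits [7:4] of X is set.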
22697static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
22698 SDNode *AndNode, SelectionDAG &DAG,
22699 unsigned CCIndex, unsigned CmpIndex,
22700 unsigned CC) {
22701 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(Val: SubsNode->getOperand(Num: 1));
22702 if (!SubsC)
22703 return SDValue();
22704
22705 APInt SubsAP = SubsC->getAPIntValue();
22706 if (CC == AArch64CC::HI) {
22707 if (!SubsAP.isMask())
22708 return SDValue();
22709 } else if (CC == AArch64CC::LO) {
22710 if (!SubsAP.isPowerOf2())
22711 return SDValue();
22712 } else
22713 return SDValue();
22714
22715 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1));
22716 if (!AndC)
22717 return SDValue();
22718
22719 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
22720
22721 SDLoc DL(N);
22722 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
22723 SDValue ANDS = DAG.getNode(
22724 Opcode: AArch64ISD::ANDS, DL, VTList: SubsNode->getVTList(), N1: AndNode->getOperand(Num: 0),
22725 N2: DAG.getConstant(Val: AndSMask, DL, VT: SubsC->getValueType(ResNo: 0)));
22726 SDValue AArch64_CC =
22727 DAG.getConstant(Val: CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
22728 VT: N->getOperand(Num: CCIndex)->getValueType(ResNo: 0));
22729
22730 // For now, only performCSELCombine and performBRCONDCombine call this
22731 // function, and both pass 2 for CCIndex and 3 for CmpIndex on nodes with 4
22732 // operands, so the replacement operands are initialized directly to keep the
22733 // code simple. If a caller with different CCIndex/CmpIndex values is added,
22734 // this will need to be rewritten to copy the operands in a loop.
22735 // TODO: Should the number of operands (4) be asserted here as well?
22736 assert((CCIndex == 2 && CmpIndex == 3) &&
22737 "Expected CCIndex to be 2 and CmpIndex to be 3.");
22738 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), AArch64_CC,
22739 ANDS.getValue(R: 1)};
22740 return DAG.getNode(Opcode: N->getOpcode(), DL: N, VTList: N->getVTList(), Ops);
22741}
22742
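// Given a flag-consuming node N (currently a CSEL or BRCOND) whose condition
// comes from a SUBS of an AND, try to replace the SUBS with an ANDS (see
// performSubsToAndsCombine above) or prove via isEquivalentMaskless() that the
// AND is redundant and drop it.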
22743static
22744SDValue performCONDCombine(SDNode *N,
22745 TargetLowering::DAGCombinerInfo &DCI,
22746 SelectionDAG &DAG, unsigned CCIndex,
22747 unsigned CmpIndex) {
22748 unsigned CC = cast<ConstantSDNode>(Val: N->getOperand(Num: CCIndex))->getSExtValue();
22749 SDNode *SubsNode = N->getOperand(Num: CmpIndex).getNode();
22750 unsigned CondOpcode = SubsNode->getOpcode();
22751
22752 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(Value: 0))
22753 return SDValue();
22754
22755 // There is a SUBS feeding this condition. Is it fed by a mask we can
22756 // use?
22757
22758 SDNode *AndNode = SubsNode->getOperand(Num: 0).getNode();
22759 unsigned MaskBits = 0;
22760
22761 if (AndNode->getOpcode() != ISD::AND)
22762 return SDValue();
22763
22764 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
22765 CmpIndex, CC))
22766 return Val;
22767
22768 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1))) {
22769 uint32_t CNV = CN->getZExtValue();
22770 if (CNV == 255)
22771 MaskBits = 8;
22772 else if (CNV == 65535)
22773 MaskBits = 16;
22774 }
22775
22776 if (!MaskBits)
22777 return SDValue();
22778
22779 SDValue AddValue = AndNode->getOperand(Num: 0);
22780
22781 if (AddValue.getOpcode() != ISD::ADD)
22782 return SDValue();
22783
22784 // The basic dag structure is correct, grab the inputs and validate them.
22785
22786 SDValue AddInputValue1 = AddValue.getNode()->getOperand(Num: 0);
22787 SDValue AddInputValue2 = AddValue.getNode()->getOperand(Num: 1);
22788 SDValue SubsInputValue = SubsNode->getOperand(Num: 1);
22789
22790 // The mask is present and the provenance of all the values is a smaller type,
22791 // so let's see if the mask is superfluous.
22792
22793 if (!isa<ConstantSDNode>(Val: AddInputValue2.getNode()) ||
22794 !isa<ConstantSDNode>(Val: SubsInputValue.getNode()))
22795 return SDValue();
22796
22797 ISD::LoadExtType ExtType;
22798
22799 if (!checkValueWidth(V: SubsInputValue, width: MaskBits, ExtType) ||
22800 !checkValueWidth(V: AddInputValue2, width: MaskBits, ExtType) ||
22801 !checkValueWidth(V: AddInputValue1, width: MaskBits, ExtType) )
22802 return SDValue();
22803
22804 if (!isEquivalentMaskless(CC, width: MaskBits, ExtType,
22805 AddConstant: cast<ConstantSDNode>(Val: AddInputValue2.getNode())->getSExtValue(),
22806 CompConstant: cast<ConstantSDNode>(Val: SubsInputValue.getNode())->getSExtValue()))
22807 return SDValue();
22808
22809 // The AND is not necessary, remove it.
22810
22811 SDVTList VTs = DAG.getVTList(VT1: SubsNode->getValueType(ResNo: 0),
22812 VT2: SubsNode->getValueType(ResNo: 1));
22813 SDValue Ops[] = { AddValue, SubsNode->getOperand(Num: 1) };
22814
22815 SDValue NewValue = DAG.getNode(Opcode: CondOpcode, DL: SDLoc(SubsNode), VTList: VTs, Ops);
22816 DAG.ReplaceAllUsesWith(From: SubsNode, To: NewValue.getNode());
22817
22818 return SDValue(N, 0);
22819}
22820
22821// Optimize compare with zero and branch.
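// For example (illustrative), a branch on the flags of a compare against zero
//   (brcond EQ, (SUBS x, 0))  -->  (CBZ x, dest)
//   (brcond NE, (SUBS x, 0))  -->  (CBNZ x, dest)
// provided the subtraction result itself is otherwise unused and x is not a
// shift.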
22822static SDValue performBRCONDCombine(SDNode *N,
22823 TargetLowering::DAGCombinerInfo &DCI,
22824 SelectionDAG &DAG) {
22825 MachineFunction &MF = DAG.getMachineFunction();
22826 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
22827 // will not be produced, as they are conditional branch instructions that do
22828 // not set flags.
22829 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
22830 return SDValue();
22831
22832 if (SDValue NV = performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3))
22833 N = NV.getNode();
22834 SDValue Chain = N->getOperand(Num: 0);
22835 SDValue Dest = N->getOperand(Num: 1);
22836 SDValue CCVal = N->getOperand(Num: 2);
22837 SDValue Cmp = N->getOperand(Num: 3);
22838
22839 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
22840 unsigned CC = CCVal->getAsZExtVal();
22841 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
22842 return SDValue();
22843
22844 unsigned CmpOpc = Cmp.getOpcode();
22845 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
22846 return SDValue();
22847
22848 // Only attempt folding if there is only one use of the flag and no use of the
22849 // value.
22850 if (!Cmp->hasNUsesOfValue(NUses: 0, Value: 0) || !Cmp->hasNUsesOfValue(NUses: 1, Value: 1))
22851 return SDValue();
22852
22853 SDValue LHS = Cmp.getOperand(i: 0);
22854 SDValue RHS = Cmp.getOperand(i: 1);
22855
22856 assert(LHS.getValueType() == RHS.getValueType() &&
22857 "Expected the value type to be the same for both operands!");
22858 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
22859 return SDValue();
22860
22861 if (isNullConstant(V: LHS))
22862 std::swap(a&: LHS, b&: RHS);
22863
22864 if (!isNullConstant(V: RHS))
22865 return SDValue();
22866
22867 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
22868 LHS.getOpcode() == ISD::SRL)
22869 return SDValue();
22870
22871 // Fold the compare into the branch instruction.
22872 SDValue BR;
22873 if (CC == AArch64CC::EQ)
22874 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22875 else
22876 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22877
22878 // Do not add new nodes to DAG combiner worklist.
22879 DCI.CombineTo(N, Res: BR, AddTo: false);
22880
22881 return SDValue();
22882}
22883
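// Fold CSEL 0, cttz(X), eq(X, 0) (and the NE variant with swapped operands)
// into (and (cttz X), BitWidth - 1): when X is zero, cttz returns BitWidth,
// whose low bits are all zero, so the AND already produces 0 and the explicit
// compare-and-select is unnecessary.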
22884static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
22885 unsigned CC = N->getConstantOperandVal(Num: 2);
22886 SDValue SUBS = N->getOperand(Num: 3);
22887 SDValue Zero, CTTZ;
22888
22889 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
22890 Zero = N->getOperand(Num: 0);
22891 CTTZ = N->getOperand(Num: 1);
22892 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
22893 Zero = N->getOperand(Num: 1);
22894 CTTZ = N->getOperand(Num: 0);
22895 } else
22896 return SDValue();
22897
22898 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
22899 (CTTZ.getOpcode() == ISD::TRUNCATE &&
22900 CTTZ.getOperand(i: 0).getOpcode() != ISD::CTTZ))
22901 return SDValue();
22902
22903 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
22904 "Illegal type in CTTZ folding");
22905
22906 if (!isNullConstant(V: Zero) || !isNullConstant(V: SUBS.getOperand(i: 1)))
22907 return SDValue();
22908
22909 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
22910 ? CTTZ.getOperand(i: 0).getOperand(i: 0)
22911 : CTTZ.getOperand(i: 0);
22912
22913 if (X != SUBS.getOperand(i: 0))
22914 return SDValue();
22915
22916 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
22917 ? CTTZ.getOperand(i: 0).getValueSizeInBits()
22918 : CTTZ.getValueSizeInBits();
22919 SDValue BitWidthMinusOne =
22920 DAG.getConstant(Val: BitWidth - 1, DL: SDLoc(N), VT: CTTZ.getValueType());
22921 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: CTTZ.getValueType(), N1: CTTZ,
22922 N2: BitWidthMinusOne);
22923}
22924
22925// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
22926// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
22927// Where x and y are constants and x != y
22928
22929// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
22930// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
22931// Where x and y are constants and x != y
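// For example (illustrative), with x = 0 and y = 1:
//   (CSEL l, r, EQ, (CMP (CSEL 0, 1, LT, cond), 0))  -->  (CSEL l, r, LT, cond)
// because the outer compare merely re-tests the condition that selected the
// inner CSEL's value.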
22932static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
22933 SDValue L = Op->getOperand(Num: 0);
22934 SDValue R = Op->getOperand(Num: 1);
22935 AArch64CC::CondCode OpCC =
22936 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(Num: 2));
22937
22938 SDValue OpCmp = Op->getOperand(Num: 3);
22939 if (!isCMP(Op: OpCmp))
22940 return SDValue();
22941
22942 SDValue CmpLHS = OpCmp.getOperand(i: 0);
22943 SDValue CmpRHS = OpCmp.getOperand(i: 1);
22944
22945 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
22946 std::swap(a&: CmpLHS, b&: CmpRHS);
22947 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
22948 return SDValue();
22949
22950 SDValue X = CmpLHS->getOperand(Num: 0);
22951 SDValue Y = CmpLHS->getOperand(Num: 1);
22952 if (!isa<ConstantSDNode>(Val: X) || !isa<ConstantSDNode>(Val: Y) || X == Y) {
22953 return SDValue();
22954 }
22955
22956 // If one of the constants is an opaque constant, the x and y SDNodes can still
22957 // be different even though their real values are the same, so compare the
22958 // APInt values here to make sure the code is correct.
22959 ConstantSDNode *CX = cast<ConstantSDNode>(Val&: X);
22960 ConstantSDNode *CY = cast<ConstantSDNode>(Val&: Y);
22961 if (CX->getAPIntValue() == CY->getAPIntValue())
22962 return SDValue();
22963
22964 AArch64CC::CondCode CC =
22965 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(Num: 2));
22966 SDValue Cond = CmpLHS->getOperand(Num: 3);
22967
22968 if (CmpRHS == Y)
22969 CC = AArch64CC::getInvertedCondCode(Code: CC);
22970 else if (CmpRHS != X)
22971 return SDValue();
22972
22973 if (OpCC == AArch64CC::NE)
22974 CC = AArch64CC::getInvertedCondCode(Code: CC);
22975 else if (OpCC != AArch64CC::EQ)
22976 return SDValue();
22977
22978 SDLoc DL(Op);
22979 EVT VT = Op->getValueType(ResNo: 0);
22980
22981 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
22982 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: L, N2: R, N3: CCValue, N4: Cond);
22983}
22984
22985// Optimize CSEL instructions
22986static SDValue performCSELCombine(SDNode *N,
22987 TargetLowering::DAGCombinerInfo &DCI,
22988 SelectionDAG &DAG) {
22989 // CSEL x, x, cc -> x
22990 if (N->getOperand(Num: 0) == N->getOperand(Num: 1))
22991 return N->getOperand(Num: 0);
22992
22993 if (SDValue R = foldCSELOfCSEL(Op: N, DAG))
22994 return R;
22995
22996 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
22997 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
22998 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
22999 return Folded;
23000
23001 return performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3);
23002}
23003
23004 // Try to re-use an already extended operand of a vector SetCC feeding an
23005// extended select. Doing so avoids requiring another full extension of the
23006// SET_CC result when lowering the select.
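// For example (illustrative), given
//   %c = setcc ult <8 x i8> %a, splat(C)
//   %s = vselect %c, <8 x i16> %t, <8 x i16> %f
// where a (zext <8 x i8> %a to <8 x i16>) node already exists in the DAG, the
// setcc is rebuilt on the extended operands so its i1 result does not need to
// be widened a second time when the vselect is lowered.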
23007static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23008 EVT Op0MVT = Op->getOperand(Num: 0).getValueType();
23009 if (!Op0MVT.isVector() || Op->use_empty())
23010 return SDValue();
23011
23012 // Make sure that all uses of Op are VSELECTs with matching result types whose
23013 // element type is larger than that of the SetCC operand.
23014 SDNode *FirstUse = *Op->use_begin();
23015 if (FirstUse->getOpcode() != ISD::VSELECT)
23016 return SDValue();
23017 EVT UseMVT = FirstUse->getValueType(ResNo: 0);
23018 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23019 return SDValue();
23020 if (any_of(Range: Op->uses(), P: [&UseMVT](const SDNode *N) {
23021 return N->getOpcode() != ISD::VSELECT || N->getValueType(ResNo: 0) != UseMVT;
23022 }))
23023 return SDValue();
23024
23025 APInt V;
23026 if (!ISD::isConstantSplatVector(N: Op->getOperand(Num: 1).getNode(), SplatValue&: V))
23027 return SDValue();
23028
23029 SDLoc DL(Op);
23030 SDValue Op0ExtV;
23031 SDValue Op1ExtV;
23032 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op->getOperand(Num: 2))->get();
23033 // Check if the first operand of the SET_CC is already extended. If it is,
23034 // split the SET_CC and re-use the extended version of the operand.
23035 SDNode *Op0SExt = DAG.getNodeIfExists(Opcode: ISD::SIGN_EXTEND, VTList: DAG.getVTList(VT: UseMVT),
23036 Ops: Op->getOperand(Num: 0));
23037 SDNode *Op0ZExt = DAG.getNodeIfExists(Opcode: ISD::ZERO_EXTEND, VTList: DAG.getVTList(VT: UseMVT),
23038 Ops: Op->getOperand(Num: 0));
23039 if (Op0SExt && (isSignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) {
23040 Op0ExtV = SDValue(Op0SExt, 0);
23041 Op1ExtV = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1));
23042 } else if (Op0ZExt && (isUnsignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) {
23043 Op0ExtV = SDValue(Op0ZExt, 0);
23044 Op1ExtV = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1));
23045 } else
23046 return SDValue();
23047
23048 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23049 Op0ExtV, Op1ExtV, Op->getOperand(2));
23050}
23051
23052static SDValue
23053performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23054 SelectionDAG &DAG) {
23055 SDValue Vec = N->getOperand(Num: 0);
23056 if (DCI.isBeforeLegalize() &&
23057 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23058 Vec.getValueType().isFixedLengthVector() &&
23059 Vec.getValueType().isPow2VectorType()) {
23060 SDLoc DL(N);
23061 return getVectorBitwiseReduce(Opcode: N->getOpcode(), Vec, VT: N->getValueType(ResNo: 0), DL,
23062 DAG);
23063 }
23064
23065 return SDValue();
23066}
23067
23068static SDValue performSETCCCombine(SDNode *N,
23069 TargetLowering::DAGCombinerInfo &DCI,
23070 SelectionDAG &DAG) {
23071 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23072 SDValue LHS = N->getOperand(Num: 0);
23073 SDValue RHS = N->getOperand(Num: 1);
23074 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
23075 SDLoc DL(N);
23076 EVT VT = N->getValueType(ResNo: 0);
23077
23078 if (SDValue V = tryToWidenSetCCOperands(Op: N, DAG))
23079 return V;
23080
23081 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23082 if (Cond == ISD::SETNE && isOneConstant(V: RHS) &&
23083 LHS->getOpcode() == AArch64ISD::CSEL &&
23084 isNullConstant(V: LHS->getOperand(Num: 0)) && isOneConstant(V: LHS->getOperand(Num: 1)) &&
23085 LHS->hasOneUse()) {
23086 // Invert CSEL's condition.
23087 auto OldCond =
23088 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2));
23089 auto NewCond = getInvertedCondCode(Code: OldCond);
23090
23091 // csel 0, 1, !cond, X
23092 SDValue CSEL =
23093 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23094 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23095 LHS.getOperand(3));
23096 return DAG.getZExtOrTrunc(Op: CSEL, DL, VT);
23097 }
23098
23099 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
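  // For example, with a 32-bit x and imm == 4 this becomes
  //   setcc (and x, 0xfffffff0), 0, ne
  // which can typically be matched as a single TST with an immediate.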
23100 if (Cond == ISD::SETNE && isNullConstant(V: RHS) &&
23101 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Val: LHS->getOperand(Num: 1)) &&
23102 LHS->getConstantOperandVal(Num: 1) < VT.getScalarSizeInBits() &&
23103 LHS->hasOneUse()) {
23104 EVT TstVT = LHS->getValueType(ResNo: 0);
23105 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23106 // This pattern gets optimized better in emitComparison.
23107 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(Num: 1);
23108 SDValue TST = DAG.getNode(Opcode: ISD::AND, DL, VT: TstVT, N1: LHS->getOperand(Num: 0),
23109 N2: DAG.getConstant(Val: TstImm, DL, VT: TstVT));
23110 return DAG.getNode(Opcode: ISD::SETCC, DL, VT, N1: TST, N2: RHS, N3: N->getOperand(Num: 2));
23111 }
23112 }
23113
23114 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23115 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23116 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23117 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23118 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23119 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23120 (isNullConstant(V: RHS) || isAllOnesConstant(V: RHS)) &&
23121 LHS->getOpcode() == ISD::BITCAST) {
23122 EVT ToVT = LHS->getValueType(ResNo: 0);
23123 EVT FromVT = LHS->getOperand(Num: 0).getValueType();
23124 if (FromVT.isFixedLengthVector() &&
23125 FromVT.getVectorElementType() == MVT::i1) {
23126 bool IsNull = isNullConstant(V: RHS);
23127 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
23128 DL, MVT::i1, LHS->getOperand(0));
23129 LHS = DAG.getNode(Opcode: IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT: ToVT,
23130 Operand: LHS);
23131 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23132 }
23133 }
23134
23135 // Try to perform the memcmp when the result is tested for [in]equality with 0
23136 if (SDValue V = performOrXorChainCombine(N, DAG))
23137 return V;
23138
23139 return SDValue();
23140}
23141
23142 // Replace a flag-setting operator (e.g. ANDS) with the generic version
23143 // (e.g. AND) if the flag is unused.
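// For example, if the NZCV result of (ANDS x, y) has no users, the node is
// rewritten as a plain (AND x, y); conversely, if an identical (AND x, y)
// already exists elsewhere in the DAG, it is combined to reuse this node's
// value result.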
23144static SDValue performFlagSettingCombine(SDNode *N,
23145 TargetLowering::DAGCombinerInfo &DCI,
23146 unsigned GenericOpcode) {
23147 SDLoc DL(N);
23148 SDValue LHS = N->getOperand(Num: 0);
23149 SDValue RHS = N->getOperand(Num: 1);
23150 EVT VT = N->getValueType(ResNo: 0);
23151
23152 // If the flag result isn't used, convert back to a generic opcode.
23153 if (!N->hasAnyUseOfValue(Value: 1)) {
23154 SDValue Res = DCI.DAG.getNode(Opcode: GenericOpcode, DL, VT, Ops: N->ops());
23155 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23156 DL);
23157 }
23158
23159 // Combine identical generic nodes into this node, re-using the result.
23160 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23161 Opcode: GenericOpcode, VTList: DCI.DAG.getVTList(VT), Ops: {LHS, RHS}))
23162 DCI.CombineTo(N: Generic, Res: SDValue(N, 0));
23163
23164 return SDValue();
23165}
23166
23167static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23168 // setcc_merge_zero pred
23169 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23170 // => extract_subvector (inner setcc_merge_zero)
23171 SDValue Pred = N->getOperand(Num: 0);
23172 SDValue LHS = N->getOperand(Num: 1);
23173 SDValue RHS = N->getOperand(Num: 2);
23174 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get();
23175
23176 if (Cond != ISD::SETNE || !isZerosVector(N: RHS.getNode()) ||
23177 LHS->getOpcode() != ISD::SIGN_EXTEND)
23178 return SDValue();
23179
23180 SDValue Extract = LHS->getOperand(Num: 0);
23181 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23182 Extract->getValueType(ResNo: 0) != N->getValueType(ResNo: 0) ||
23183 Extract->getConstantOperandVal(Num: 1) != 0)
23184 return SDValue();
23185
23186 SDValue InnerSetCC = Extract->getOperand(Num: 0);
23187 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23188 return SDValue();
23189
23190 // By this point we've effectively got
23191 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23192 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23193 // can operate on A directly.
23194 SDValue InnerPred = InnerSetCC.getOperand(i: 0);
23195 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23196 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23197 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23198 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23199 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23200 return Extract;
23201
23202 return SDValue();
23203}
23204
23205static SDValue
23206performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
23207 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23208 "Unexpected opcode!");
23209
23210 SelectionDAG &DAG = DCI.DAG;
23211 SDValue Pred = N->getOperand(Num: 0);
23212 SDValue LHS = N->getOperand(Num: 1);
23213 SDValue RHS = N->getOperand(Num: 2);
23214 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get();
23215
23216 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23217 return V;
23218
23219 if (Cond == ISD::SETNE && isZerosVector(N: RHS.getNode()) &&
23220 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23221 LHS->getOperand(Num: 0)->getValueType(ResNo: 0) == N->getValueType(ResNo: 0)) {
23222 // setcc_merge_zero(
23223 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23224 // => setcc_merge_zero(pred, ...)
23225 if (LHS->getOperand(Num: 0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23226 LHS->getOperand(Num: 0)->getOperand(Num: 0) == Pred)
23227 return LHS->getOperand(Num: 0);
23228
23229 // setcc_merge_zero(
23230 // all_active, extend(nxvNi1 ...), != splat(0))
23231 // -> nxvNi1 ...
23232 if (isAllActivePredicate(DAG, N: Pred))
23233 return LHS->getOperand(Num: 0);
23234
23235 // setcc_merge_zero(
23236 // pred, extend(nxvNi1 ...), != splat(0))
23237 // -> nxvNi1 and(pred, ...)
23238 if (DCI.isAfterLegalizeDAG())
23239 // Do this after legalization to allow more folds on setcc_merge_zero
23240 // to be recognized.
23241 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
23242 N1: LHS->getOperand(Num: 0), N2: Pred);
23243 }
23244
23245 return SDValue();
23246}
23247
23248// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23249// as well as whether the test should be inverted. This code is required to
23250// catch these cases (as opposed to standard dag combines) because
23251// AArch64ISD::TBZ is matched during legalization.
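// A couple of illustrative foldings (bit positions chosen arbitrarily):
//   (tbz (srl x, 3), 2)  -> (tbz x, 5)
//   (tbz (xor x, -1), 2) -> (tbnz x, 2)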
23252static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23253 SelectionDAG &DAG) {
23254
23255 if (!Op->hasOneUse())
23256 return Op;
23257
23258 // We don't handle undef/constant-fold cases below, as they should have
23259 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23260 // etc.)
23261
23262 // (tbz (trunc x), b) -> (tbz x, b)
23263 // This case is just here to enable more of the below cases to be caught.
23264 if (Op->getOpcode() == ISD::TRUNCATE &&
23265 Bit < Op->getValueType(ResNo: 0).getSizeInBits()) {
23266 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23267 }
23268
23269 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23270 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23271 Bit < Op->getOperand(Num: 0).getValueSizeInBits()) {
23272 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23273 }
23274
23275 if (Op->getNumOperands() != 2)
23276 return Op;
23277
23278 auto *C = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
23279 if (!C)
23280 return Op;
23281
23282 switch (Op->getOpcode()) {
23283 default:
23284 return Op;
23285
23286 // (tbz (and x, m), b) -> (tbz x, b)
23287 case ISD::AND:
23288 if ((C->getZExtValue() >> Bit) & 1)
23289 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23290 return Op;
23291
23292 // (tbz (shl x, c), b) -> (tbz x, b-c)
23293 case ISD::SHL:
23294 if (C->getZExtValue() <= Bit &&
23295 (Bit - C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) {
23296 Bit = Bit - C->getZExtValue();
23297 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23298 }
23299 return Op;
23300
23301 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is >= # bits in x
23302 case ISD::SRA:
23303 Bit = Bit + C->getZExtValue();
23304 if (Bit >= Op->getValueType(ResNo: 0).getSizeInBits())
23305 Bit = Op->getValueType(ResNo: 0).getSizeInBits() - 1;
23306 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23307
23308 // (tbz (srl x, c), b) -> (tbz x, b+c)
23309 case ISD::SRL:
23310 if ((Bit + C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) {
23311 Bit = Bit + C->getZExtValue();
23312 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23313 }
23314 return Op;
23315
23316 // (tbz (xor x, -1), b) -> (tbnz x, b)
23317 case ISD::XOR:
23318 if ((C->getZExtValue() >> Bit) & 1)
23319 Invert = !Invert;
23320 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23321 }
23322}
23323
23324// Optimize test single bit zero/non-zero and branch.
23325static SDValue performTBZCombine(SDNode *N,
23326 TargetLowering::DAGCombinerInfo &DCI,
23327 SelectionDAG &DAG) {
23328 unsigned Bit = N->getConstantOperandVal(Num: 2);
23329 bool Invert = false;
23330 SDValue TestSrc = N->getOperand(Num: 1);
23331 SDValue NewTestSrc = getTestBitOperand(Op: TestSrc, Bit, Invert, DAG);
23332
23333 if (TestSrc == NewTestSrc)
23334 return SDValue();
23335
23336 unsigned NewOpc = N->getOpcode();
23337 if (Invert) {
23338 if (NewOpc == AArch64ISD::TBZ)
23339 NewOpc = AArch64ISD::TBNZ;
23340 else {
23341 assert(NewOpc == AArch64ISD::TBNZ);
23342 NewOpc = AArch64ISD::TBZ;
23343 }
23344 }
23345
23346 SDLoc DL(N);
23347 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23348 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23349}
23350
23351 // Swap vselect operands where doing so may allow a predicated operation to
23352 // implement the `sel`.
23353//
23354// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23355// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
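// For example, with an FADD as the second operand:
//   (vselect (setcc (cc) ...) (a) (fadd (a) (b)))
//     => (vselect (setcc (!cc) ...) (fadd (a) (b)) (a))
// which can then be matched as a merging predicated FADD that keeps `a` in the
// inactive lanes.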
23356static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
23357 auto SelectA = N->getOperand(Num: 1);
23358 auto SelectB = N->getOperand(Num: 2);
23359 auto NTy = N->getValueType(ResNo: 0);
23360
23361 if (!NTy.isScalableVector())
23362 return SDValue();
23363 SDValue SetCC = N->getOperand(Num: 0);
23364 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23365 return SDValue();
23366
23367 switch (SelectB.getOpcode()) {
23368 default:
23369 return SDValue();
23370 case ISD::FMUL:
23371 case ISD::FSUB:
23372 case ISD::FADD:
23373 break;
23374 }
23375 if (SelectA != SelectB.getOperand(i: 0))
23376 return SDValue();
23377
23378 ISD::CondCode CC = cast<CondCodeSDNode>(Val: SetCC.getOperand(i: 2))->get();
23379 ISD::CondCode InverseCC =
23380 ISD::getSetCCInverse(Operation: CC, Type: SetCC.getOperand(i: 0).getValueType());
23381 auto InverseSetCC =
23382 DAG.getSetCC(DL: SDLoc(SetCC), VT: SetCC.getValueType(), LHS: SetCC.getOperand(i: 0),
23383 RHS: SetCC.getOperand(i: 1), Cond: InverseCC);
23384
23385 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: NTy,
23386 Ops: {InverseSetCC, SelectB, SelectA});
23387}
23388
23389// vselect (v1i1 setcc) ->
23390// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23391// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23392// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
23393// such VSELECT.
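// For example:
//   t1: v1i1  = setcc t2 (v1i64), t3, setlt
//   t4: v1i64 = vselect t1, t5, t6
// becomes
//   t1': v1i64 = setcc t2, t3, setlt
//   t4': v1i64 = vselect t1', t5, t6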
23394static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
23395 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23396 return SwapResult;
23397
23398 SDValue N0 = N->getOperand(Num: 0);
23399 EVT CCVT = N0.getValueType();
23400
23401 if (isAllActivePredicate(DAG, N: N0))
23402 return N->getOperand(Num: 1);
23403
23404 if (isAllInactivePredicate(N: N0))
23405 return N->getOperand(Num: 2);
23406
23407 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23408 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
23409 // supported types.
23410 SDValue SetCC = N->getOperand(Num: 0);
23411 if (SetCC.getOpcode() == ISD::SETCC &&
23412 SetCC.getOperand(i: 2) == DAG.getCondCode(Cond: ISD::SETGT)) {
23413 SDValue CmpLHS = SetCC.getOperand(i: 0);
23414 EVT VT = CmpLHS.getValueType();
23415 SDNode *CmpRHS = SetCC.getOperand(i: 1).getNode();
23416 SDNode *SplatLHS = N->getOperand(Num: 1).getNode();
23417 SDNode *SplatRHS = N->getOperand(Num: 2).getNode();
23418 APInt SplatLHSVal;
23419 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23420 VT.isSimple() &&
23421 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23422 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23423 VT.getSimpleVT().SimpleTy) &&
23424 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23425 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23426 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
23427 unsigned NumElts = VT.getVectorNumElements();
23428 SmallVector<SDValue, 8> Ops(
23429 NumElts, DAG.getConstant(Val: VT.getScalarSizeInBits() - 1, DL: SDLoc(N),
23430 VT: VT.getScalarType()));
23431 SDValue Val = DAG.getBuildVector(VT, DL: SDLoc(N), Ops);
23432
23433 auto Shift = DAG.getNode(Opcode: ISD::SRA, DL: SDLoc(N), VT, N1: CmpLHS, N2: Val);
23434 auto Or = DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT, N1: Shift, N2: N->getOperand(Num: 1));
23435 return Or;
23436 }
23437 }
23438
23439 EVT CmpVT = N0.getOperand(i: 0).getValueType();
23440 if (N0.getOpcode() != ISD::SETCC ||
23441 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
23442 CCVT.getVectorElementType() != MVT::i1 ||
23443 CmpVT.getVectorElementType().isFloatingPoint())
23444 return SDValue();
23445
23446 EVT ResVT = N->getValueType(ResNo: 0);
23447 // Only combine when the result type is of the same size as the compared
23448 // operands.
23449 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23450 return SDValue();
23451
23452 SDValue IfTrue = N->getOperand(Num: 1);
23453 SDValue IfFalse = N->getOperand(Num: 2);
23454 SetCC = DAG.getSetCC(DL: SDLoc(N), VT: CmpVT.changeVectorElementTypeToInteger(),
23455 LHS: N0.getOperand(i: 0), RHS: N0.getOperand(i: 1),
23456 Cond: cast<CondCodeSDNode>(Val: N0.getOperand(i: 2))->get());
23457 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: ResVT, N1: SetCC,
23458 N2: IfTrue, N3: IfFalse);
23459}
23460
23461/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23462/// the compare-mask instructions rather than going via NZCV, even if LHS and
23463/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23464/// with a vector one followed by a DUP shuffle on the result.
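/// For example, for an "i64 setcc" feeding a "v2i64 select", both scalar
/// operands are placed into v2i64 vectors via SCALAR_TO_VECTOR, compared with
/// a vector SETCC (giving the wanted mask in lane 0), and that lane is then
/// duplicated across the whole mask before feeding the select.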
23465static SDValue performSelectCombine(SDNode *N,
23466 TargetLowering::DAGCombinerInfo &DCI) {
23467 SelectionDAG &DAG = DCI.DAG;
23468 SDValue N0 = N->getOperand(Num: 0);
23469 EVT ResVT = N->getValueType(ResNo: 0);
23470
23471 if (N0.getOpcode() != ISD::SETCC)
23472 return SDValue();
23473
23474 if (ResVT.isScalableVT())
23475 return SDValue();
23476
23477 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
23478 // scalar SetCCResultType. We also don't expect vectors, because we assume
23479 // that selects fed by vector SETCCs are canonicalized to VSELECT.
23480 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
23481 "Scalar-SETCC feeding SELECT has unexpected result type!");
23482
23483 // If NumMaskElts == 0, the comparison is larger than the select result. The
23484 // largest real NEON comparison is 64 bits per lane, which means the result is
23485 // at most 32 bits and an illegal vector. Just bail out for now.
23486 EVT SrcVT = N0.getOperand(i: 0).getValueType();
23487
23488 // Don't try to do this optimization when the setcc itself has i1 operands.
23489 // There are no legal vectors of i1, so this would be pointless. v1f16 is
23490 // ruled out to prevent the creation of setccs that need to be scalarized.
23491 if (SrcVT == MVT::i1 ||
23492 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
23493 return SDValue();
23494
23495 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
23496 if (!ResVT.isVector() || NumMaskElts == 0)
23497 return SDValue();
23498
23499 SrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SrcVT, NumElements: NumMaskElts);
23500 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
23501
23502 // Also bail out if the vector CCVT isn't the same size as ResVT.
23503 // This can happen if the SETCC operand size doesn't divide the ResVT size
23504 // (e.g., f64 vs v3f32).
23505 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
23506 return SDValue();
23507
23508 // Make sure we didn't create illegal types, if we're not supposed to.
23509 assert(DCI.isBeforeLegalize() ||
23510 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
23511
23512 // First perform a vector comparison, where lane 0 is the one we're interested
23513 // in.
23514 SDLoc DL(N0);
23515 SDValue LHS =
23516 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 0));
23517 SDValue RHS =
23518 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 1));
23519 SDValue SetCC = DAG.getNode(Opcode: ISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS, N3: N0.getOperand(i: 2));
23520
23521 // Now duplicate the comparison mask we want across all other lanes.
23522 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
23523 SDValue Mask = DAG.getVectorShuffle(VT: CCVT, dl: DL, N1: SetCC, N2: SetCC, Mask: DUPMask);
23524 Mask = DAG.getNode(Opcode: ISD::BITCAST, DL,
23525 VT: ResVT.changeVectorElementTypeToInteger(), Operand: Mask);
23526
23527 return DAG.getSelect(DL, VT: ResVT, Cond: Mask, LHS: N->getOperand(Num: 1), RHS: N->getOperand(Num: 2));
23528}
23529
23530static SDValue performDUPCombine(SDNode *N,
23531 TargetLowering::DAGCombinerInfo &DCI) {
23532 EVT VT = N->getValueType(ResNo: 0);
23533 SDLoc DL(N);
23534 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
23535 // 128-bit vector version.
23536 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
23537 EVT LVT = VT.getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext());
23538 SmallVector<SDValue> Ops(N->ops());
23539 if (SDNode *LN = DCI.DAG.getNodeIfExists(Opcode: N->getOpcode(),
23540 VTList: DCI.DAG.getVTList(VT: LVT), Ops)) {
23541 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
23542 DCI.DAG.getConstant(0, DL, MVT::i64));
23543 }
23544 }
23545
23546 if (N->getOpcode() == AArch64ISD::DUP) {
23547 if (DCI.isAfterLegalizeDAG()) {
23548 // If the scalar DUP's operand is an extract_vector_elt, try to combine them
23549 // into a DUPLANE. For example,
23550 //
23551 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
23552 // t18: v4i32 = AArch64ISD::DUP t21
23553 // ==>
23554 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
23555 SDValue EXTRACT_VEC_ELT = N->getOperand(Num: 0);
23556 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23557 if (VT == EXTRACT_VEC_ELT.getOperand(i: 0).getValueType()) {
23558 unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType());
23559 return DCI.DAG.getNode(Opcode, DL, VT, N1: EXTRACT_VEC_ELT.getOperand(i: 0),
23560 N2: EXTRACT_VEC_ELT.getOperand(i: 1));
23561 }
23562 }
23563 }
23564
23565 return performPostLD1Combine(N, DCI, IsLaneOp: false);
23566 }
23567
23568 return SDValue();
23569}
23570
23571/// Get rid of unnecessary NVCASTs (that don't change the type).
23572static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
23573 if (N->getValueType(ResNo: 0) == N->getOperand(Num: 0).getValueType())
23574 return N->getOperand(Num: 0);
23575 if (N->getOperand(Num: 0).getOpcode() == AArch64ISD::NVCAST)
23576 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
23577 Operand: N->getOperand(Num: 0).getOperand(i: 0));
23578
23579 return SDValue();
23580}
23581
23582// If all users of the globaladdr are of the form (globaladdr + constant), find
23583// the smallest constant, fold it into the globaladdr's offset and rewrite the
23584// globaladdr as (globaladdr + constant) - constant.
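// For example, if the only uses are (globaladdr + 8) and (globaladdr + 12)
// and the checks below pass, the node is rewritten as ((globaladdr + 8) - 8);
// the +8 folds into the relocation, and the remaining adds simplify to +0 and
// +4.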
23585static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
23586 const AArch64Subtarget *Subtarget,
23587 const TargetMachine &TM) {
23588 auto *GN = cast<GlobalAddressSDNode>(Val: N);
23589 if (Subtarget->ClassifyGlobalReference(GV: GN->getGlobal(), TM) !=
23590 AArch64II::MO_NO_FLAG)
23591 return SDValue();
23592
23593 uint64_t MinOffset = -1ull;
23594 for (SDNode *N : GN->uses()) {
23595 if (N->getOpcode() != ISD::ADD)
23596 return SDValue();
23597 auto *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0));
23598 if (!C)
23599 C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
23600 if (!C)
23601 return SDValue();
23602 MinOffset = std::min(a: MinOffset, b: C->getZExtValue());
23603 }
23604 uint64_t Offset = MinOffset + GN->getOffset();
23605
23606 // Require that the new offset is larger than the existing one. Otherwise, we
23607 // can end up oscillating between two possible DAGs, for example,
23608 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
23609 if (Offset <= uint64_t(GN->getOffset()))
23610 return SDValue();
23611
23612 // Check whether folding this offset is legal. It must not go out of bounds of
23613 // the referenced object to avoid violating the code model, and must be
23614 // smaller than 2^20 because this is the largest offset expressible in all
23615 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
23616 // stores an immediate signed 21-bit offset.)
23617 //
23618 // This check also prevents us from folding negative offsets, which will end
23619 // up being treated in the same way as large positive ones. They could also
23620 // cause code model violations, and aren't really common enough to matter.
23621 if (Offset >= (1 << 20))
23622 return SDValue();
23623
23624 const GlobalValue *GV = GN->getGlobal();
23625 Type *T = GV->getValueType();
23626 if (!T->isSized() ||
23627 Offset > GV->getParent()->getDataLayout().getTypeAllocSize(Ty: T))
23628 return SDValue();
23629
23630 SDLoc DL(GN);
23631 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23632 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23633 DAG.getConstant(MinOffset, DL, MVT::i64));
23634}
23635
23636static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
23637 const AArch64Subtarget *Subtarget) {
23638 SDValue BR = N->getOperand(Num: 0);
23639 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23640 !BR.getValueType().isScalarInteger())
23641 return SDValue();
23642
23643 SDLoc DL(N);
23644 return DAG.getNode(Opcode: ISD::CTTZ, DL, VT: BR.getValueType(), Operand: BR.getOperand(i: 0));
23645}
23646
23647 // Turns the vector of indices into a vector of byte offsets by scaling Offset
23648// by (BitWidth / 8).
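// For example, for 32-bit elements the indices are shifted left by
// Log2_32(32 / 8) == 2, i.e. each index is multiplied by 4.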
23649static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
23650 SDLoc DL, unsigned BitWidth) {
23651 assert(Offset.getValueType().isScalableVector() &&
23652 "This method is only for scalable vectors of offsets");
23653
23654 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23655 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23656
23657 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23658}
23659
23660/// Check if the value of \p OffsetInBytes can be used as an immediate for
23661/// the gather load/prefetch and scatter store instructions with vector base and
23662/// immediate offset addressing mode:
23663///
23664/// [<Zn>.[S|D]{, #<imm>}]
23665///
23666/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
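/// For example, with 4-byte elements the valid immediates are
/// 0, 4, 8, ..., 124 (i.e. 31 * 4).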
23667inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
23668 unsigned ScalarSizeInBytes) {
23669 // The immediate is not a multiple of the scalar size.
23670 if (OffsetInBytes % ScalarSizeInBytes)
23671 return false;
23672
23673 // The immediate is out of range.
23674 if (OffsetInBytes / ScalarSizeInBytes > 31)
23675 return false;
23676
23677 return true;
23678}
23679
23680/// Check if the value of \p Offset represents a valid immediate for the SVE
23681 /// gather load/prefetch and scatter store instructions with vector base and
23682/// immediate offset addressing mode:
23683///
23684/// [<Zn>.[S|D]{, #<imm>}]
23685///
23686/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23687static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
23688 unsigned ScalarSizeInBytes) {
23689 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Val: Offset.getNode());
23690 return OffsetConst && isValidImmForSVEVecImmAddrMode(
23691 OffsetInBytes: OffsetConst->getZExtValue(), ScalarSizeInBytes);
23692}
23693
23694static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
23695 unsigned Opcode,
23696 bool OnlyPackedOffsets = true) {
23697 const SDValue Src = N->getOperand(Num: 2);
23698 const EVT SrcVT = Src->getValueType(ResNo: 0);
23699 assert(SrcVT.isScalableVector() &&
23700 "Scatter stores are only possible for SVE vectors");
23701
23702 SDLoc DL(N);
23703 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23704
23705 // Make sure that the source data will fit into an SVE register
23706 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
23707 return SDValue();
23708
23709 // For FPs, ACLE only supports _packed_ single and double precision types.
23710 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23711 if (SrcElVT.isFloatingPoint())
23712 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23713 ((Opcode != AArch64ISD::SST1Q_PRED &&
23714 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23715 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23716 return SDValue();
23717
23718 // Depending on the addressing mode, this is either a pointer or a vector of
23719 // pointers (that fits into one register)
23720 SDValue Base = N->getOperand(Num: 4);
23721 // Depending on the addressing mode, this is either a single offset or a
23722 // vector of offsets (that fits into one register)
23723 SDValue Offset = N->getOperand(Num: 5);
23724
23725 // For "scalar + vector of indices", just scale the indices. This only
23726 // applies to non-temporal scatters because there's no instruction that takes
23727 // indices.
23728 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
23729 Offset =
23730 getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits());
23731 Opcode = AArch64ISD::SSTNT1_PRED;
23732 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23733 Offset =
23734 getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits());
23735 Opcode = AArch64ISD::SST1Q_PRED;
23736 }
23737
23738 // In the case of non-temporal scatter stores there's only one SVE instruction
23739 // per data-size: "vector + scalar", i.e.
23740 // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
23741 // Since we do have intrinsics that allow the arguments to be in a different
23742 // order, we may need to swap them to match the spec.
23743 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23744 Offset.getValueType().isVector())
23745 std::swap(a&: Base, b&: Offset);
23746
23747 // SST1_IMM requires that the offset is an immediate that is:
23748 // * a multiple of #SizeInBytes,
23749 // * in the range [0, 31 x #SizeInBytes],
23750 // where #SizeInBytes is the size in bytes of the stored items. For
23751 // immediates outside that range and non-immediate scalar offsets use SST1 or
23752 // SST1_UXTW instead.
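  // For example, a scatter of 32-bit elements with an offset of 160 exceeds
  // 31 * 4 == 124, so it cannot use SST1_IMM and is rewritten here with the
  // base and offset swapped.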
23753 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23754 if (!isValidImmForSVEVecImmAddrMode(Offset,
23755 ScalarSizeInBytes: SrcVT.getScalarSizeInBits() / 8)) {
23756 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23757 Opcode = AArch64ISD::SST1_UXTW_PRED;
23758 else
23759 Opcode = AArch64ISD::SST1_PRED;
23760
23761 std::swap(a&: Base, b&: Offset);
23762 }
23763 }
23764
23765 auto &TLI = DAG.getTargetLoweringInfo();
23766 if (!TLI.isTypeLegal(VT: Base.getValueType()))
23767 return SDValue();
23768
23769 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
23770 // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
23771 // nxv2i64. Legalize accordingly.
23772 if (!OnlyPackedOffsets &&
23773 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23774 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23775
23776 if (!TLI.isTypeLegal(VT: Offset.getValueType()))
23777 return SDValue();
23778
23779 // Source value type that is representable in hardware
23780 EVT HwSrcVt = getSVEContainerType(ContentTy: SrcVT);
23781
23782 // Keep the original type of the input data to store - this is needed to be
23783 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23784 // FP values we want the integer equivalent, so just use HwSrcVt.
23785 SDValue InputVT = DAG.getValueType(SrcVT);
23786 if (SrcVT.isFloatingPoint())
23787 InputVT = DAG.getValueType(HwSrcVt);
23788
23789 SDVTList VTs = DAG.getVTList(MVT::Other);
23790 SDValue SrcNew;
23791
23792 if (Src.getValueType().isFloatingPoint())
23793 SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Src);
23794 else
23795 SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Src);
23796
23797 SDValue Ops[] = {N->getOperand(Num: 0), // Chain
23798 SrcNew,
23799 N->getOperand(Num: 3), // Pg
23800 Base,
23801 Offset,
23802 InputVT};
23803
23804 return DAG.getNode(Opcode, DL, VTList: VTs, Ops);
23805}
23806
23807static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
23808 unsigned Opcode,
23809 bool OnlyPackedOffsets = true) {
23810 const EVT RetVT = N->getValueType(ResNo: 0);
23811 assert(RetVT.isScalableVector() &&
23812 "Gather loads are only possible for SVE vectors");
23813
23814 SDLoc DL(N);
23815
23816 // Make sure that the loaded data will fit into an SVE register
23817 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
23818 return SDValue();
23819
23820 // Depending on the addressing mode, this is either a pointer or a vector of
23821 // pointers (that fits into one register)
23822 SDValue Base = N->getOperand(Num: 3);
23823 // Depending on the addressing mode, this is either a single offset or a
23824 // vector of offsets (that fits into one register)
23825 SDValue Offset = N->getOperand(Num: 4);
23826
23827 // For "scalar + vector of indices", scale the indices to obtain unscaled
23828 // offsets. This applies to non-temporal and quadword gathers, which do not
23829 // have an addressing mode with scaled offset.
23830 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
23831 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
23832 BitWidth: RetVT.getScalarSizeInBits());
23833 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
23834 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
23835 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
23836 BitWidth: RetVT.getScalarSizeInBits());
23837 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
23838 }
23839
23840 // In the case of non-temporal gather loads and quadword gather loads there's
23841 // only one addressing mode: "vector + scalar", e.g.
23842 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23843 // Since we do have intrinsics that allow the arguments to be in a different
23844 // order, we may need to swap them to match the spec.
23845 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
23846 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
23847 Offset.getValueType().isVector())
23848 std::swap(a&: Base, b&: Offset);
23849
23850 // GLD{FF}1_IMM requires that the offset is an immediate that is:
23851 // * a multiple of #SizeInBytes,
23852 // * in the range [0, 31 x #SizeInBytes],
23853 // where #SizeInBytes is the size in bytes of the loaded items. For
23854 // immediates outside that range and non-immediate scalar offsets use
23855 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
23856 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
23857 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
23858 if (!isValidImmForSVEVecImmAddrMode(Offset,
23859 ScalarSizeInBytes: RetVT.getScalarSizeInBits() / 8)) {
23860 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23861 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23862 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
23863 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
23864 else
23865 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23866 ? AArch64ISD::GLD1_MERGE_ZERO
23867 : AArch64ISD::GLDFF1_MERGE_ZERO;
23868
23869 std::swap(a&: Base, b&: Offset);
23870 }
23871 }
23872
23873 auto &TLI = DAG.getTargetLoweringInfo();
23874 if (!TLI.isTypeLegal(VT: Base.getValueType()))
23875 return SDValue();
23876
23877 // Some gather load variants allow unpacked offsets, but only as nxv2i32
23878 // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
23879 // nxv2i64. Legalize accordingly.
23880 if (!OnlyPackedOffsets &&
23881 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23882 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23883
23884 // Return value type that is representable in hardware
23885 EVT HwRetVt = getSVEContainerType(ContentTy: RetVT);
23886
23887 // Keep the original output value type around - this is needed to be able to
23888 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
23889 // values we want the integer equivalent, so just use HwRetVT.
23890 SDValue OutVT = DAG.getValueType(RetVT);
23891 if (RetVT.isFloatingPoint())
23892 OutVT = DAG.getValueType(HwRetVt);
23893
23894 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
23895 SDValue Ops[] = {N->getOperand(Num: 0), // Chain
23896 N->getOperand(Num: 2), // Pg
23897 Base, Offset, OutVT};
23898
23899 SDValue Load = DAG.getNode(Opcode, DL, VTList: VTs, Ops);
23900 SDValue LoadChain = SDValue(Load.getNode(), 1);
23901
23902 if (RetVT.isInteger() && (RetVT != HwRetVt))
23903 Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: RetVT, Operand: Load.getValue(R: 0));
23904
23905 // If the original return value was FP, bitcast accordingly. Doing it here
23906 // means that we can avoid adding TableGen patterns for FPs.
23907 if (RetVT.isFloatingPoint())
23908 Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: RetVT, Operand: Load.getValue(R: 0));
23909
23910 return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL);
23911}
23912
23913static SDValue
23914performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23915 SelectionDAG &DAG) {
23916 SDLoc DL(N);
23917 SDValue Src = N->getOperand(Num: 0);
23918 unsigned Opc = Src->getOpcode();
23919
23920 // Sign extend of an unsigned unpack -> signed unpack
23921 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
23922
23923 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
23924 : AArch64ISD::SUNPKLO;
23925
23926 // Push the sign extend to the operand of the unpack
23927 // This is necessary where, for example, the operand of the unpack
23928 // is another unpack:
23929 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
23930 // ->
23931 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
23932 // ->
23933 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
23934 SDValue ExtOp = Src->getOperand(Num: 0);
23935 auto VT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
23936 EVT EltTy = VT.getVectorElementType();
23937 (void)EltTy;
23938
23939 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
23940 "Sign extending from an invalid type");
23941
23942 EVT ExtVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
23943
23944 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ExtOp.getValueType(),
23945 N1: ExtOp, N2: DAG.getValueType(ExtVT));
23946
23947 return DAG.getNode(Opcode: SOpc, DL, VT: N->getValueType(ResNo: 0), Operand: Ext);
23948 }
23949
23950 if (DCI.isBeforeLegalizeOps())
23951 return SDValue();
23952
23953 if (!EnableCombineMGatherIntrinsics)
23954 return SDValue();
23955
23956 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
23957 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
23958 unsigned NewOpc;
23959 unsigned MemVTOpNum = 4;
23960 switch (Opc) {
23961 case AArch64ISD::LD1_MERGE_ZERO:
23962 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
23963 MemVTOpNum = 3;
23964 break;
23965 case AArch64ISD::LDNF1_MERGE_ZERO:
23966 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
23967 MemVTOpNum = 3;
23968 break;
23969 case AArch64ISD::LDFF1_MERGE_ZERO:
23970 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
23971 MemVTOpNum = 3;
23972 break;
23973 case AArch64ISD::GLD1_MERGE_ZERO:
23974 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
23975 break;
23976 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
23977 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23978 break;
23979 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
23980 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
23981 break;
23982 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
23983 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
23984 break;
23985 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
23986 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
23987 break;
23988 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
23989 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
23990 break;
23991 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
23992 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
23993 break;
23994 case AArch64ISD::GLDFF1_MERGE_ZERO:
23995 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
23996 break;
23997 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
23998 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
23999 break;
24000 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
24001 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
24002 break;
24003 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
24004 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
24005 break;
24006 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
24007 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
24008 break;
24009 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
24010 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
24011 break;
24012 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
24013 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
24014 break;
24015 case AArch64ISD::GLDNT1_MERGE_ZERO:
24016 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
24017 break;
24018 default:
24019 return SDValue();
24020 }
24021
24022 EVT SignExtSrcVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
24023 EVT SrcMemVT = cast<VTSDNode>(Val: Src->getOperand(Num: MemVTOpNum))->getVT();
24024
24025 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24026 return SDValue();
24027
24028 EVT DstVT = N->getValueType(ResNo: 0);
24029 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24030
24031 SmallVector<SDValue, 5> Ops;
24032 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24033 Ops.push_back(Elt: Src->getOperand(Num: I));
24034
24035 SDValue ExtLoad = DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VTList: VTs, Ops);
24036 DCI.CombineTo(N, Res: ExtLoad);
24037 DCI.CombineTo(N: Src.getNode(), Res0: ExtLoad, Res1: ExtLoad.getValue(R: 1));
24038
24039 // Return N so it doesn't get rechecked
24040 return SDValue(N, 0);
24041}
24042
24043/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24044/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24045/// != nxv2i32) do not need legalization.
24046static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
24047 const unsigned OffsetPos = 4;
24048 SDValue Offset = N->getOperand(Num: OffsetPos);
24049
24050 // Not an unpacked vector, bail out.
24051 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24052 return SDValue();
24053
24054 // Extend the unpacked offset vector to 64-bit lanes.
24055 SDLoc DL(N);
24056 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24057 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24058 // Replace the offset operand with the 64-bit one.
24059 Ops[OffsetPos] = Offset;
24060
24061 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24062}
24063
24064/// Combines a node carrying the intrinsic
24065/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24066/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24067/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24068 /// SVE gather prefetch instruction with vector plus immediate addressing mode.
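/// For example, for a prfb gather (1-byte elements) a scalar offset of 40 is
/// outside the valid immediate range [0, 31], so the base and offset operands
/// are swapped and the node is remapped to the uxtw-index form.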
24069static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
24070 unsigned ScalarSizeInBytes) {
24071 const unsigned ImmPos = 4, OffsetPos = 3;
24072 // No need to combine the node if the immediate is valid...
24073 if (isValidImmForSVEVecImmAddrMode(Offset: N->getOperand(Num: ImmPos), ScalarSizeInBytes))
24074 return SDValue();
24075
24076 // ...otherwise swap the offset base with the offset...
24077 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24078 std::swap(a&: Ops[ImmPos], b&: Ops[OffsetPos]);
24079 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24080 // `aarch64_sve_prfb_gather_uxtw_index`.
24081 SDLoc DL(N);
24082 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24083 MVT::i64);
24084
24085 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24086}
24087
24088 // Return true if the vector operation can guarantee that only the first lane of its
24089// result contains data, with all bits in other lanes set to zero.
24090static bool isLanes1toNKnownZero(SDValue Op) {
24091 switch (Op.getOpcode()) {
24092 default:
24093 return false;
24094 case AArch64ISD::ANDV_PRED:
24095 case AArch64ISD::EORV_PRED:
24096 case AArch64ISD::FADDA_PRED:
24097 case AArch64ISD::FADDV_PRED:
24098 case AArch64ISD::FMAXNMV_PRED:
24099 case AArch64ISD::FMAXV_PRED:
24100 case AArch64ISD::FMINNMV_PRED:
24101 case AArch64ISD::FMINV_PRED:
24102 case AArch64ISD::ORV_PRED:
24103 case AArch64ISD::SADDV_PRED:
24104 case AArch64ISD::SMAXV_PRED:
24105 case AArch64ISD::SMINV_PRED:
24106 case AArch64ISD::UADDV_PRED:
24107 case AArch64ISD::UMAXV_PRED:
24108 case AArch64ISD::UMINV_PRED:
24109 return true;
24110 }
24111}
24112
24113static SDValue removeRedundantInsertVectorElt(SDNode *N) {
24114 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24115 SDValue InsertVec = N->getOperand(Num: 0);
24116 SDValue InsertElt = N->getOperand(Num: 1);
24117 SDValue InsertIdx = N->getOperand(Num: 2);
24118
24119 // We only care about inserts into the first element...
24120 if (!isNullConstant(V: InsertIdx))
24121 return SDValue();
24122 // ...of a zero'd vector...
24123 if (!ISD::isConstantSplatVectorAllZeros(N: InsertVec.getNode()))
24124 return SDValue();
24125 // ...where the inserted data was previously extracted...
24126 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24127 return SDValue();
24128
24129 SDValue ExtractVec = InsertElt.getOperand(i: 0);
24130 SDValue ExtractIdx = InsertElt.getOperand(i: 1);
24131
24132 // ...from the first element of a vector.
24133 if (!isNullConstant(V: ExtractIdx))
24134 return SDValue();
24135
24136 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24137
24138 // Ensure there's no type conversion going on.
24139 if (N->getValueType(ResNo: 0) != ExtractVec.getValueType())
24140 return SDValue();
24141
24142 if (!isLanes1toNKnownZero(Op: ExtractVec))
24143 return SDValue();
24144
24145 // The explicit zeroing is redundant.
24146 return ExtractVec;
24147}
24148
24149static SDValue
24150performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
24151 if (SDValue Res = removeRedundantInsertVectorElt(N))
24152 return Res;
24153
24154 return performPostLD1Combine(N, DCI, IsLaneOp: true);
24155}
24156
24157static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
24158 EVT Ty = N->getValueType(ResNo: 0);
24159 if (Ty.isInteger())
24160 return SDValue();
24161
24162 EVT IntTy = Ty.changeVectorElementTypeToInteger();
24163 EVT ExtIntTy = getPackedSVEVectorVT(EC: IntTy.getVectorElementCount());
24164 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
24165 IntTy.getVectorElementType().getScalarSizeInBits())
24166 return SDValue();
24167
24168 SDLoc DL(N);
24169 SDValue LHS = DAG.getAnyExtOrTrunc(Op: DAG.getBitcast(VT: IntTy, V: N->getOperand(Num: 0)),
24170 DL, VT: ExtIntTy);
24171 SDValue RHS = DAG.getAnyExtOrTrunc(Op: DAG.getBitcast(VT: IntTy, V: N->getOperand(Num: 1)),
24172 DL, VT: ExtIntTy);
24173 SDValue Idx = N->getOperand(Num: 2);
24174 SDValue Splice = DAG.getNode(Opcode: ISD::VECTOR_SPLICE, DL, VT: ExtIntTy, N1: LHS, N2: RHS, N3: Idx);
24175 SDValue Trunc = DAG.getAnyExtOrTrunc(Op: Splice, DL, VT: IntTy);
24176 return DAG.getBitcast(VT: Ty, V: Trunc);
24177}
24178
24179static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
24180 TargetLowering::DAGCombinerInfo &DCI,
24181 const AArch64Subtarget *Subtarget) {
24182 SDValue N0 = N->getOperand(Num: 0);
24183 EVT VT = N->getValueType(ResNo: 0);
24184
24185 // If our only use is an fp_round, don't fold; let fp_round(fpext) fold instead.
24186 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24187 return SDValue();
24188
24189 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24190 EVT EltVT = VT.getVectorElementType();
24191 return EltVT == MVT::f32 || EltVT == MVT::f64;
24192 };
24193
24194 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24195 // We purposefully don't care about legality of the nodes here as we know
24196 // they can be split down into something legal.
24197 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N: N0.getNode()) &&
24198 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24199 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24200 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24201 LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0);
24202 SDValue ExtLoad = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: SDLoc(N), VT,
24203 Chain: LN0->getChain(), Ptr: LN0->getBasePtr(),
24204 MemVT: N0.getValueType(), MMO: LN0->getMemOperand());
24205 DCI.CombineTo(N, Res: ExtLoad);
24206 DCI.CombineTo(
24207 N: N0.getNode(),
24208 Res0: DAG.getNode(Opcode: ISD::FP_ROUND, DL: SDLoc(N0), VT: N0.getValueType(), N1: ExtLoad,
24209 N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(N0), /*isTarget=*/true)),
24210 Res1: ExtLoad.getValue(R: 1));
24211 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24212 }
24213
24214 return SDValue();
24215}
24216
24217static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
24218 const AArch64Subtarget *Subtarget) {
24219 EVT VT = N->getValueType(ResNo: 0);
24220
24221 // Don't expand for NEON, SVE2 or SME
24222 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24223 return SDValue();
24224
24225 SDLoc DL(N);
24226
24227 SDValue Mask = N->getOperand(Num: 0);
24228 SDValue In1 = N->getOperand(Num: 1);
24229 SDValue In2 = N->getOperand(Num: 2);
24230
24231 SDValue InvMask = DAG.getNOT(DL, Val: Mask, VT);
24232 SDValue Sel = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Mask, N2: In1);
24233 SDValue SelInv = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: InvMask, N2: In2);
24234 return DAG.getNode(Opcode: ISD::OR, DL, VT, N1: Sel, N2: SelInv);
24235}
24236
24237static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
24238 EVT VT = N->getValueType(ResNo: 0);
24239
24240 SDValue Insert = N->getOperand(Num: 0);
24241 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24242 return SDValue();
24243
24244 if (!Insert.getOperand(i: 0).isUndef())
24245 return SDValue();
24246
24247 uint64_t IdxInsert = Insert.getConstantOperandVal(i: 2);
24248 uint64_t IdxDupLane = N->getConstantOperandVal(Num: 1);
24249 if (IdxInsert != 0 || IdxDupLane != 0)
24250 return SDValue();
24251
24252 SDValue Bitcast = Insert.getOperand(i: 1);
24253 if (Bitcast.getOpcode() != ISD::BITCAST)
24254 return SDValue();
24255
24256 SDValue Subvec = Bitcast.getOperand(i: 0);
24257 EVT SubvecVT = Subvec.getValueType();
24258 if (!SubvecVT.is128BitVector())
24259 return SDValue();
24260 EVT NewSubvecVT =
24261 getPackedSVEVectorVT(VT: Subvec.getValueType().getVectorElementType());
24262
24263 SDLoc DL(N);
24264 SDValue NewInsert =
24265 DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewSubvecVT,
24266 N1: DAG.getUNDEF(VT: NewSubvecVT), N2: Subvec, N3: Insert->getOperand(Num: 2));
24267 SDValue NewDuplane128 = DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT: NewSubvecVT,
24268 N1: NewInsert, N2: N->getOperand(Num: 1));
24269 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewDuplane128);
24270}
24271
24272// Try to combine mull with uzp1.
24273static SDValue tryCombineMULLWithUZP1(SDNode *N,
24274 TargetLowering::DAGCombinerInfo &DCI,
24275 SelectionDAG &DAG) {
24276 if (DCI.isBeforeLegalizeOps())
24277 return SDValue();
24278
24279 SDValue LHS = N->getOperand(Num: 0);
24280 SDValue RHS = N->getOperand(Num: 1);
24281
24282 SDValue ExtractHigh;
24283 SDValue ExtractLow;
24284 SDValue TruncHigh;
24285 SDValue TruncLow;
24286 SDLoc DL(N);
24287
24288 // Check the operands are trunc and extract_high.
24289 if (isEssentiallyExtractHighSubvector(N: LHS) &&
24290 RHS.getOpcode() == ISD::TRUNCATE) {
24291 TruncHigh = RHS;
24292 if (LHS.getOpcode() == ISD::BITCAST)
24293 ExtractHigh = LHS.getOperand(i: 0);
24294 else
24295 ExtractHigh = LHS;
24296 } else if (isEssentiallyExtractHighSubvector(N: RHS) &&
24297 LHS.getOpcode() == ISD::TRUNCATE) {
24298 TruncHigh = LHS;
24299 if (LHS.getOpcode() == ISD::BITCAST)
24300 ExtractHigh = RHS.getOperand(i: 0);
24301 else
24302 ExtractHigh = RHS;
24303 } else
24304 return SDValue();
24305
24306 // If the truncate's operand is a DUP or a splat value, do not combine the op
24307 // with uzp1.
24308 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
24309 SDValue TruncHighOp = TruncHigh.getOperand(i: 0);
24310 EVT TruncHighOpVT = TruncHighOp.getValueType();
24311 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24312 DAG.isSplatValue(V: TruncHighOp, AllowUndefs: false))
24313 return SDValue();
24314
24315 // Check that there is another extract_high with the same source vector.
24316 // For example,
24317 //
24318 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24319 // t12: v4i16 = truncate t11
24320 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24321 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24322 // t16: v4i16 = truncate t15
24323 // t30: v4i32 = AArch64ISD::SMULL t23, t16
24324 //
24325 // This dagcombine assumes the two extract_high nodes use the same source
24326 // vector in order to detect the pair of MULLs. If they use different source
24327 // vectors, this code will not work.
24328 bool HasFoundMULLow = true;
24329 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(i: 0);
24330 if (ExtractHighSrcVec->use_size() != 2)
24331 HasFoundMULLow = false;
24332
24333 // Find ExtractLow.
24334 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24335 if (User == ExtractHigh.getNode())
24336 continue;
24337
24338 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24339 !isNullConstant(V: User->getOperand(Num: 1))) {
24340 HasFoundMULLow = false;
24341 break;
24342 }
24343
24344 ExtractLow.setNode(User);
24345 }
24346
24347 if (!ExtractLow || !ExtractLow->hasOneUse())
24348 HasFoundMULLow = false;
24349
24350 // Check ExtractLow's user.
24351 if (HasFoundMULLow) {
24352 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24353 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24354 HasFoundMULLow = false;
24355 } else {
24356 if (ExtractLowUser->getOperand(Num: 0) == ExtractLow) {
24357 if (ExtractLowUser->getOperand(Num: 1).getOpcode() == ISD::TRUNCATE)
24358 TruncLow = ExtractLowUser->getOperand(Num: 1);
24359 else
24360 HasFoundMULLow = false;
24361 } else {
24362 if (ExtractLowUser->getOperand(Num: 0).getOpcode() == ISD::TRUNCATE)
24363 TruncLow = ExtractLowUser->getOperand(Num: 0);
24364 else
24365 HasFoundMULLow = false;
24366 }
24367 }
24368 }
24369
24370 // If the truncate's operand is a DUP or a splat value, do not combine the op
24371 // with uzp1.
24372 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
24373 EVT TruncHighVT = TruncHigh.getValueType();
24374 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
24375 SDValue TruncLowOp =
24376 HasFoundMULLow ? TruncLow.getOperand(i: 0) : DAG.getUNDEF(VT: UZP1VT);
24377 EVT TruncLowOpVT = TruncLowOp.getValueType();
24378 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24379 DAG.isSplatValue(V: TruncLowOp, AllowUndefs: false)))
24380 return SDValue();
24381
24382 // Create uzp1, extract_high and extract_low.
24383 if (TruncHighOpVT != UZP1VT)
24384 TruncHighOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncHighOp);
24385 if (TruncLowOpVT != UZP1VT)
24386 TruncLowOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncLowOp);
24387
24388 SDValue UZP1 =
24389 DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UZP1VT, N1: TruncLowOp, N2: TruncHighOp);
24390 SDValue HighIdxCst =
24391 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24392 SDValue NewTruncHigh =
24393 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncHighVT, N1: UZP1, N2: HighIdxCst);
24394 DAG.ReplaceAllUsesWith(From: TruncHigh, To: NewTruncHigh);
24395
24396 if (HasFoundMULLow) {
24397 EVT TruncLowVT = TruncLow.getValueType();
24398 SDValue NewTruncLow = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncLowVT,
24399 N1: UZP1, N2: ExtractLow.getOperand(i: 1));
24400 DAG.ReplaceAllUsesWith(From: TruncLow, To: NewTruncLow);
24401 }
24402
24403 return SDValue(N, 0);
24404}
24405
24406static SDValue performMULLCombine(SDNode *N,
24407 TargetLowering::DAGCombinerInfo &DCI,
24408 SelectionDAG &DAG) {
24409 if (SDValue Val =
24410 tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N, DCI, DAG))
24411 return Val;
24412
24413 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24414 return Val;
24415
24416 return SDValue();
24417}
24418
24419static SDValue
24420performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24421 SelectionDAG &DAG) {
24422 // Perform the transform below:
24423 //
24424 // t34: v4i32 = AArch64ISD::UADDLV t2
24425 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24426 // t7: i64 = zero_extend t35
24427 // t20: v1i64 = scalar_to_vector t7
24428 // ==>
24429 // t34: v4i32 = AArch64ISD::UADDLV t2
24430 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24431 // t40: v1i64 = AArch64ISD::NVCAST t39
24432 if (DCI.isBeforeLegalizeOps())
24433 return SDValue();
24434
24435 EVT VT = N->getValueType(ResNo: 0);
24436 if (VT != MVT::v1i64)
24437 return SDValue();
24438
24439 SDValue ZEXT = N->getOperand(Num: 0);
24440 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24441 return SDValue();
24442
24443 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(i: 0);
24444 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24445 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24446 return SDValue();
24447
24448 if (!isNullConstant(V: EXTRACT_VEC_ELT.getOperand(i: 1)))
24449 return SDValue();
24450
24451 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(i: 0);
24452 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24453 UADDLV.getValueType() != MVT::v4i32 ||
24454 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24455 return SDValue();
24456
24457 // Generate the new sequence using AArch64ISD::NVCAST.
24458 SDLoc DL(N);
24459 SDValue EXTRACT_SUBVEC =
24460 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24461 DAG.getConstant(0, DL, MVT::i64));
24462 SDValue NVCAST =
24463 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24464
24465 return NVCAST;
24466}
24467
24468SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
24469 DAGCombinerInfo &DCI) const {
24470 SelectionDAG &DAG = DCI.DAG;
24471 switch (N->getOpcode()) {
24472 default:
24473 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24474 break;
24475 case ISD::VECREDUCE_AND:
24476 case ISD::VECREDUCE_OR:
24477 case ISD::VECREDUCE_XOR:
24478 return performVecReduceBitwiseCombine(N, DCI, DAG);
24479 case ISD::ADD:
24480 case ISD::SUB:
24481 return performAddSubCombine(N, DCI);
24482 case ISD::BUILD_VECTOR:
24483 return performBuildVectorCombine(N, DCI, DAG);
24484 case ISD::TRUNCATE:
24485 return performTruncateCombine(N, DAG);
24486 case AArch64ISD::ANDS:
24487 return performFlagSettingCombine(N, DCI, GenericOpcode: ISD::AND);
24488 case AArch64ISD::ADC:
24489 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true))
24490 return R;
24491 return foldADCToCINC(N, DAG);
24492 case AArch64ISD::SBC:
24493 return foldOverflowCheck(Op: N, DAG, /* IsAdd */ false);
24494 case AArch64ISD::ADCS:
24495 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true))
24496 return R;
24497 return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::ADC);
24498 case AArch64ISD::SBCS:
24499 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ false))
24500 return R;
24501 return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::SBC);
24502 case AArch64ISD::BICi: {
24503 APInt DemandedBits =
24504 APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getScalarSizeInBits());
24505 APInt DemandedElts =
24506 APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getVectorNumElements());
24507
24508 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
24509 Op: SDValue(N, 0), DemandedBits, DemandedElts, DCI))
24510 return SDValue();
24511
24512 break;
24513 }
24514 case ISD::XOR:
24515 return performXorCombine(N, DAG, DCI, Subtarget);
24516 case ISD::MUL:
24517 return performMulCombine(N, DAG, DCI, Subtarget);
24518 case ISD::SINT_TO_FP:
24519 case ISD::UINT_TO_FP:
24520 return performIntToFpCombine(N, DAG, Subtarget);
24521 case ISD::FP_TO_SINT:
24522 case ISD::FP_TO_UINT:
24523 case ISD::FP_TO_SINT_SAT:
24524 case ISD::FP_TO_UINT_SAT:
24525 return performFpToIntCombine(N, DAG, DCI, Subtarget);
24526 case ISD::FDIV:
24527 return performFDivCombine(N, DAG, DCI, Subtarget);
24528 case ISD::OR:
24529 return performORCombine(N, DCI, Subtarget, TLI: *this);
24530 case ISD::AND:
24531 return performANDCombine(N, DCI);
24532 case ISD::FADD:
24533 return performFADDCombine(N, DCI);
24534 case ISD::INTRINSIC_WO_CHAIN:
24535 return performIntrinsicCombine(N, DCI, Subtarget);
24536 case ISD::ANY_EXTEND:
24537 case ISD::ZERO_EXTEND:
24538 case ISD::SIGN_EXTEND:
24539 return performExtendCombine(N, DCI, DAG);
24540 case ISD::SIGN_EXTEND_INREG:
24541 return performSignExtendInRegCombine(N, DCI, DAG);
24542 case ISD::CONCAT_VECTORS:
24543 return performConcatVectorsCombine(N, DCI, DAG);
24544 case ISD::EXTRACT_SUBVECTOR:
24545 return performExtractSubvectorCombine(N, DCI, DAG);
24546 case ISD::INSERT_SUBVECTOR:
24547 return performInsertSubvectorCombine(N, DCI, DAG);
24548 case ISD::SELECT:
24549 return performSelectCombine(N, DCI);
24550 case ISD::VSELECT:
24551 return performVSelectCombine(N, DAG&: DCI.DAG);
24552 case ISD::SETCC:
24553 return performSETCCCombine(N, DCI, DAG);
24554 case ISD::LOAD:
24555 return performLOADCombine(N, DCI, DAG, Subtarget);
24556 case ISD::STORE:
24557 return performSTORECombine(N, DCI, DAG, Subtarget);
24558 case ISD::MSTORE:
24559 return performMSTORECombine(N, DCI, DAG, Subtarget);
24560 case ISD::MGATHER:
24561 case ISD::MSCATTER:
24562 return performMaskedGatherScatterCombine(N, DCI, DAG);
24563 case ISD::VECTOR_SPLICE:
24564 return performSVESpliceCombine(N, DAG);
24565 case ISD::FP_EXTEND:
24566 return performFPExtendCombine(N, DAG, DCI, Subtarget);
24567 case AArch64ISD::BRCOND:
24568 return performBRCONDCombine(N, DCI, DAG);
24569 case AArch64ISD::TBNZ:
24570 case AArch64ISD::TBZ:
24571 return performTBZCombine(N, DCI, DAG);
24572 case AArch64ISD::CSEL:
24573 return performCSELCombine(N, DCI, DAG);
24574 case AArch64ISD::DUP:
24575 case AArch64ISD::DUPLANE8:
24576 case AArch64ISD::DUPLANE16:
24577 case AArch64ISD::DUPLANE32:
24578 case AArch64ISD::DUPLANE64:
24579 return performDUPCombine(N, DCI);
24580 case AArch64ISD::DUPLANE128:
24581 return performDupLane128Combine(N, DAG);
24582 case AArch64ISD::NVCAST:
24583 return performNVCASTCombine(N, DAG);
24584 case AArch64ISD::SPLICE:
24585 return performSpliceCombine(N, DAG);
24586 case AArch64ISD::UUNPKLO:
24587 case AArch64ISD::UUNPKHI:
24588 return performUnpackCombine(N, DAG, Subtarget);
24589 case AArch64ISD::UZP1:
24590 return performUzpCombine(N, DAG, Subtarget);
24591 case AArch64ISD::SETCC_MERGE_ZERO:
24592 return performSetccMergeZeroCombine(N, DCI);
24593 case AArch64ISD::REINTERPRET_CAST:
24594 return performReinterpretCastCombine(N);
24595 case AArch64ISD::GLD1_MERGE_ZERO:
24596 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
24597 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
24598 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
24599 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
24600 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
24601 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
24602 case AArch64ISD::GLD1S_MERGE_ZERO:
24603 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
24604 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
24605 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
24606 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
24607 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
24608 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
24609 return performGLD1Combine(N, DAG);
24610 case AArch64ISD::VASHR:
24611 case AArch64ISD::VLSHR:
24612 return performVectorShiftCombine(N, TLI: *this, DCI);
24613 case AArch64ISD::SUNPKLO:
24614 return performSunpkloCombine(N, DAG);
24615 case AArch64ISD::BSP:
24616 return performBSPExpandForSVE(N, DAG, Subtarget);
24617 case ISD::INSERT_VECTOR_ELT:
24618 return performInsertVectorEltCombine(N, DCI);
24619 case ISD::EXTRACT_VECTOR_ELT:
24620 return performExtractVectorEltCombine(N, DCI, Subtarget);
24621 case ISD::VECREDUCE_ADD:
24622 return performVecReduceAddCombine(N, DAG&: DCI.DAG, ST: Subtarget);
24623 case AArch64ISD::UADDV:
24624 return performUADDVCombine(N, DAG);
24625 case AArch64ISD::SMULL:
24626 case AArch64ISD::UMULL:
24627 case AArch64ISD::PMULL:
24628 return performMULLCombine(N, DCI, DAG);
24629 case ISD::INTRINSIC_VOID:
24630 case ISD::INTRINSIC_W_CHAIN:
24631 switch (N->getConstantOperandVal(Num: 1)) {
24632 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
24633 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 1 /*=ScalarSizeInBytes*/);
24634 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
24635 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 2 /*=ScalarSizeInBytes*/);
24636 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
24637 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 4 /*=ScalarSizeInBytes*/);
24638 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
24639 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 8 /*=ScalarSizeInBytes*/);
24640 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
24641 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
24642 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
24643 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
24644 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
24645 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
24646 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
24647 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
24648 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
24649 case Intrinsic::aarch64_neon_ld2:
24650 case Intrinsic::aarch64_neon_ld3:
24651 case Intrinsic::aarch64_neon_ld4:
24652 case Intrinsic::aarch64_neon_ld1x2:
24653 case Intrinsic::aarch64_neon_ld1x3:
24654 case Intrinsic::aarch64_neon_ld1x4:
24655 case Intrinsic::aarch64_neon_ld2lane:
24656 case Intrinsic::aarch64_neon_ld3lane:
24657 case Intrinsic::aarch64_neon_ld4lane:
24658 case Intrinsic::aarch64_neon_ld2r:
24659 case Intrinsic::aarch64_neon_ld3r:
24660 case Intrinsic::aarch64_neon_ld4r:
24661 case Intrinsic::aarch64_neon_st2:
24662 case Intrinsic::aarch64_neon_st3:
24663 case Intrinsic::aarch64_neon_st4:
24664 case Intrinsic::aarch64_neon_st1x2:
24665 case Intrinsic::aarch64_neon_st1x3:
24666 case Intrinsic::aarch64_neon_st1x4:
24667 case Intrinsic::aarch64_neon_st2lane:
24668 case Intrinsic::aarch64_neon_st3lane:
24669 case Intrinsic::aarch64_neon_st4lane:
24670 return performNEONPostLDSTCombine(N, DCI, DAG);
24671 case Intrinsic::aarch64_sve_ldnt1:
24672 return performLDNT1Combine(N, DAG);
24673 case Intrinsic::aarch64_sve_ld1rq:
24674 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
24675 case Intrinsic::aarch64_sve_ld1ro:
24676 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
24677 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
24678 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
24679 case Intrinsic::aarch64_sve_ldnt1_gather:
24680 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
24681 case Intrinsic::aarch64_sve_ldnt1_gather_index:
24682 return performGatherLoadCombine(N, DAG,
24683 Opcode: AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
24684 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
24685 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
24686 case Intrinsic::aarch64_sve_ld1:
24687 return performLD1Combine(N, DAG, Opc: AArch64ISD::LD1_MERGE_ZERO);
24688 case Intrinsic::aarch64_sve_ldnf1:
24689 return performLD1Combine(N, DAG, Opc: AArch64ISD::LDNF1_MERGE_ZERO);
24690 case Intrinsic::aarch64_sve_ldff1:
24691 return performLD1Combine(N, DAG, Opc: AArch64ISD::LDFF1_MERGE_ZERO);
24692 case Intrinsic::aarch64_sve_st1:
24693 return performST1Combine(N, DAG);
24694 case Intrinsic::aarch64_sve_stnt1:
24695 return performSTNT1Combine(N, DAG);
24696 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
24697 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
24698 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
24699 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
24700 case Intrinsic::aarch64_sve_stnt1_scatter:
24701 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
24702 case Intrinsic::aarch64_sve_stnt1_scatter_index:
24703 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_INDEX_PRED);
24704 case Intrinsic::aarch64_sve_ld1_gather:
24705 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_MERGE_ZERO);
24706 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
24707 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
24708 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1Q_MERGE_ZERO);
24709 case Intrinsic::aarch64_sve_ld1q_gather_index:
24710 return performGatherLoadCombine(N, DAG,
24711 Opcode: AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
24712 case Intrinsic::aarch64_sve_ld1_gather_index:
24713 return performGatherLoadCombine(N, DAG,
24714 Opcode: AArch64ISD::GLD1_SCALED_MERGE_ZERO);
24715 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
24716 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_SXTW_MERGE_ZERO,
24717 /*OnlyPackedOffsets=*/false);
24718 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
24719 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_UXTW_MERGE_ZERO,
24720 /*OnlyPackedOffsets=*/false);
24721 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
24722 return performGatherLoadCombine(N, DAG,
24723 Opcode: AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
24724 /*OnlyPackedOffsets=*/false);
24725 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
24726 return performGatherLoadCombine(N, DAG,
24727 Opcode: AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
24728 /*OnlyPackedOffsets=*/false);
24729 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
24730 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_IMM_MERGE_ZERO);
24731 case Intrinsic::aarch64_sve_ldff1_gather:
24732 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDFF1_MERGE_ZERO);
24733 case Intrinsic::aarch64_sve_ldff1_gather_index:
24734 return performGatherLoadCombine(N, DAG,
24735 Opcode: AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
24736 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
24737 return performGatherLoadCombine(N, DAG,
24738 Opcode: AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
24739 /*OnlyPackedOffsets=*/false);
24740 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
24741 return performGatherLoadCombine(N, DAG,
24742 Opcode: AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
24743 /*OnlyPackedOffsets=*/false);
24744 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
24745 return performGatherLoadCombine(N, DAG,
24746 Opcode: AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
24747 /*OnlyPackedOffsets=*/false);
24748 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
24749 return performGatherLoadCombine(N, DAG,
24750 Opcode: AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
24751 /*OnlyPackedOffsets=*/false);
24752 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
24753 return performGatherLoadCombine(N, DAG,
24754 Opcode: AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
24755 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
24756 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
24757 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_PRED);
24758 case Intrinsic::aarch64_sve_st1q_scatter_index:
24759 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_INDEX_PRED);
24760 case Intrinsic::aarch64_sve_st1_scatter:
24761 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_PRED);
24762 case Intrinsic::aarch64_sve_st1_scatter_index:
24763 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SCALED_PRED);
24764 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
24765 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SXTW_PRED,
24766 /*OnlyPackedOffsets=*/false);
24767 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
24768 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_UXTW_PRED,
24769 /*OnlyPackedOffsets=*/false);
24770 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
24771 return performScatterStoreCombine(N, DAG,
24772 Opcode: AArch64ISD::SST1_SXTW_SCALED_PRED,
24773 /*OnlyPackedOffsets=*/false);
24774 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
24775 return performScatterStoreCombine(N, DAG,
24776 Opcode: AArch64ISD::SST1_UXTW_SCALED_PRED,
24777 /*OnlyPackedOffsets=*/false);
24778 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
24779 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_IMM_PRED);
24780 case Intrinsic::aarch64_rndr:
24781 case Intrinsic::aarch64_rndrrs: {
24782 unsigned IntrinsicID = N->getConstantOperandVal(Num: 1);
24783 auto Register =
24784 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
24785 : AArch64SysReg::RNDRRS);
24786 SDLoc DL(N);
24787 SDValue A = DAG.getNode(
24788 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
24789 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
24790 SDValue B = DAG.getNode(
24791 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
24792 DAG.getConstant(0, DL, MVT::i32),
24793 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
24794 return DAG.getMergeValues(
24795 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
24796 }
24797 case Intrinsic::aarch64_sme_ldr_zt:
24798 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
24799 DAG.getVTList(MVT::Other), N->getOperand(0),
24800 N->getOperand(2), N->getOperand(3));
24801 case Intrinsic::aarch64_sme_str_zt:
24802 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
24803 DAG.getVTList(MVT::Other), N->getOperand(0),
24804 N->getOperand(2), N->getOperand(3));
24805 default:
24806 break;
24807 }
24808 break;
24809 case ISD::GlobalAddress:
24810 return performGlobalAddressCombine(N, DAG, Subtarget, TM: getTargetMachine());
24811 case ISD::CTLZ:
24812 return performCTLZCombine(N, DAG, Subtarget);
24813 case ISD::SCALAR_TO_VECTOR:
24814 return performScalarToVectorCombine(N, DCI, DAG);
24815 }
24816 return SDValue();
24817}
24818
24819 // Check if the return value is used only as a return value, as otherwise
24820// we can't perform a tail-call. In particular, we need to check for
24821// target ISD nodes that are returns and any other "odd" constructs
24822// that the generic analysis code won't necessarily catch.
24823bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24824 SDValue &Chain) const {
24825 if (N->getNumValues() != 1)
24826 return false;
24827 if (!N->hasNUsesOfValue(NUses: 1, Value: 0))
24828 return false;
24829
24830 SDValue TCChain = Chain;
24831 SDNode *Copy = *N->use_begin();
24832 if (Copy->getOpcode() == ISD::CopyToReg) {
24833 // If the copy has a glue operand, we conservatively assume it isn't safe to
24834 // perform a tail call.
24835 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
24836 MVT::Glue)
24837 return false;
24838 TCChain = Copy->getOperand(Num: 0);
24839 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
24840 return false;
24841
24842 bool HasRet = false;
24843 for (SDNode *Node : Copy->uses()) {
24844 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
24845 return false;
24846 HasRet = true;
24847 }
24848
24849 if (!HasRet)
24850 return false;
24851
24852 Chain = TCChain;
24853 return true;
24854}
24855
24856 // Return whether an instruction can potentially be optimized to a tail
24857 // call. This will cause the optimizers to attempt to move or duplicate
24858// return instructions to help enable tail call optimizations for this
24859// instruction.
24860bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
24861 return CI->isTailCall();
24862}
24863
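// GlobalISel hook: return true if the given constant offset is legal for a
// pre- or post-indexed addressing mode, i.e. it is non-zero and fits a signed
// 9-bit immediate.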
24864bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
24865 Register Offset, bool IsPre,
24866 MachineRegisterInfo &MRI) const {
24867 auto CstOffset = getIConstantVRegVal(VReg: Offset, MRI);
24868 if (!CstOffset || CstOffset->isZero())
24869 return false;
24870
24871 // All of the indexed addressing mode instructions take a signed 9 bit
24872 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
24873 // encodes the sign/indexing direction.
24874 return isInt<9>(x: CstOffset->getSExtValue());
24875}
24876
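// Try to split (add/sub Base, Cst) into the Base and Offset operands of an
// indexed load/store. Bails out when the only non-chain user of the value is a
// scalable-vector splat, where a replicating load (ld1r*) is preferable.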
24877bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
24878 SDValue &Base,
24879 SDValue &Offset,
24880 SelectionDAG &DAG) const {
24881 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
24882 return false;
24883
24884 // Non-null if there is exactly one user of the loaded value (ignoring chain).
24885 SDNode *ValOnlyUser = nullptr;
24886 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
24887 ++UI) {
24888 if (UI.getUse().getResNo() == 1)
24889 continue; // Ignore chain.
24890 if (ValOnlyUser == nullptr)
24891 ValOnlyUser = *UI;
24892 else {
24893 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
24894 break;
24895 }
24896 }
24897
24898 auto IsUndefOrZero = [](SDValue V) {
24899 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
24900 };
24901
24902 // If the only user of the value is a scalable vector splat, it is
24903 // preferable to do a replicating load (ld1r*).
24904 if (ValOnlyUser && ValOnlyUser->getValueType(ResNo: 0).isScalableVector() &&
24905 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
24906 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
24907 IsUndefOrZero(ValOnlyUser->getOperand(Num: 2)))))
24908 return false;
24909
24910 Base = Op->getOperand(Num: 0);
24911 // All of the indexed addressing mode instructions take a signed
24912 // 9 bit immediate offset.
24913 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1))) {
24914 int64_t RHSC = RHS->getSExtValue();
24915 if (Op->getOpcode() == ISD::SUB)
24916 RHSC = -(uint64_t)RHSC;
24917 if (!isInt<9>(x: RHSC))
24918 return false;
24919 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
24920 // when dealing with subtraction.
24921 Offset = DAG.getConstant(Val: RHSC, DL: SDLoc(N), VT: RHS->getValueType(ResNo: 0));
24922 return true;
24923 }
24924 return false;
24925}
24926
24927bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
24928 SDValue &Offset,
24929 ISD::MemIndexedMode &AM,
24930 SelectionDAG &DAG) const {
24931 EVT VT;
24932 SDValue Ptr;
24933 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
24934 VT = LD->getMemoryVT();
24935 Ptr = LD->getBasePtr();
24936 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
24937 VT = ST->getMemoryVT();
24938 Ptr = ST->getBasePtr();
24939 } else
24940 return false;
24941
24942 if (!getIndexedAddressParts(N, Op: Ptr.getNode(), Base, Offset, DAG))
24943 return false;
24944 AM = ISD::PRE_INC;
24945 return true;
24946}
24947
24948bool AArch64TargetLowering::getPostIndexedAddressParts(
24949 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
24950 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
24951 EVT VT;
24952 SDValue Ptr;
24953 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
24954 VT = LD->getMemoryVT();
24955 Ptr = LD->getBasePtr();
24956 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
24957 VT = ST->getMemoryVT();
24958 Ptr = ST->getBasePtr();
24959 } else
24960 return false;
24961
24962 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
24963 return false;
24964 // Post-indexing updates the base, so it's not a valid transform
24965 // if that's not the same as the load's pointer.
24966 if (Ptr != Base)
24967 return false;
24968 AM = ISD::POST_INC;
24969 return true;
24970}
24971
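// Replace a bitcast from a vector of i1 to a scalar integer by materialising
// the predicate as a bitmask and zero-extending/truncating it to the result
// type.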
24972static void replaceBoolVectorBitcast(SDNode *N,
24973 SmallVectorImpl<SDValue> &Results,
24974 SelectionDAG &DAG) {
24975 SDLoc DL(N);
24976 SDValue Op = N->getOperand(Num: 0);
24977 EVT VT = N->getValueType(ResNo: 0);
24978 [[maybe_unused]] EVT SrcVT = Op.getValueType();
24979 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
24980 "Must be bool vector.");
24981
24982 // Special handling for Clang's __builtin_convertvector. For vectors with <8
24983 // elements, it adds a vector concatenation with undef(s). If we encounter
24984 // this here, we can skip the concat.
24985 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(i: 0).isUndef()) {
24986 bool AllUndef = true;
24987 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
24988 AllUndef &= Op.getOperand(i: I).isUndef();
24989
24990 if (AllUndef)
24991 Op = Op.getOperand(i: 0);
24992 }
24993
24994 SDValue VectorBits = vectorToScalarBitmask(N: Op.getNode(), DAG);
24995 if (VectorBits)
24996 Results.push_back(Elt: DAG.getZExtOrTrunc(Op: VectorBits, DL, VT));
24997}
24998
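// Expand a bitcast whose result type is not legal (e.g. v2i16) by inserting
// the scalar into lane 0 of a wider vector (ExtendVT), bitcasting that to
// CastVT and extracting the subvector that holds the original bits.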
24999static void CustomNonLegalBITCASTResults(SDNode *N,
25000 SmallVectorImpl<SDValue> &Results,
25001 SelectionDAG &DAG, EVT ExtendVT,
25002 EVT CastVT) {
25003 SDLoc DL(N);
25004 SDValue Op = N->getOperand(Num: 0);
25005 EVT VT = N->getValueType(ResNo: 0);
25006
25007 // Use SCALAR_TO_VECTOR for lane zero
25008 SDValue Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: ExtendVT, Operand: Op);
25009 SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: CastVT, Operand: Vec);
25010 SDValue IdxZero = DAG.getVectorIdxConstant(Val: 0, DL);
25011 Results.push_back(
25012 Elt: DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: CastVal, N2: IdxZero));
25013}
25014
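// Custom result legalisation for BITCAST: handles small illegal integer-vector
// results, illegal scalable fp->int casts, bool-vector sources and
// f16/bf16 -> i16 casts.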
25015void AArch64TargetLowering::ReplaceBITCASTResults(
25016 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25017 SDLoc DL(N);
25018 SDValue Op = N->getOperand(Num: 0);
25019 EVT VT = N->getValueType(ResNo: 0);
25020 EVT SrcVT = Op.getValueType();
25021
25022 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25023 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25024 return;
25025 }
25026
25027 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25028 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25029 return;
25030 }
25031
25032 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25033 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25034 return;
25035 }
25036
25037 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(VT: SrcVT)) {
25038 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25039 "Expected fp->int bitcast!");
25040
25041 // Bitcasting between unpacked vector types of different element counts is
25042 // not a NOP because the live elements are laid out differently.
25043 // 01234567
25044 // e.g. nxv2i32 = XX??XX??
25045 // nxv4f16 = X?X?X?X?
25046 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25047 return;
25048
25049 SDValue CastResult = getSVESafeBitCast(VT: getSVEContainerType(ContentTy: VT), Op, DAG);
25050 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: CastResult));
25051 return;
25052 }
25053
25054 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25055 !VT.isVector())
25056 return replaceBoolVectorBitcast(N, Results, DAG);
25057
25058 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25059 return;
25060
25061 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25062 DAG.getUNDEF(MVT::i32), Op);
25063 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25064 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25065}
25066
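// Replace a 256-bit ADD/FADD where one operand is the other operand shuffled
// with mask <1,0,3,2,...> by an ADDP of the two 128-bit halves, then shuffle
// the pairwise sums back into the original lane order.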
25067static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
25068 SelectionDAG &DAG,
25069 const AArch64Subtarget *Subtarget) {
25070 EVT VT = N->getValueType(ResNo: 0);
25071 if (!VT.is256BitVector() ||
25072 (VT.getScalarType().isFloatingPoint() &&
25073 !N->getFlags().hasAllowReassociation()) ||
25074 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25075 VT.getScalarType() == MVT::bf16)
25076 return;
25077
25078 SDValue X = N->getOperand(Num: 0);
25079 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 1));
25080 if (!Shuf) {
25081 Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 0));
25082 X = N->getOperand(Num: 1);
25083 if (!Shuf)
25084 return;
25085 }
25086
25087 if (Shuf->getOperand(Num: 0) != X || !Shuf->getOperand(Num: 1)->isUndef())
25088 return;
25089
25090 // Check the mask is 1,0,3,2,5,4,...
25091 ArrayRef<int> Mask = Shuf->getMask();
25092 for (int I = 0, E = Mask.size(); I < E; I++)
25093 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25094 return;
25095
25096 SDLoc DL(N);
25097 auto LoHi = DAG.SplitVector(N: X, DL);
25098 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25099 SDValue Addp = DAG.getNode(Opcode: AArch64ISD::ADDP, DL: N, VT: LoHi.first.getValueType(),
25100 N1: LoHi.first, N2: LoHi.second);
25101
25102 // Shuffle the elements back into order.
25103 SmallVector<int> NMask;
25104 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25105 NMask.push_back(Elt: I);
25106 NMask.push_back(Elt: I);
25107 }
25108 Results.push_back(
25109 Elt: DAG.getVectorShuffle(VT, dl: DL,
25110 N1: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Addp,
25111 N2: DAG.getUNDEF(VT: LoHi.first.getValueType())),
25112 N2: DAG.getUNDEF(VT), Mask: NMask));
25113}
25114
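// Legalize a reduction over an illegally wide vector by splitting the operand
// in half, combining the halves with InterOp, and finishing with the
// across-vector AcrossOp node.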
25115static void ReplaceReductionResults(SDNode *N,
25116 SmallVectorImpl<SDValue> &Results,
25117 SelectionDAG &DAG, unsigned InterOp,
25118 unsigned AcrossOp) {
25119 EVT LoVT, HiVT;
25120 SDValue Lo, Hi;
25121 SDLoc dl(N);
25122 std::tie(args&: LoVT, args&: HiVT) = DAG.GetSplitDestVTs(VT: N->getValueType(ResNo: 0));
25123 std::tie(args&: Lo, args&: Hi) = DAG.SplitVectorOperand(N, OpNo: 0);
25124 SDValue InterVal = DAG.getNode(Opcode: InterOp, DL: dl, VT: LoVT, N1: Lo, N2: Hi);
25125 SDValue SplitVal = DAG.getNode(Opcode: AcrossOp, DL: dl, VT: LoVT, Operand: InterVal);
25126 Results.push_back(Elt: SplitVal);
25127}
25128
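// Custom result legalisation for EXTRACT_SUBVECTOR of scalable integer
// vectors: an extract of the low or high half is lowered to UUNPKLO/UUNPKHI on
// the widened element type followed by a truncate.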
25129void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25130 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25131 SDValue In = N->getOperand(Num: 0);
25132 EVT InVT = In.getValueType();
25133
25134 // Common code will handle these just fine.
25135 if (!InVT.isScalableVector() || !InVT.isInteger())
25136 return;
25137
25138 SDLoc DL(N);
25139 EVT VT = N->getValueType(ResNo: 0);
25140
25141 // The following checks bail if this is not a halving operation.
25142
25143 ElementCount ResEC = VT.getVectorElementCount();
25144
25145 if (InVT.getVectorElementCount() != (ResEC * 2))
25146 return;
25147
25148 auto *CIndex = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
25149 if (!CIndex)
25150 return;
25151
25152 unsigned Index = CIndex->getZExtValue();
25153 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25154 return;
25155
25156 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25157 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext());
25158
25159 SDValue Half = DAG.getNode(Opcode, DL, VT: ExtendedHalfVT, Operand: N->getOperand(Num: 0));
25160 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Half));
25161}
25162
25163// Create an even/odd pair of X registers holding integer value V.
25164static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
25165 SDLoc dl(V.getNode());
25166 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25167 if (DAG.getDataLayout().isBigEndian())
25168 std::swap(VLo, VHi);
25169 SDValue RegClass =
25170 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25171 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25172 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25173 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25174 return SDValue(
25175 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25176}
25177
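// Expand a 128-bit ATOMIC_CMP_SWAP either to a CASP instruction operating on
// an X-register pair (when LSE or outline atomics are available) or to one of
// the CMP_SWAP_128* pseudo instructions otherwise.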
25178static void ReplaceCMP_SWAP_128Results(SDNode *N,
25179 SmallVectorImpl<SDValue> &Results,
25180 SelectionDAG &DAG,
25181 const AArch64Subtarget *Subtarget) {
25182 assert(N->getValueType(0) == MVT::i128 &&
25183 "AtomicCmpSwap on types less than 128 should be legal");
25184
25185 MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand();
25186 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25187 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25188 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25189 SDValue Ops[] = {
25190 createGPRPairNode(DAG, V: N->getOperand(Num: 2)), // Compare value
25191 createGPRPairNode(DAG, V: N->getOperand(Num: 3)), // Store value
25192 N->getOperand(Num: 1), // Ptr
25193 N->getOperand(Num: 0), // Chain in
25194 };
25195
25196 unsigned Opcode;
25197 switch (MemOp->getMergedOrdering()) {
25198 case AtomicOrdering::Monotonic:
25199 Opcode = AArch64::CASPX;
25200 break;
25201 case AtomicOrdering::Acquire:
25202 Opcode = AArch64::CASPAX;
25203 break;
25204 case AtomicOrdering::Release:
25205 Opcode = AArch64::CASPLX;
25206 break;
25207 case AtomicOrdering::AcquireRelease:
25208 case AtomicOrdering::SequentiallyConsistent:
25209 Opcode = AArch64::CASPALX;
25210 break;
25211 default:
25212 llvm_unreachable("Unexpected ordering!");
25213 }
25214
25215 MachineSDNode *CmpSwap = DAG.getMachineNode(
25216 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25217 DAG.setNodeMemRefs(N: CmpSwap, NewMemRefs: {MemOp});
25218
25219 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25220 if (DAG.getDataLayout().isBigEndian())
25221 std::swap(a&: SubReg1, b&: SubReg2);
25222 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25223 SDValue(CmpSwap, 0));
25224 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25225 SDValue(CmpSwap, 0));
25226 Results.push_back(
25227 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25228 Results.push_back(Elt: SDValue(CmpSwap, 1)); // Chain out
25229 return;
25230 }
25231
25232 unsigned Opcode;
25233 switch (MemOp->getMergedOrdering()) {
25234 case AtomicOrdering::Monotonic:
25235 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25236 break;
25237 case AtomicOrdering::Acquire:
25238 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25239 break;
25240 case AtomicOrdering::Release:
25241 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25242 break;
25243 case AtomicOrdering::AcquireRelease:
25244 case AtomicOrdering::SequentiallyConsistent:
25245 Opcode = AArch64::CMP_SWAP_128;
25246 break;
25247 default:
25248 llvm_unreachable("Unexpected ordering!");
25249 }
25250
25251 SDLoc DL(N);
25252 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25253 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25254 SDValue Ops[] = {N->getOperand(Num: 1), Desired.first, Desired.second,
25255 New.first, New.second, N->getOperand(Num: 0)};
25256 SDNode *CmpSwap = DAG.getMachineNode(
25257 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25258 Ops);
25259 DAG.setNodeMemRefs(N: cast<MachineSDNode>(Val: CmpSwap), NewMemRefs: {MemOp});
25260
25261 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25262 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25263 Results.push_back(Elt: SDValue(CmpSwap, 3));
25264}
25265
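// Map a 128-bit atomicrmw ISD opcode and memory ordering onto the matching
// LSE128 instruction (the LDCLRP*, LDSETP* or SWPP* variants).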
25266static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25267 AtomicOrdering Ordering) {
25268 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25269 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25270 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25271 // ATOMIC_LOAD_CLR at any point.
25272 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25273 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25274 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25275 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25276
25277 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25278 // The operand will need to be XORed in a separate step.
25279 switch (Ordering) {
25280 case AtomicOrdering::Monotonic:
25281 return AArch64::LDCLRP;
25282 break;
25283 case AtomicOrdering::Acquire:
25284 return AArch64::LDCLRPA;
25285 break;
25286 case AtomicOrdering::Release:
25287 return AArch64::LDCLRPL;
25288 break;
25289 case AtomicOrdering::AcquireRelease:
25290 case AtomicOrdering::SequentiallyConsistent:
25291 return AArch64::LDCLRPAL;
25292 break;
25293 default:
25294 llvm_unreachable("Unexpected ordering!");
25295 }
25296 }
25297
25298 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25299 switch (Ordering) {
25300 case AtomicOrdering::Monotonic:
25301 return AArch64::LDSETP;
25302 break;
25303 case AtomicOrdering::Acquire:
25304 return AArch64::LDSETPA;
25305 break;
25306 case AtomicOrdering::Release:
25307 return AArch64::LDSETPL;
25308 break;
25309 case AtomicOrdering::AcquireRelease:
25310 case AtomicOrdering::SequentiallyConsistent:
25311 return AArch64::LDSETPAL;
25312 break;
25313 default:
25314 llvm_unreachable("Unexpected ordering!");
25315 }
25316 }
25317
25318 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25319 switch (Ordering) {
25320 case AtomicOrdering::Monotonic:
25321 return AArch64::SWPP;
25322 break;
25323 case AtomicOrdering::Acquire:
25324 return AArch64::SWPPA;
25325 break;
25326 case AtomicOrdering::Release:
25327 return AArch64::SWPPL;
25328 break;
25329 case AtomicOrdering::AcquireRelease:
25330 case AtomicOrdering::SequentiallyConsistent:
25331 return AArch64::SWPPAL;
25332 break;
25333 default:
25334 llvm_unreachable("Unexpected ordering!");
25335 }
25336 }
25337
25338 llvm_unreachable("Unexpected ISDOpcode!");
25339}
25340
25341static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
25342 SmallVectorImpl<SDValue> &Results,
25343 SelectionDAG &DAG,
25344 const AArch64Subtarget *Subtarget) {
25345 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
25346 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25347 // rather than the CASP instructions, because CASP has register classes for
25348 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25349 // to present them as single operands. LSE128 instructions use the GPR64
25350 // register class (because the pair does not have to be sequential), like
25351 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25352
25353 assert(N->getValueType(0) == MVT::i128 &&
25354 "AtomicLoadXXX on types less than 128 should be legal");
25355
25356 if (!Subtarget->hasLSE128())
25357 return;
25358
25359 MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand();
25360 const SDValue &Chain = N->getOperand(Num: 0);
25361 const SDValue &Ptr = N->getOperand(Num: 1);
25362 const SDValue &Val128 = N->getOperand(Num: 2);
25363 std::pair<SDValue, SDValue> Val2x64 =
25364 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25365
25366 const unsigned ISDOpcode = N->getOpcode();
25367 const unsigned MachineOpcode =
25368 getAtomicLoad128Opcode(ISDOpcode, Ordering: MemOp->getMergedOrdering());
25369
25370 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25371 SDLoc dl(Val128);
25372 Val2x64.first =
25373 DAG.getNode(ISD::XOR, dl, MVT::i64,
25374 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25375 Val2x64.second =
25376 DAG.getNode(ISD::XOR, dl, MVT::i64,
25377 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25378 }
25379
25380 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25381 if (DAG.getDataLayout().isBigEndian())
25382 std::swap(a&: Ops[0], b&: Ops[1]);
25383
25384 MachineSDNode *AtomicInst =
25385 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25386 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25387
25388 DAG.setNodeMemRefs(N: AtomicInst, NewMemRefs: {MemOp});
25389
25390 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25391 if (DAG.getDataLayout().isBigEndian())
25392 std::swap(a&: Lo, b&: Hi);
25393
25394 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25395 Results.push_back(Elt: SDValue(AtomicInst, 2)); // Chain out
25396}
25397
25398void AArch64TargetLowering::ReplaceNodeResults(
25399 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25400 switch (N->getOpcode()) {
25401 default:
25402 llvm_unreachable("Don't know how to custom expand this");
25403 case ISD::BITCAST:
25404 ReplaceBITCASTResults(N, Results, DAG);
25405 return;
25406 case ISD::VECREDUCE_ADD:
25407 case ISD::VECREDUCE_SMAX:
25408 case ISD::VECREDUCE_SMIN:
25409 case ISD::VECREDUCE_UMAX:
25410 case ISD::VECREDUCE_UMIN:
25411 Results.push_back(Elt: LowerVECREDUCE(Op: SDValue(N, 0), DAG));
25412 return;
25413 case ISD::ADD:
25414 case ISD::FADD:
25415 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25416 return;
25417
25418 case ISD::CTPOP:
25419 case ISD::PARITY:
25420 if (SDValue Result = LowerCTPOP_PARITY(Op: SDValue(N, 0), DAG))
25421 Results.push_back(Elt: Result);
25422 return;
25423 case AArch64ISD::SADDV:
25424 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::SADDV);
25425 return;
25426 case AArch64ISD::UADDV:
25427 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::UADDV);
25428 return;
25429 case AArch64ISD::SMINV:
25430 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMIN, AcrossOp: AArch64ISD::SMINV);
25431 return;
25432 case AArch64ISD::UMINV:
25433 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMIN, AcrossOp: AArch64ISD::UMINV);
25434 return;
25435 case AArch64ISD::SMAXV:
25436 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMAX, AcrossOp: AArch64ISD::SMAXV);
25437 return;
25438 case AArch64ISD::UMAXV:
25439 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMAX, AcrossOp: AArch64ISD::UMAXV);
25440 return;
25441 case ISD::MULHS:
25442 if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType()))
25443 Results.push_back(
25444 Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHS_PRED));
25445 return;
25446 case ISD::MULHU:
25447 if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType()))
25448 Results.push_back(
25449 Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHU_PRED));
25450 return;
25451 case ISD::FP_TO_UINT:
25452 case ISD::FP_TO_SINT:
25453 case ISD::STRICT_FP_TO_SINT:
25454 case ISD::STRICT_FP_TO_UINT:
25455 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25456 // Let normal code take care of it by not adding anything to Results.
25457 return;
25458 case ISD::ATOMIC_CMP_SWAP:
25459 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25460 return;
25461 case ISD::ATOMIC_LOAD_CLR:
25462 assert(N->getValueType(0) != MVT::i128 &&
25463 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25464 break;
25465 case ISD::ATOMIC_LOAD_AND:
25466 case ISD::ATOMIC_LOAD_OR:
25467 case ISD::ATOMIC_SWAP: {
25468 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25469 "Expected 128-bit atomicrmw.");
25470 // These need custom type legalisation so we go directly to instruction.
25471 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25472 return;
25473 }
25474 case ISD::ATOMIC_LOAD:
25475 case ISD::LOAD: {
25476 MemSDNode *LoadNode = cast<MemSDNode>(Val: N);
25477 EVT MemVT = LoadNode->getMemoryVT();
25478 // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
25479 // targets.
25480 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25481 MemVT.getSizeInBits() == 256u &&
25482 (MemVT.getScalarSizeInBits() == 8u ||
25483 MemVT.getScalarSizeInBits() == 16u ||
25484 MemVT.getScalarSizeInBits() == 32u ||
25485 MemVT.getScalarSizeInBits() == 64u)) {
25486
25487 SDValue Result = DAG.getMemIntrinsicNode(
25488 AArch64ISD::LDNP, SDLoc(N),
25489 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25490 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25491 MVT::Other}),
25492 {LoadNode->getChain(), LoadNode->getBasePtr()},
25493 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25494
25495 SDValue Pair = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(N), VT: MemVT,
25496 N1: Result.getValue(R: 0), N2: Result.getValue(R: 1));
25497 Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */});
25498 return;
25499 }
25500
25501 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
25502 LoadNode->getMemoryVT() != MVT::i128) {
25503 // Loads that are neither volatile nor atomic (or are not i128) are handled
25504 // by common code and optimized later in AArch64's load/store optimizer.
25505 return;
25506 }
25507
25508 if (SDValue(N, 0).getValueType() == MVT::i128) {
25509 auto *AN = dyn_cast<AtomicSDNode>(Val: LoadNode);
25510 bool isLoadAcquire =
25511 AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
25512 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
25513
25514 if (isLoadAcquire)
25515 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
25516
25517 SDValue Result = DAG.getMemIntrinsicNode(
25518 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25519 {LoadNode->getChain(), LoadNode->getBasePtr()},
25520 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25521
25522 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
25523
25524 SDValue Pair =
25525 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
25526 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
25527 Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */});
25528 }
25529 return;
25530 }
25531 case ISD::EXTRACT_SUBVECTOR:
25532 ReplaceExtractSubVectorResults(N, Results, DAG);
25533 return;
25534 case ISD::INSERT_SUBVECTOR:
25535 case ISD::CONCAT_VECTORS:
25536 // Custom lowering has been requested for INSERT_SUBVECTOR and
25537 // CONCAT_VECTORS -- but delegate to common code for result type
25538 // legalisation
25539 return;
25540 case ISD::INTRINSIC_WO_CHAIN: {
25541 EVT VT = N->getValueType(ResNo: 0);
25542 assert((VT == MVT::i8 || VT == MVT::i16) &&
25543 "custom lowering for unexpected type");
25544
25545 Intrinsic::ID IntID =
25546 static_cast<Intrinsic::ID>(N->getConstantOperandVal(Num: 0));
25547 switch (IntID) {
25548 default:
25549 return;
25550 case Intrinsic::aarch64_sve_clasta_n: {
25551 SDLoc DL(N);
25552 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25553 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
25554 N->getOperand(1), Op2, N->getOperand(3));
25555 Results.push_back(Elt: DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25556 return;
25557 }
25558 case Intrinsic::aarch64_sve_clastb_n: {
25559 SDLoc DL(N);
25560 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25561 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
25562 N->getOperand(1), Op2, N->getOperand(3));
25563 Results.push_back(Elt: DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25564 return;
25565 }
25566 case Intrinsic::aarch64_sve_lasta: {
25567 SDLoc DL(N);
25568 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
25569 N->getOperand(1), N->getOperand(2));
25570 Results.push_back(Elt: DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25571 return;
25572 }
25573 case Intrinsic::aarch64_sve_lastb: {
25574 SDLoc DL(N);
25575 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
25576 N->getOperand(1), N->getOperand(2));
25577 Results.push_back(Elt: DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25578 return;
25579 }
25580 }
25581 }
25582 case ISD::READ_REGISTER: {
25583 SDLoc DL(N);
25584 assert(N->getValueType(0) == MVT::i128 &&
25585 "READ_REGISTER custom lowering is only for 128-bit sysregs");
25586 SDValue Chain = N->getOperand(Num: 0);
25587 SDValue SysRegName = N->getOperand(Num: 1);
25588
25589 SDValue Result = DAG.getNode(
25590 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25591 Chain, SysRegName);
25592
25593 // Sysregs are not endian. Result.getValue(0) always contains the lower half
25594 // of the 128-bit System Register value.
25595 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25596 Result.getValue(0), Result.getValue(1));
25597 Results.push_back(Elt: Pair);
25598 Results.push_back(Elt: Result.getValue(R: 2)); // Chain
25599 return;
25600 }
25601 }
25602}
25603
25604bool AArch64TargetLowering::useLoadStackGuardNode() const {
25605 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
25606 return TargetLowering::useLoadStackGuardNode();
25607 return true;
25608}
25609
25610unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
25611 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
25612 // reciprocal if there are three or more FDIVs.
25613 return 3;
25614}
25615
25616TargetLoweringBase::LegalizeTypeAction
25617AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
25618 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 and v1f32
25619 // to v8i8, v4i16, v2i32 and v2f32 instead of promoting them.
25620 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
25621 VT == MVT::v1f32)
25622 return TypeWidenVector;
25623
25624 return TargetLoweringBase::getPreferredVectorAction(VT);
25625}
25626
25627// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
25628// provided the address is 16-byte aligned.
25629bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
25630 if (!Subtarget->hasLSE2())
25631 return false;
25632
25633 if (auto LI = dyn_cast<LoadInst>(Val: I))
25634 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25635 LI->getAlign() >= Align(16);
25636
25637 if (auto SI = dyn_cast<StoreInst>(Val: I))
25638 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25639 SI->getAlign() >= Align(16);
25640
25641 return false;
25642}
25643
25644bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
25645 if (!Subtarget->hasLSE128())
25646 return false;
25647
25648 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
25649 // will clobber the two registers.
25650 if (const auto *SI = dyn_cast<StoreInst>(Val: I))
25651 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25652 SI->getAlign() >= Align(16) &&
25653 (SI->getOrdering() == AtomicOrdering::Release ||
25654 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
25655
25656 if (const auto *RMW = dyn_cast<AtomicRMWInst>(Val: I))
25657 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25658 RMW->getAlign() >= Align(16) &&
25659 (RMW->getOperation() == AtomicRMWInst::Xchg ||
25660 RMW->getOperation() == AtomicRMWInst::And ||
25661 RMW->getOperation() == AtomicRMWInst::Or);
25662
25663 return false;
25664}
25665
25666bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
25667 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
25668 return false;
25669
25670 if (auto LI = dyn_cast<LoadInst>(Val: I))
25671 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25672 LI->getAlign() >= Align(16) &&
25673 LI->getOrdering() == AtomicOrdering::Acquire;
25674
25675 if (auto SI = dyn_cast<StoreInst>(Val: I))
25676 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25677 SI->getAlign() >= Align(16) &&
25678 SI->getOrdering() == AtomicOrdering::Release;
25679
25680 return false;
25681}
25682
25683bool AArch64TargetLowering::shouldInsertFencesForAtomic(
25684 const Instruction *I) const {
25685 if (isOpSuitableForRCPC3(I))
25686 return false;
25687 if (isOpSuitableForLSE128(I))
25688 return false;
25689 if (isOpSuitableForLDPSTP(I))
25690 return true;
25691 return false;
25692}
25693
25694bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
25695 const Instruction *I) const {
25696 // Store-Release instructions only provide seq_cst guarantees when paired with
25697 // Load-Acquire instructions. MSVC CRT does not use these instructions to
25698 // implement seq_cst loads and stores, so we need additional explicit fences
25699 // after memory writes.
25700 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25701 return false;
25702
25703 switch (I->getOpcode()) {
25704 default:
25705 return false;
25706 case Instruction::AtomicCmpXchg:
25707 return cast<AtomicCmpXchgInst>(Val: I)->getSuccessOrdering() ==
25708 AtomicOrdering::SequentiallyConsistent;
25709 case Instruction::AtomicRMW:
25710 return cast<AtomicRMWInst>(Val: I)->getOrdering() ==
25711 AtomicOrdering::SequentiallyConsistent;
25712 case Instruction::Store:
25713 return cast<StoreInst>(Val: I)->getOrdering() ==
25714 AtomicOrdering::SequentiallyConsistent;
25715 }
25716}
25717
25718 // Loads and stores less than 128 bits are already atomic; ones above that
25719// are doomed anyway, so defer to the default libcall and blame the OS when
25720// things go wrong.
25721TargetLoweringBase::AtomicExpansionKind
25722AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
25723 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25724 if (Size != 128)
25725 return AtomicExpansionKind::None;
25726 if (isOpSuitableForRCPC3(I: SI))
25727 return AtomicExpansionKind::None;
25728 if (isOpSuitableForLSE128(I: SI))
25729 return AtomicExpansionKind::Expand;
25730 if (isOpSuitableForLDPSTP(I: SI))
25731 return AtomicExpansionKind::None;
25732 return AtomicExpansionKind::Expand;
25733}
25734
25735 // Loads and stores less than 128 bits are already atomic; ones above that
25736// are doomed anyway, so defer to the default libcall and blame the OS when
25737// things go wrong.
25738TargetLowering::AtomicExpansionKind
25739AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
25740 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25741
25742 if (Size != 128)
25743 return AtomicExpansionKind::None;
25744 if (isOpSuitableForRCPC3(I: LI))
25745 return AtomicExpansionKind::None;
25746 // No LSE128 loads
25747 if (isOpSuitableForLDPSTP(I: LI))
25748 return AtomicExpansionKind::None;
25749
25750 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25751 // implement atomicrmw without spilling. If the target address is also on the
25752 // stack and close enough to the spill slot, this can lead to a situation
25753 // where the monitor always gets cleared and the atomic operation can never
25754 // succeed. So at -O0 lower this operation to a CAS loop.
25755 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25756 return AtomicExpansionKind::CmpXChg;
25757
25758 // Using CAS for an atomic load has a better chance of succeeding under high
25759 // contention situations. So use it if available.
25760 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25761 : AtomicExpansionKind::LLSC;
25762}
25763
25764// The "default" for integer RMW operations is to expand to an LL/SC loop.
25765// However, with the LSE instructions (or outline-atomics mode, which provides
25766 // library routines in place of the LSE instructions), we can directly emit many
25767// operations instead.
25768//
25769// Floating-point operations are always emitted to a cmpxchg loop, because they
25770// may trigger a trap which aborts an LLSC sequence.
25771TargetLowering::AtomicExpansionKind
25772AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
25773 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25774 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25775
25776 if (AI->isFloatingPointOperation())
25777 return AtomicExpansionKind::CmpXChg;
25778
25779 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25780 (AI->getOperation() == AtomicRMWInst::Xchg ||
25781 AI->getOperation() == AtomicRMWInst::Or ||
25782 AI->getOperation() == AtomicRMWInst::And);
25783 if (CanUseLSE128)
25784 return AtomicExpansionKind::None;
25785
25786 // Nand is not supported in LSE.
25787 // Leave 128 bits to LLSC or CmpXChg.
25788 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25789 if (Subtarget->hasLSE())
25790 return AtomicExpansionKind::None;
25791 if (Subtarget->outlineAtomics()) {
25792 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
25793 // Don't outline them unless
25794 // (1) high level <atomic> support approved:
25795 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25796 // (2) low level libgcc and compiler-rt support implemented by:
25797 // min/max outline atomics helpers
25798 if (AI->getOperation() != AtomicRMWInst::Min &&
25799 AI->getOperation() != AtomicRMWInst::Max &&
25800 AI->getOperation() != AtomicRMWInst::UMin &&
25801 AI->getOperation() != AtomicRMWInst::UMax) {
25802 return AtomicExpansionKind::None;
25803 }
25804 }
25805 }
25806
25807 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25808 // implement atomicrmw without spilling. If the target address is also on the
25809 // stack and close enough to the spill slot, this can lead to a situation
25810 // where the monitor always gets cleared and the atomic operation can never
25811 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
25812 // we have a single CAS instruction that can replace the loop.
25813 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
25814 Subtarget->hasLSE())
25815 return AtomicExpansionKind::CmpXChg;
25816
25817 return AtomicExpansionKind::LLSC;
25818}
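
// Rough sketch of the resulting expansions (illustrative, not exhaustive; the
// exact choice also depends on ordering and subtarget features):
//   atomicrmw add ptr %p, i32 1 monotonic
//     with +lse:    left intact (None) and later selected to a single LDADD.
//     without LSE:  expanded by AtomicExpandPass to an LL/SC loop built from
//                   LDXR/ADD/STXR, or to a CAS loop at -O0.
//   atomicrmw fadd ptr %p, float 1.0 seq_cst
//     always expanded to a cmpxchg loop, per the rationale above.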
25819
25820TargetLowering::AtomicExpansionKind
25821AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
25822 AtomicCmpXchgInst *AI) const {
25823 // If subtarget has LSE, leave cmpxchg intact for codegen.
25824 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
25825 return AtomicExpansionKind::None;
25826 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25827 // implement cmpxchg without spilling. If the address being exchanged is also
25828 // on the stack and close enough to the spill slot, this can lead to a
25829 // situation where the monitor always gets cleared and the atomic operation
25830 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
25831 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25832 return AtomicExpansionKind::None;
25833
25834 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
25835 // it.
25836 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
25837 if (Size > 64)
25838 return AtomicExpansionKind::None;
25839
25840 return AtomicExpansionKind::LLSC;
25841}
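
// For example (illustrative): a 32-bit "cmpxchg ptr %p, i32 %old, i32 %new
// acq_rel monotonic" is left intact when LSE or outline-atomics is available
// and becomes a CAS instruction or library call; otherwise, above -O0,
// AtomicExpandPass rewrites it into an explicit load-exclusive / compare /
// store-exclusive loop using the emitLoadLinked and emitStoreConditional
// hooks below.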
25842
25843Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
25844 Type *ValueTy, Value *Addr,
25845 AtomicOrdering Ord) const {
25846 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25847 bool IsAcquire = isAcquireOrStronger(AO: Ord);
25848
  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
  // intrinsic must return {i64, i64} and we have to recombine them into a
  // single i128 here.
25852 if (ValueTy->getPrimitiveSizeInBits() == 128) {
25853 Intrinsic::ID Int =
25854 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
25855 Function *Ldxr = Intrinsic::getDeclaration(M, id: Int);
25856
25857 Value *LoHi = Builder.CreateCall(Callee: Ldxr, Args: Addr, Name: "lohi");
25858
25859 Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
25860 Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
25861 Lo = Builder.CreateZExt(V: Lo, DestTy: ValueTy, Name: "lo64");
25862 Hi = Builder.CreateZExt(V: Hi, DestTy: ValueTy, Name: "hi64");
25863 return Builder.CreateOr(
25864 LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: ValueTy, V: 64)), Name: "val64");
25865 }
25866
25867 Type *Tys[] = { Addr->getType() };
25868 Intrinsic::ID Int =
25869 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
25870 Function *Ldxr = Intrinsic::getDeclaration(M, id: Int, Tys);
25871
25872 const DataLayout &DL = M->getDataLayout();
25873 IntegerType *IntEltTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: ValueTy));
25874 CallInst *CI = Builder.CreateCall(Callee: Ldxr, Args: Addr);
25875 CI->addParamAttr(
25876 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
25877 Value *Trunc = Builder.CreateTrunc(V: CI, DestTy: IntEltTy);
25878
25879 return Builder.CreateBitCast(V: Trunc, DestTy: ValueTy);
25880}
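
// Illustrative IR produced for a 128-bit acquire load-linked (sketch; value
// names roughly mirror the builder names used above):
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo = extractvalue { i64, i64 } %lohi, 0
//   %hi = extractvalue { i64, i64 } %lohi, 1
//   %lo64 = zext i64 %lo to i128
//   %hi64 = zext i64 %hi to i128
//   %shifted = shl i128 %hi64, 64
//   %val64 = or i128 %lo64, %shifted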
25881
25882void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
25883 IRBuilderBase &Builder) const {
25884 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25885 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
25886}
25887
25888Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
25889 Value *Val, Value *Addr,
25890 AtomicOrdering Ord) const {
25891 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25892 bool IsRelease = isReleaseOrStronger(AO: Ord);
25893
25894 // Since the intrinsics must have legal type, the i128 intrinsics take two
25895 // parameters: "i64, i64". We must marshal Val into the appropriate form
25896 // before the call.
25897 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
25898 Intrinsic::ID Int =
25899 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
25900 Function *Stxr = Intrinsic::getDeclaration(M, id: Int);
25901 Type *Int64Ty = Type::getInt64Ty(C&: M->getContext());
25902
25903 Value *Lo = Builder.CreateTrunc(V: Val, DestTy: Int64Ty, Name: "lo");
25904 Value *Hi = Builder.CreateTrunc(V: Builder.CreateLShr(LHS: Val, RHS: 64), DestTy: Int64Ty, Name: "hi");
25905 return Builder.CreateCall(Callee: Stxr, Args: {Lo, Hi, Addr});
25906 }
25907
25908 Intrinsic::ID Int =
25909 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
25910 Type *Tys[] = { Addr->getType() };
25911 Function *Stxr = Intrinsic::getDeclaration(M, id: Int, Tys);
25912
25913 const DataLayout &DL = M->getDataLayout();
25914 IntegerType *IntValTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: Val->getType()));
25915 Val = Builder.CreateBitCast(V: Val, DestTy: IntValTy);
25916
25917 CallInst *CI = Builder.CreateCall(
25918 Callee: Stxr, Args: {Builder.CreateZExtOrBitCast(
25919 V: Val, DestTy: Stxr->getFunctionType()->getParamType(i: 0)),
25920 Addr});
25921 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
25922 Attribute::ElementType, Val->getType()));
25923 return CI;
25924}
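
// Illustrative IR for the 128-bit release case (sketch; names are arbitrary):
//   %lo = trunc i128 %val to i64
//   %shifted = lshr i128 %val, 64
//   %hi = trunc i128 %shifted to i64
//   %status = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
// A non-zero %status indicates the store-exclusive failed and the enclosing
// LL/SC loop must retry.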
25925
25926bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
25927 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
25928 const DataLayout &DL) const {
25929 if (!Ty->isArrayTy()) {
25930 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
25931 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
25932 }
25933
25934 // All non aggregate members of the type must have the same type
25935 SmallVector<EVT> ValueVTs;
25936 ComputeValueVTs(TLI: *this, DL, Ty, ValueVTs);
25937 return all_equal(Range&: ValueVTs);
25938}
25939
25940bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
25941 EVT) const {
25942 return false;
25943}
25944
static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
  Function *ThreadPointerFunc =
      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
  return IRB.CreatePointerCast(
      IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
                             Offset),
      IRB.getPtrTy(0));
}
25954
25955Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
25956 // Android provides a fixed TLS slot for the stack cookie. See the definition
25957 // of TLS_SLOT_STACK_GUARD in
25958 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
25959 if (Subtarget->isTargetAndroid())
25960 return UseTlsOffset(IRB, Offset: 0x28);
25961
25962 // Fuchsia is similar.
25963 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
25964 if (Subtarget->isTargetFuchsia())
25965 return UseTlsOffset(IRB, Offset: -0x10);
25966
25967 return TargetLowering::getIRStackGuard(IRB);
25968}
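
// For example, on Android the helper above yields IR roughly equivalent to
// (sketch):
//   %tp = call ptr @llvm.thread.pointer()
//   %guard_slot = getelementptr i8, ptr %tp, i32 40  ; TLS_SLOT_STACK_GUARD
// which the stack-protector code then loads the cookie from.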
25969
25970void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
25971 // MSVC CRT provides functionalities for stack protection.
25972 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
25973 // MSVC CRT has a global variable holding security cookie.
25974 M.getOrInsertGlobal(Name: "__security_cookie",
25975 Ty: PointerType::getUnqual(C&: M.getContext()));
25976
25977 // MSVC CRT has a function to validate security cookie.
25978 FunctionCallee SecurityCheckCookie =
25979 M.getOrInsertFunction(Name: Subtarget->getSecurityCheckCookieName(),
25980 RetTy: Type::getVoidTy(C&: M.getContext()),
25981 Args: PointerType::getUnqual(C&: M.getContext()));
25982 if (Function *F = dyn_cast<Function>(Val: SecurityCheckCookie.getCallee())) {
25983 F->setCallingConv(CallingConv::Win64);
25984 F->addParamAttr(0, Attribute::AttrKind::InReg);
25985 }
25986 return;
25987 }
25988 TargetLowering::insertSSPDeclarations(M);
25989}
25990
25991Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
25992 // MSVC CRT has a global variable holding security cookie.
25993 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25994 return M.getGlobalVariable(Name: "__security_cookie");
25995 return TargetLowering::getSDagStackGuard(M);
25996}
25997
25998Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
25999 // MSVC CRT has a function to validate security cookie.
26000 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26001 return M.getFunction(Name: Subtarget->getSecurityCheckCookieName());
26002 return TargetLowering::getSSPStackGuardCheck(M);
26003}
26004
26005Value *
26006AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
26007 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26008 // definition of TLS_SLOT_SAFESTACK in
26009 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26010 if (Subtarget->isTargetAndroid())
26011 return UseTlsOffset(IRB, Offset: 0x48);
26012
26013 // Fuchsia is similar.
26014 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26015 if (Subtarget->isTargetFuchsia())
26016 return UseTlsOffset(IRB, Offset: -0x8);
26017
26018 return TargetLowering::getSafeStackPointerLocation(IRB);
26019}
26020
26021bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
26022 const Instruction &AndI) const {
  // Only sink 'and' mask to cmp use block if it is masking a single bit, since
  // this likely allows the and/cmp/br to be folded into a single tbz
  // instruction. It may be beneficial to sink in other cases, but we would
  // have to check that the cmp would not get folded into the br to form a cbz
  // for these to be beneficial.
26028 ConstantInt* Mask = dyn_cast<ConstantInt>(Val: AndI.getOperand(i: 1));
26029 if (!Mask)
26030 return false;
26031 return Mask->getValue().isPowerOf2();
26032}
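
// For example (illustrative), the IR sequence
//   %a = and i32 %x, 8
//   %c = icmp eq i32 %a, 0
//   br i1 %c, label %t, label %f
// can be selected to a single "tbz w0, #3, ..." because the mask tests exactly
// one bit; a non-power-of-two mask would typically need a separate AND
// followed by a cbz/cbnz.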
26033
26034bool AArch64TargetLowering::
26035 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26036 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
26037 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26038 SelectionDAG &DAG) const {
26039 // Does baseline recommend not to perform the fold by default?
26040 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26041 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26042 return false;
26043 // Else, if this is a vector shift, prefer 'shl'.
26044 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26045}
26046
26047TargetLowering::ShiftLegalizationStrategy
26048AArch64TargetLowering::preferredShiftLegalizationStrategy(
26049 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26050 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
26051 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26052 return ShiftLegalizationStrategy::LowerToLibcall;
26053 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
26054 ExpansionFactor);
26055}
26056
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
  AFI->setIsSplitCSR(true);
}
26062
26063void AArch64TargetLowering::insertCopiesSplitCSR(
26064 MachineBasicBlock *Entry,
26065 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26066 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26067 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
26068 if (!IStart)
26069 return;
26070
26071 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26072 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26073 MachineBasicBlock::iterator MBBI = Entry->begin();
26074 for (const MCPhysReg *I = IStart; *I; ++I) {
26075 const TargetRegisterClass *RC = nullptr;
26076 if (AArch64::GPR64RegClass.contains(*I))
26077 RC = &AArch64::GPR64RegClass;
26078 else if (AArch64::FPR64RegClass.contains(*I))
26079 RC = &AArch64::FPR64RegClass;
26080 else
26081 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26082
26083 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
26084 // Create copy from CSR to a virtual register.
26085 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26086 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26087 // nounwind. If we want to generalize this later, we may need to emit
26088 // CFI pseudo-instructions.
26089 assert(Entry->getParent()->getFunction().hasFnAttribute(
26090 Attribute::NoUnwind) &&
26091 "Function should be nounwind in insertCopiesSplitCSR!");
26092 Entry->addLiveIn(PhysReg: *I);
26093 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
26094 .addReg(RegNo: *I);
26095
26096 // Insert the copy-back instructions right before the terminator.
26097 for (auto *Exit : Exits)
26098 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
26099 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
26100 .addReg(RegNo: NewVR);
26101 }
26102}
26103
26104bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
26105 // Integer division on AArch64 is expensive. However, when aggressively
26106 // optimizing for code size, we prefer to use a div instruction, as it is
26107 // usually smaller than the alternative sequence.
26108 // The exception to this is vector division. Since AArch64 doesn't have vector
26109 // integer division, leaving the division as-is is a loss even in terms of
26110 // size, because it will have to be scalarized, while the alternative code
26111 // sequence can be performed in vector form.
26112 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26113 return OptSize && !VT.isVector();
26114}
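
// For example, at minimum size a scalar "udiv i32 %x, 10" is kept as a single
// UDIV instruction, whereas it would otherwise be rewritten into the larger
// multiply-by-magic-constant and shift sequence. Vector divisions are still
// expanded because scalarized UDIV/SDIV would be both bigger and slower.
// (Illustrative summary of the policy above.)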
26115
26116bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
26117 // We want inc-of-add for scalars and sub-of-not for vectors.
26118 return VT.isScalarInteger();
26119}
26120
26121bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
26122 EVT VT) const {
26123 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
26124 // legalize.
26125 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26126 return false;
26127 if (FPVT == MVT::v8bf16)
26128 return false;
26129 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26130}
26131
26132MachineInstr *
26133AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
26134 MachineBasicBlock::instr_iterator &MBBI,
26135 const TargetInstrInfo *TII) const {
26136 assert(MBBI->isCall() && MBBI->getCFIType() &&
26137 "Invalid call instruction for a KCFI check");
26138
26139 switch (MBBI->getOpcode()) {
26140 case AArch64::BLR:
26141 case AArch64::BLRNoIP:
26142 case AArch64::TCRETURNri:
26143 case AArch64::TCRETURNrix16x17:
26144 case AArch64::TCRETURNrix17:
26145 case AArch64::TCRETURNrinotx16:
26146 break;
26147 default:
26148 llvm_unreachable("Unexpected CFI call opcode");
26149 }
26150
26151 MachineOperand &Target = MBBI->getOperand(i: 0);
26152 assert(Target.isReg() && "Invalid target operand for an indirect call");
26153 Target.setIsRenamable(false);
26154
26155 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26156 .addReg(Target.getReg())
26157 .addImm(MBBI->getCFIType())
26158 .getInstr();
26159}
26160
26161bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
26162 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26163}
26164
26165unsigned
26166AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
26167 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26168 return getPointerTy(DL).getSizeInBits();
26169
26170 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26171}
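
// Background for the sizes above (AAPCS64): va_list is the struct
//   { void *__stack; void *__gr_top; void *__vr_top;
//     int __gr_offs; int __vr_offs; }
// i.e. three pointers plus two 32-bit offsets, while Darwin and Windows use a
// simple "char *" va_list that is just one pointer wide.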
26172
26173void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26174 MachineFrameInfo &MFI = MF.getFrameInfo();
26175 // If we have any vulnerable SVE stack objects then the stack protector
26176 // needs to be placed at the top of the SVE stack area, as the SVE locals
26177 // are placed above the other locals, so we allocate it as if it were a
26178 // scalable vector.
26179 // FIXME: It may be worthwhile having a specific interface for this rather
26180 // than doing it here in finalizeLowering.
26181 if (MFI.hasStackProtectorIndex()) {
26182 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26183 if (MFI.getStackID(ObjectIdx: i) == TargetStackID::ScalableVector &&
26184 MFI.getObjectSSPLayout(ObjectIdx: i) != MachineFrameInfo::SSPLK_None) {
26185 MFI.setStackID(ObjectIdx: MFI.getStackProtectorIndex(),
26186 ID: TargetStackID::ScalableVector);
26187 MFI.setObjectAlignment(ObjectIdx: MFI.getStackProtectorIndex(), Alignment: Align(16));
26188 break;
26189 }
26190 }
26191 }
26192 MFI.computeMaxCallFrameSize(MF);
26193 TargetLoweringBase::finalizeLowering(MF);
26194}
26195
26196// Unlike X86, we let frame lowering assign offsets to all catch objects.
26197bool AArch64TargetLowering::needsFixedCatchObjects() const {
26198 return false;
26199}
26200
26201bool AArch64TargetLowering::shouldLocalize(
26202 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26203 auto &MF = *MI.getMF();
26204 auto &MRI = MF.getRegInfo();
26205 auto maxUses = [](unsigned RematCost) {
26206 // A cost of 1 means remats are basically free.
26207 if (RematCost == 1)
26208 return std::numeric_limits<unsigned>::max();
26209 if (RematCost == 2)
26210 return 2U;
26211
26212 // Remat is too expensive, only sink if there's one user.
26213 if (RematCost > 2)
26214 return 1U;
26215 llvm_unreachable("Unexpected remat cost");
26216 };
26217
26218 unsigned Opc = MI.getOpcode();
26219 switch (Opc) {
26220 case TargetOpcode::G_GLOBAL_VALUE: {
26221 // On Darwin, TLS global vars get selected into function calls, which
    // we don't want localized, as they can get moved into the middle of
26223 // another call sequence.
26224 const GlobalValue &GV = *MI.getOperand(i: 1).getGlobal();
26225 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26226 return false;
26227 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26228 }
26229 case TargetOpcode::G_FCONSTANT:
26230 case TargetOpcode::G_CONSTANT: {
26231 const ConstantInt *CI;
26232 unsigned AdditionalCost = 0;
26233
26234 if (Opc == TargetOpcode::G_CONSTANT)
26235 CI = MI.getOperand(i: 1).getCImm();
26236 else {
26237 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
26238 // We try to estimate cost of 32/64b fpimms, as they'll likely be
26239 // materialized as integers.
26240 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26241 break;
26242 auto APF = MI.getOperand(i: 1).getFPImm()->getValueAPF();
26243 bool OptForSize =
26244 MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize();
26245 if (isFPImmLegal(Imm: APF, VT: EVT::getFloatingPointVT(BitWidth: Ty.getScalarSizeInBits()),
26246 OptForSize))
26247 return true; // Constant should be cheap.
26248 CI =
26249 ConstantInt::get(Context&: MF.getFunction().getContext(), V: APF.bitcastToAPInt());
26250 // FP materialization also costs an extra move, from gpr to fpr.
26251 AdditionalCost = 1;
26252 }
26253 APInt Imm = CI->getValue();
26254 InstructionCost Cost = TTI->getIntImmCost(
26255 Imm, Ty: CI->getType(), CostKind: TargetTransformInfo::TCK_CodeSize);
26256 assert(Cost.isValid() && "Expected a valid imm cost");
26257
26258 unsigned RematCost = *Cost.getValue();
26259 RematCost += AdditionalCost;
26260 Register Reg = MI.getOperand(i: 0).getReg();
26261 unsigned MaxUses = maxUses(RematCost);
26262 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26263 if (MaxUses == std::numeric_limits<unsigned>::max())
26264 --MaxUses;
26265 return MRI.hasAtMostUserInstrs(Reg, MaxUsers: MaxUses);
26266 }
26267 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26268 // localizable.
26269 case AArch64::ADRP:
26270 case AArch64::G_ADD_LOW:
26271 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26272 case TargetOpcode::G_PTR_ADD:
26273 return true;
26274 default:
26275 break;
26276 }
26277 return TargetLoweringBase::shouldLocalize(MI, TTI);
26278}
26279
26280bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
26281 if (Inst.getType()->isScalableTy())
26282 return true;
26283
26284 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26285 if (Inst.getOperand(i)->getType()->isScalableTy())
26286 return true;
26287
26288 if (const AllocaInst *AI = dyn_cast<AllocaInst>(Val: &Inst)) {
26289 if (AI->getAllocatedType()->isScalableTy())
26290 return true;
26291 }
26292
26293 // Checks to allow the use of SME instructions
26294 if (auto *Base = dyn_cast<CallBase>(Val: &Inst)) {
26295 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26296 auto CalleeAttrs = SMEAttrs(*Base);
26297 if (CallerAttrs.requiresSMChange(Callee: CalleeAttrs) ||
26298 CallerAttrs.requiresLazySave(Callee: CalleeAttrs) ||
26299 CallerAttrs.requiresPreservingZT0(Callee: CalleeAttrs))
26300 return true;
26301 }
26302 return false;
26303}
26304
26305// Return the largest legal scalable vector type that matches VT's element type.
26306static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
26307 assert(VT.isFixedLengthVector() &&
26308 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26309 "Expected legal fixed length vector!");
26310 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26311 default:
26312 llvm_unreachable("unexpected element type for SVE container");
26313 case MVT::i8:
26314 return EVT(MVT::nxv16i8);
26315 case MVT::i16:
26316 return EVT(MVT::nxv8i16);
26317 case MVT::i32:
26318 return EVT(MVT::nxv4i32);
26319 case MVT::i64:
26320 return EVT(MVT::nxv2i64);
26321 case MVT::bf16:
26322 return EVT(MVT::nxv8bf16);
26323 case MVT::f16:
26324 return EVT(MVT::nxv8f16);
26325 case MVT::f32:
26326 return EVT(MVT::nxv4f32);
26327 case MVT::f64:
26328 return EVT(MVT::nxv2f64);
26329 }
26330}
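
// For example, a legal fixed-length v4i32 or v8i32 maps to the packed
// container nxv4i32 and v8f16 maps to nxv8f16: only the element type matters
// here; the original fixed element count is enforced separately via
// predication.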
26331
26332// Return a PTRUE with active lanes corresponding to the extent of VT.
26333static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
26334 EVT VT) {
26335 assert(VT.isFixedLengthVector() &&
26336 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26337 "Expected legal fixed length vector!");
26338
26339 std::optional<unsigned> PgPattern =
26340 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
26341 assert(PgPattern && "Unexpected element count for SVE predicate");
26342
26343 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26344 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26345 // variants of instructions when available.
26346 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26347 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26348 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26349 if (MaxSVESize && MinSVESize == MaxSVESize &&
26350 MaxSVESize == VT.getSizeInBits())
26351 PgPattern = AArch64SVEPredPattern::all;
26352
26353 MVT MaskVT;
26354 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26355 default:
26356 llvm_unreachable("unexpected element type for SVE predicate");
26357 case MVT::i8:
26358 MaskVT = MVT::nxv16i1;
26359 break;
26360 case MVT::i16:
26361 case MVT::f16:
26362 case MVT::bf16:
26363 MaskVT = MVT::nxv8i1;
26364 break;
26365 case MVT::i32:
26366 case MVT::f32:
26367 MaskVT = MVT::nxv4i1;
26368 break;
26369 case MVT::i64:
26370 case MVT::f64:
26371 MaskVT = MVT::nxv2i1;
26372 break;
26373 }
26374
26375 return getPTrue(DAG, DL, VT: MaskVT, Pattern: *PgPattern);
26376}
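
// For example, a fixed-length v4i32 yields a PTRUE nxv4i1 with the VL4
// pattern (exactly four active lanes). When the subtarget's minimum and
// maximum SVE vector lengths are equal and match VT's size, the "all" pattern
// is used instead so that unpredicated instruction forms can be selected.
// (Illustrative.)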
26377
26378static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
26379 EVT VT) {
26380 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26381 "Expected legal scalable vector!");
26382 auto PredTy = VT.changeVectorElementType(MVT::i1);
26383 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26384}
26385
26386static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
26387 if (VT.isFixedLengthVector())
26388 return getPredicateForFixedLengthVector(DAG, DL, VT);
26389
26390 return getPredicateForScalableVector(DAG, DL, VT);
26391}
26392
// Grow V to consume an entire SVE register.
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  assert(VT.isScalableVector() &&
         "Expected to convert into a scalable vector!");
  assert(V.getValueType().isFixedLengthVector() &&
         "Expected a fixed length vector operand!");
  SDLoc DL(V);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
}

// Shrink V so it's just big enough to maintain a VT's worth of data.
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  assert(VT.isFixedLengthVector() &&
         "Expected to convert into a fixed length vector!");
  assert(V.getValueType().isScalableVector() &&
         "Expected a scalable vector operand!");
  SDLoc DL(V);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
}
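
// Typical usage of the two helpers above (sketch): a fixed-length v4i32 value
// is widened into an nxv4i32 container via INSERT_SUBVECTOR at index 0, the
// operation is performed on the scalable type under a suitable predicate, and
// the result is narrowed back with EXTRACT_SUBVECTOR at index 0. Lanes beyond
// the original fixed width are undefined and must be masked or ignored by the
// caller.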
26414
26415// Convert all fixed length vector loads larger than NEON to masked_loads.
26416SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26417 SDValue Op, SelectionDAG &DAG) const {
26418 auto Load = cast<LoadSDNode>(Val&: Op);
26419
26420 SDLoc DL(Op);
26421 EVT VT = Op.getValueType();
26422 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26423 EVT LoadVT = ContainerVT;
26424 EVT MemVT = Load->getMemoryVT();
26425
26426 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26427
26428 if (VT.isFloatingPoint()) {
26429 LoadVT = ContainerVT.changeTypeToInteger();
26430 MemVT = MemVT.changeTypeToInteger();
26431 }
26432
26433 SDValue NewLoad = DAG.getMaskedLoad(
26434 VT: LoadVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(), Mask: Pg,
26435 Src0: DAG.getUNDEF(VT: LoadVT), MemVT, MMO: Load->getMemOperand(),
26436 AM: Load->getAddressingMode(), Load->getExtensionType());
26437
26438 SDValue Result = NewLoad;
26439 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26440 EVT ExtendVT = ContainerVT.changeVectorElementType(
26441 EltVT: Load->getMemoryVT().getVectorElementType());
26442
26443 Result = getSVESafeBitCast(VT: ExtendVT, Op: Result, DAG);
26444 Result = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT,
26445 N1: Pg, N2: Result, N3: DAG.getUNDEF(VT: ContainerVT));
26446 } else if (VT.isFloatingPoint()) {
26447 Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Result);
26448 }
26449
26450 Result = convertFromScalableVector(DAG, VT, V: Result);
26451 SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)};
26452 return DAG.getMergeValues(Ops: MergedValues, dl: DL);
26453}
26454
26455static SDValue convertFixedMaskToScalableVector(SDValue Mask,
26456 SelectionDAG &DAG) {
26457 SDLoc DL(Mask);
26458 EVT InVT = Mask.getValueType();
26459 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
26460
26461 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT);
26462
26463 if (ISD::isBuildVectorAllOnes(N: Mask.getNode()))
26464 return Pg;
26465
26466 auto Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Mask);
26467 auto Op2 = DAG.getConstant(Val: 0, DL, VT: ContainerVT);
26468
26469 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: Pg.getValueType(),
26470 Ops: {Pg, Op1, Op2, DAG.getCondCode(Cond: ISD::SETNE)});
26471}
26472
// Lower fixed length vector masked loads larger than NEON to SVE masked loads.
26474SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
26475 SDValue Op, SelectionDAG &DAG) const {
26476 auto Load = cast<MaskedLoadSDNode>(Val&: Op);
26477
26478 SDLoc DL(Op);
26479 EVT VT = Op.getValueType();
26480 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26481
26482 SDValue Mask = Load->getMask();
  // If this is an extending load and the mask type is not the same as the
  // load's type then we have to extend the mask type.
26485 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
26486 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
26487 "Incorrect mask type");
26488 Mask = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: Mask);
26489 }
26490 Mask = convertFixedMaskToScalableVector(Mask, DAG);
26491
26492 SDValue PassThru;
26493 bool IsPassThruZeroOrUndef = false;
26494
26495 if (Load->getPassThru()->isUndef()) {
26496 PassThru = DAG.getUNDEF(VT: ContainerVT);
26497 IsPassThruZeroOrUndef = true;
26498 } else {
26499 if (ContainerVT.isInteger())
26500 PassThru = DAG.getConstant(Val: 0, DL, VT: ContainerVT);
26501 else
26502 PassThru = DAG.getConstantFP(Val: 0, DL, VT: ContainerVT);
26503 if (isZerosVector(N: Load->getPassThru().getNode()))
26504 IsPassThruZeroOrUndef = true;
26505 }
26506
26507 SDValue NewLoad = DAG.getMaskedLoad(
26508 VT: ContainerVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(),
26509 Mask, Src0: PassThru, MemVT: Load->getMemoryVT(), MMO: Load->getMemOperand(),
26510 AM: Load->getAddressingMode(), Load->getExtensionType());
26511
26512 SDValue Result = NewLoad;
26513 if (!IsPassThruZeroOrUndef) {
26514 SDValue OldPassThru =
26515 convertToScalableVector(DAG, VT: ContainerVT, V: Load->getPassThru());
26516 Result = DAG.getSelect(DL, VT: ContainerVT, Cond: Mask, LHS: Result, RHS: OldPassThru);
26517 }
26518
26519 Result = convertFromScalableVector(DAG, VT, V: Result);
26520 SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)};
26521 return DAG.getMergeValues(Ops: MergedValues, dl: DL);
26522}
26523
26524// Convert all fixed length vector stores larger than NEON to masked_stores.
26525SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
26526 SDValue Op, SelectionDAG &DAG) const {
26527 auto Store = cast<StoreSDNode>(Val&: Op);
26528
26529 SDLoc DL(Op);
26530 EVT VT = Store->getValue().getValueType();
26531 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26532 EVT MemVT = Store->getMemoryVT();
26533
26534 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26535 auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue());
26536
26537 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
26538 EVT TruncVT = ContainerVT.changeVectorElementType(
26539 EltVT: Store->getMemoryVT().getVectorElementType());
26540 MemVT = MemVT.changeTypeToInteger();
26541 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
26542 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
26543 DAG.getUNDEF(TruncVT));
26544 NewValue =
26545 getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG);
26546 } else if (VT.isFloatingPoint()) {
26547 MemVT = MemVT.changeTypeToInteger();
26548 NewValue =
26549 getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG);
26550 }
26551
26552 return DAG.getMaskedStore(Chain: Store->getChain(), dl: DL, Val: NewValue,
26553 Base: Store->getBasePtr(), Offset: Store->getOffset(), Mask: Pg, MemVT,
26554 MMO: Store->getMemOperand(), AM: Store->getAddressingMode(),
26555 IsTruncating: Store->isTruncatingStore());
26556}
26557
26558SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
26559 SDValue Op, SelectionDAG &DAG) const {
26560 auto *Store = cast<MaskedStoreSDNode>(Val&: Op);
26561
26562 SDLoc DL(Op);
26563 EVT VT = Store->getValue().getValueType();
26564 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26565
26566 auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue());
26567 SDValue Mask = convertFixedMaskToScalableVector(Mask: Store->getMask(), DAG);
26568
26569 return DAG.getMaskedStore(
26570 Chain: Store->getChain(), dl: DL, Val: NewValue, Base: Store->getBasePtr(), Offset: Store->getOffset(),
26571 Mask, MemVT: Store->getMemoryVT(), MMO: Store->getMemOperand(),
26572 AM: Store->getAddressingMode(), IsTruncating: Store->isTruncatingStore());
26573}
26574
26575SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
26576 SDValue Op, SelectionDAG &DAG) const {
26577 SDLoc dl(Op);
26578 EVT VT = Op.getValueType();
26579 EVT EltVT = VT.getVectorElementType();
26580
26581 bool Signed = Op.getOpcode() == ISD::SDIV;
26582 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
26583
26584 bool Negated;
26585 uint64_t SplatVal;
26586 if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) {
26587 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26588 SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
26589 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
26590
26591 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL&: dl, VT);
26592 SDValue Res =
26593 DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: dl, VT: ContainerVT, N1: Pg, N2: Op1, N3: Op2);
26594 if (Negated)
26595 Res = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: ContainerVT,
26596 N1: DAG.getConstant(Val: 0, DL: dl, VT: ContainerVT), N2: Res);
26597
26598 return convertFromScalableVector(DAG, VT, V: Res);
26599 }
26600
26601 // Scalable vector i32/i64 DIV is supported.
26602 if (EltVT == MVT::i32 || EltVT == MVT::i64)
26603 return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode);
26604
26605 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
26606 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
26607 EVT PromVT = HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext());
26608 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26609
26610 // If the wider type is legal: extend, op, and truncate.
26611 EVT WideVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext());
26612 if (DAG.getTargetLoweringInfo().isTypeLegal(VT: WideVT)) {
26613 SDValue Op0 = DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: WideVT, Operand: Op.getOperand(i: 0));
26614 SDValue Op1 = DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: WideVT, Operand: Op.getOperand(i: 1));
26615 SDValue Div = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WideVT, N1: Op0, N2: Op1);
26616 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Div);
26617 }
26618
26619 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
26620 &ExtendOpcode](SDValue Op) {
26621 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
26622 SDValue IdxHalf =
26623 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
26624 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: HalfVT, N1: Op, N2: IdxZero);
26625 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: HalfVT, N1: Op, N2: IdxHalf);
26626 return std::pair<SDValue, SDValue>(
26627 {DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: PromVT, Operand: Lo),
26628 DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: PromVT, Operand: Hi)});
26629 };
26630
26631 // If wider type is not legal: split, extend, op, trunc and concat.
26632 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(i: 0));
26633 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(i: 1));
26634 SDValue Lo = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromVT, N1: Op0LoExt, N2: Op1LoExt);
26635 SDValue Hi = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromVT, N1: Op0HiExt, N2: Op1HiExt);
26636 SDValue LoTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: HalfVT, Operand: Lo);
26637 SDValue HiTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: HalfVT, Operand: Hi);
26638 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, Ops: {LoTrunc, HiTrunc});
26639}
26640
26641SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
26642 SDValue Op, SelectionDAG &DAG) const {
26643 EVT VT = Op.getValueType();
26644 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26645
26646 SDLoc DL(Op);
26647 SDValue Val = Op.getOperand(i: 0);
26648 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType());
26649 Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val);
26650
26651 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
26652 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
26653
26654 // Repeatedly unpack Val until the result is of the desired element type.
26655 switch (ContainerVT.getSimpleVT().SimpleTy) {
26656 default:
26657 llvm_unreachable("unimplemented container type");
26658 case MVT::nxv16i8:
26659 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
26660 if (VT.getVectorElementType() == MVT::i16)
26661 break;
26662 [[fallthrough]];
26663 case MVT::nxv8i16:
26664 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
26665 if (VT.getVectorElementType() == MVT::i32)
26666 break;
26667 [[fallthrough]];
26668 case MVT::nxv4i32:
26669 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
26670 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
26671 break;
26672 }
26673
26674 return convertFromScalableVector(DAG, VT, V: Val);
26675}
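
// For example, sign-extending a fixed-length v8i8 to v8i32 (sketch): the
// source is placed in an nxv16i8 container and repeatedly unpacked,
//   nxv16i8 --SUNPKLO--> nxv8i16 --SUNPKLO--> nxv4i32,
// stopping once the element type matches the destination's element type.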
26676
26677SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
26678 SDValue Op, SelectionDAG &DAG) const {
26679 EVT VT = Op.getValueType();
26680 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26681
26682 SDLoc DL(Op);
26683 SDValue Val = Op.getOperand(i: 0);
26684 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType());
26685 Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val);
26686
26687 // Repeatedly truncate Val until the result is of the desired element type.
26688 switch (ContainerVT.getSimpleVT().SimpleTy) {
26689 default:
26690 llvm_unreachable("unimplemented container type");
26691 case MVT::nxv2i64:
26692 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
26693 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
26694 if (VT.getVectorElementType() == MVT::i32)
26695 break;
26696 [[fallthrough]];
26697 case MVT::nxv4i32:
26698 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
26699 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
26700 if (VT.getVectorElementType() == MVT::i16)
26701 break;
26702 [[fallthrough]];
26703 case MVT::nxv8i16:
26704 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
26705 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
26706 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
26707 break;
26708 }
26709
26710 return convertFromScalableVector(DAG, VT, V: Val);
26711}
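
// For example, truncating a fixed-length vector held in an nxv2i64 container
// down to i16 elements (sketch): each step bitcasts to the next narrower
// integer container and applies UZP1 with the value as both operands to keep
// the even (low) halves,
//   nxv2i64 -> nxv4i32 -> nxv8i16,
// stopping once the element type matches the destination.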
26712
26713SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
26714 SDValue Op, SelectionDAG &DAG) const {
26715 EVT VT = Op.getValueType();
26716 EVT InVT = Op.getOperand(i: 0).getValueType();
26717 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
26718
26719 SDLoc DL(Op);
26720 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
26721 SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0));
26722
26723 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Op0, N2: Op.getOperand(i: 1));
26724}
26725
26726SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
26727 SDValue Op, SelectionDAG &DAG) const {
26728 EVT VT = Op.getValueType();
26729 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26730
26731 SDLoc DL(Op);
26732 EVT InVT = Op.getOperand(i: 0).getValueType();
26733 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
26734 SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0));
26735
26736 auto ScalableRes = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT, N1: Op0,
26737 N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2));
26738
26739 return convertFromScalableVector(DAG, VT, V: ScalableRes);
26740}
26741
26742// Convert vector operation 'Op' to an equivalent predicated operation whereby
26743// the original operation's type is used to construct a suitable predicate.
26744// NOTE: The results for inactive lanes are undefined.
26745SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
26746 SelectionDAG &DAG,
26747 unsigned NewOp) const {
26748 EVT VT = Op.getValueType();
26749 SDLoc DL(Op);
26750 auto Pg = getPredicateForVector(DAG, DL, VT);
26751
26752 if (VT.isFixedLengthVector()) {
26753 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
26754 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26755
26756 // Create list of operands by converting existing ones to scalable types.
26757 SmallVector<SDValue, 4> Operands = {Pg};
26758 for (const SDValue &V : Op->op_values()) {
26759 if (isa<CondCodeSDNode>(Val: V)) {
26760 Operands.push_back(Elt: V);
26761 continue;
26762 }
26763
26764 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(Val: V)) {
26765 EVT VTArg = VTNode->getVT().getVectorElementType();
26766 EVT NewVTArg = ContainerVT.changeVectorElementType(EltVT: VTArg);
26767 Operands.push_back(Elt: DAG.getValueType(NewVTArg));
26768 continue;
26769 }
26770
26771 assert(isTypeLegal(V.getValueType()) &&
26772 "Expected only legal fixed-width types");
26773 Operands.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V));
26774 }
26775
26776 if (isMergePassthruOpcode(Opc: NewOp))
26777 Operands.push_back(Elt: DAG.getUNDEF(VT: ContainerVT));
26778
26779 auto ScalableRes = DAG.getNode(Opcode: NewOp, DL, VT: ContainerVT, Ops: Operands);
26780 return convertFromScalableVector(DAG, VT, V: ScalableRes);
26781 }
26782
26783 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
26784
26785 SmallVector<SDValue, 4> Operands = {Pg};
26786 for (const SDValue &V : Op->op_values()) {
26787 assert((!V.getValueType().isVector() ||
26788 V.getValueType().isScalableVector()) &&
26789 "Only scalable vectors are supported!");
26790 Operands.push_back(Elt: V);
26791 }
26792
26793 if (isMergePassthruOpcode(Opc: NewOp))
26794 Operands.push_back(Elt: DAG.getUNDEF(VT));
26795
26796 return DAG.getNode(Opcode: NewOp, DL, VT, Ops: Operands, Flags: Op->getFlags());
26797}
26798
26799// If a fixed length vector operation has no side effects when applied to
26800// undefined elements, we can safely use scalable vectors to perform the same
26801// operation without needing to worry about predication.
26802SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
26803 SelectionDAG &DAG) const {
26804 EVT VT = Op.getValueType();
26805 assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
26806 "Only expected to lower fixed length vector operation!");
26807 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26808
26809 // Create list of operands by converting existing ones to scalable types.
26810 SmallVector<SDValue, 4> Ops;
26811 for (const SDValue &V : Op->op_values()) {
26812 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
26813
26814 // Pass through non-vector operands.
26815 if (!V.getValueType().isVector()) {
26816 Ops.push_back(Elt: V);
26817 continue;
26818 }
26819
26820 // "cast" fixed length vector to a scalable vector.
26821 assert(V.getValueType().isFixedLengthVector() &&
26822 isTypeLegal(V.getValueType()) &&
26823 "Only fixed length vectors are supported!");
26824 Ops.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V));
26825 }
26826
26827 auto ScalableRes = DAG.getNode(Opcode: Op.getOpcode(), DL: SDLoc(Op), VT: ContainerVT, Ops);
26828 return convertFromScalableVector(DAG, VT, V: ScalableRes);
26829}
26830
26831SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
26832 SelectionDAG &DAG) const {
26833 SDLoc DL(ScalarOp);
26834 SDValue AccOp = ScalarOp.getOperand(i: 0);
26835 SDValue VecOp = ScalarOp.getOperand(i: 1);
26836 EVT SrcVT = VecOp.getValueType();
26837 EVT ResVT = SrcVT.getVectorElementType();
26838
26839 EVT ContainerVT = SrcVT;
26840 if (SrcVT.isFixedLengthVector()) {
26841 ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
26842 VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp);
26843 }
26844
26845 SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT);
26846 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26847
26848 // Convert operands to Scalable.
26849 AccOp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT,
26850 N1: DAG.getUNDEF(VT: ContainerVT), N2: AccOp, N3: Zero);
26851
26852 // Perform reduction.
26853 SDValue Rdx = DAG.getNode(Opcode: AArch64ISD::FADDA_PRED, DL, VT: ContainerVT,
26854 N1: Pg, N2: AccOp, N3: VecOp);
26855
26856 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT, N1: Rdx, N2: Zero);
26857}
26858
26859SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
26860 SelectionDAG &DAG) const {
26861 SDLoc DL(ReduceOp);
26862 SDValue Op = ReduceOp.getOperand(i: 0);
26863 EVT OpVT = Op.getValueType();
26864 EVT VT = ReduceOp.getValueType();
26865
26866 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
26867 return SDValue();
26868
26869 SDValue Pg = getPredicateForVector(DAG, DL, VT: OpVT);
26870
26871 switch (ReduceOp.getOpcode()) {
26872 default:
26873 return SDValue();
26874 case ISD::VECREDUCE_OR:
26875 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
26876 // The predicate can be 'Op' because
26877 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
26878 return getPTest(DAG, VT, Pg: Op, Op, Cond: AArch64CC::ANY_ACTIVE);
26879 else
26880 return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::ANY_ACTIVE);
26881 case ISD::VECREDUCE_AND: {
26882 Op = DAG.getNode(Opcode: ISD::XOR, DL, VT: OpVT, N1: Op, N2: Pg);
26883 return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::NONE_ACTIVE);
26884 }
26885 case ISD::VECREDUCE_XOR: {
26886 SDValue ID =
26887 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
26888 if (OpVT == MVT::nxv1i1) {
26889 // Emulate a CNTP on .Q using .D and a different governing predicate.
26890 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
26891 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
26892 }
26893 SDValue Cntp =
26894 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
26895 return DAG.getAnyExtOrTrunc(Op: Cntp, DL, VT);
26896 }
26897 }
26898
26899 return SDValue();
26900}
26901
26902SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
26903 SDValue ScalarOp,
26904 SelectionDAG &DAG) const {
26905 SDLoc DL(ScalarOp);
26906 SDValue VecOp = ScalarOp.getOperand(i: 0);
26907 EVT SrcVT = VecOp.getValueType();
26908
26909 if (useSVEForFixedLengthVectorVT(
26910 VT: SrcVT,
26911 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
26912 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
26913 VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp);
26914 }
26915
26916 // UADDV always returns an i64 result.
26917 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
26918 SrcVT.getVectorElementType();
26919 EVT RdxVT = SrcVT;
26920 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
26921 RdxVT = getPackedSVEVectorVT(VT: ResVT);
26922
26923 SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT);
26924 SDValue Rdx = DAG.getNode(Opcode, DL, VT: RdxVT, N1: Pg, N2: VecOp);
26925 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
26926 Rdx, DAG.getConstant(0, DL, MVT::i64));
26927
26928 // The VEC_REDUCE nodes expect an element size result.
26929 if (ResVT != ScalarOp.getValueType())
26930 Res = DAG.getAnyExtOrTrunc(Op: Res, DL, VT: ScalarOp.getValueType());
26931
26932 return Res;
26933}
26934
26935SDValue
26936AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
26937 SelectionDAG &DAG) const {
26938 EVT VT = Op.getValueType();
26939 SDLoc DL(Op);
26940
26941 EVT InVT = Op.getOperand(i: 1).getValueType();
26942 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
26943 SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 1));
26944 SDValue Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 2));
26945
  // Convert the mask to a predicate (NOTE: We don't need to worry about
  // inactive lanes since VSELECT is safe when given undefined elements).
26948 EVT MaskVT = Op.getOperand(i: 0).getValueType();
26949 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskVT);
26950 auto Mask = convertToScalableVector(DAG, VT: MaskContainerVT, V: Op.getOperand(i: 0));
26951 Mask = DAG.getNode(ISD::TRUNCATE, DL,
26952 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
26953
26954 auto ScalableRes = DAG.getNode(Opcode: ISD::VSELECT, DL, VT: ContainerVT,
26955 N1: Mask, N2: Op1, N3: Op2);
26956
26957 return convertFromScalableVector(DAG, VT, V: ScalableRes);
26958}
26959
26960SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
26961 SDValue Op, SelectionDAG &DAG) const {
26962 SDLoc DL(Op);
26963 EVT InVT = Op.getOperand(i: 0).getValueType();
26964 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
26965
26966 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
26967 "Only expected to lower fixed length vector operation!");
26968 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
26969 "Expected integer result of the same bit length as the inputs!");
26970
26971 auto Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
26972 auto Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 1));
26973 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT);
26974
26975 EVT CmpVT = Pg.getValueType();
26976 auto Cmp = DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: CmpVT,
26977 Ops: {Pg, Op1, Op2, Op.getOperand(i: 2)});
26978
26979 EVT PromoteVT = ContainerVT.changeTypeToInteger();
26980 auto Promote = DAG.getBoolExtOrTrunc(Op: Cmp, SL: DL, VT: PromoteVT, OpVT: InVT);
26981 return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Promote);
26982}
26983
26984SDValue
26985AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
26986 SelectionDAG &DAG) const {
26987 SDLoc DL(Op);
26988 auto SrcOp = Op.getOperand(i: 0);
26989 EVT VT = Op.getValueType();
26990 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
26991 EVT ContainerSrcVT =
26992 getContainerForFixedLengthVector(DAG, VT: SrcOp.getValueType());
26993
26994 SrcOp = convertToScalableVector(DAG, VT: ContainerSrcVT, V: SrcOp);
26995 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerDstVT, Operand: SrcOp);
26996 return convertFromScalableVector(DAG, VT, V: Op);
26997}
26998
26999SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27000 SDValue Op, SelectionDAG &DAG) const {
27001 SDLoc DL(Op);
27002 unsigned NumOperands = Op->getNumOperands();
27003
27004 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27005 "Unexpected number of operands in CONCAT_VECTORS");
27006
27007 auto SrcOp1 = Op.getOperand(i: 0);
27008 auto SrcOp2 = Op.getOperand(i: 1);
27009 EVT VT = Op.getValueType();
27010 EVT SrcVT = SrcOp1.getValueType();
27011
27012 if (NumOperands > 2) {
27013 SmallVector<SDValue, 4> Ops;
27014 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
27015 for (unsigned I = 0; I < NumOperands; I += 2)
27016 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: PairVT,
27017 N1: Op->getOperand(Num: I), N2: Op->getOperand(Num: I + 1)));
27018
27019 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops);
27020 }
27021
27022 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27023
27024 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
27025 SrcOp1 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp1);
27026 SrcOp2 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp2);
27027
27028 Op = DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: ContainerVT, N1: Pg, N2: SrcOp1, N3: SrcOp2);
27029
27030 return convertFromScalableVector(DAG, VT, V: Op);
27031}
27032
27033SDValue
27034AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27035 SelectionDAG &DAG) const {
27036 EVT VT = Op.getValueType();
27037 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27038
27039 SDLoc DL(Op);
27040 SDValue Val = Op.getOperand(i: 0);
27041 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27042 EVT SrcVT = Val.getValueType();
27043 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27044 EVT ExtendVT = ContainerVT.changeVectorElementType(
27045 EltVT: SrcVT.getVectorElementType());
27046
27047 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val);
27048 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VT.changeTypeToInteger(), Operand: Val);
27049
27050 Val = convertToScalableVector(DAG, VT: ContainerVT.changeTypeToInteger(), V: Val);
27051 Val = getSVESafeBitCast(VT: ExtendVT, Op: Val, DAG);
27052 Val = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT,
27053 N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: ContainerVT));
27054
27055 return convertFromScalableVector(DAG, VT, V: Val);
27056}
27057
27058SDValue
27059AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27060 SelectionDAG &DAG) const {
27061 EVT VT = Op.getValueType();
27062 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27063
27064 SDLoc DL(Op);
27065 SDValue Val = Op.getOperand(i: 0);
27066 EVT SrcVT = Val.getValueType();
27067 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
27068 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27069 EltVT: VT.getVectorElementType());
27070 SDValue Pg = getPredicateForVector(DAG, DL, VT: RoundVT);
27071
27072 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
27073 Val = DAG.getNode(Opcode: AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, VT: RoundVT, N1: Pg, N2: Val,
27074 N3: Op.getOperand(i: 1), N4: DAG.getUNDEF(VT: RoundVT));
27075 Val = getSVESafeBitCast(VT: ContainerSrcVT.changeTypeToInteger(), Op: Val, DAG);
27076 Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val);
27077
27078 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val);
27079 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
27080}
27081
27082SDValue
27083AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27084 SelectionDAG &DAG) const {
27085 EVT VT = Op.getValueType();
27086 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27087
27088 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27089 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27090 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
27091
27092 SDLoc DL(Op);
27093 SDValue Val = Op.getOperand(i: 0);
27094 EVT SrcVT = Val.getValueType();
27095 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27096 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
27097
27098 if (VT.bitsGE(VT: SrcVT)) {
27099 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27100
27101 Val = DAG.getNode(Opcode: IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27102 VT: VT.changeTypeToInteger(), Operand: Val);
27103
27104 // Safe to use a larger than specified operand because by promoting the
27105 // value nothing has changed from an arithmetic point of view.
27106 Val =
27107 convertToScalableVector(DAG, VT: ContainerDstVT.changeTypeToInteger(), V: Val);
27108 Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val,
27109 N3: DAG.getUNDEF(VT: ContainerDstVT));
27110 return convertFromScalableVector(DAG, VT, V: Val);
27111 } else {
27112 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27113 EltVT: ContainerDstVT.getVectorElementType());
27114 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
27115
27116 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
27117 Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT));
27118 Val = getSVESafeBitCast(VT: ContainerSrcVT, Op: Val, DAG);
27119 Val = convertFromScalableVector(DAG, VT: SrcVT, V: Val);
27120
27121 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val);
27122 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
27123 }
27124}
27125
27126SDValue
27127AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27128 SelectionDAG &DAG) const {
27129 SDLoc DL(Op);
27130 EVT OpVT = Op.getValueType();
27131 assert(OpVT.isScalableVector() &&
27132 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27133 SDValue Even = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27134 N2: Op.getOperand(i: 1));
27135 SDValue Odd = DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27136 N2: Op.getOperand(i: 1));
27137 return DAG.getMergeValues(Ops: {Even, Odd}, dl: DL);
27138}
27139
27140SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27141 SelectionDAG &DAG) const {
27142 SDLoc DL(Op);
27143 EVT OpVT = Op.getValueType();
27144 assert(OpVT.isScalableVector() &&
27145 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27146
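  // e.g. ZIP1(<A0,A1,A2,A3>, <B0,B1,B2,B3>) = <A0,B0,A1,B1> and
  // ZIP2(<A0,A1,A2,A3>, <B0,B1,B2,B3>) = <A2,B2,A3,B3>.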
27147 SDValue Lo = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27148 N2: Op.getOperand(i: 1));
27149 SDValue Hi = DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27150 N2: Op.getOperand(i: 1));
27151 return DAG.getMergeValues(Ops: {Lo, Hi}, dl: DL);
27152}
27153
27154SDValue
27155AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27156 SelectionDAG &DAG) const {
27157 EVT VT = Op.getValueType();
27158 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27159
27160 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27161 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27162 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
27163
27164 SDLoc DL(Op);
27165 SDValue Val = Op.getOperand(i: 0);
27166 EVT SrcVT = Val.getValueType();
27167 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27168 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
27169
27170 if (VT.bitsGT(VT: SrcVT)) {
27171 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27172 EltVT: ContainerSrcVT.getVectorElementType());
27173 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27174
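    // Reinterpret the FP input as integers of the source width, any-extend
    // each element into a destination-sized lane, then view the widened value
    // as an unpacked FP vector so the conversion produces the wide integer
    // result directly.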
27175 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val);
27176 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: Val);
27177
27178 Val = convertToScalableVector(DAG, VT: ContainerDstVT, V: Val);
27179 Val = getSVESafeBitCast(VT: CvtVT, Op: Val, DAG);
27180 Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val,
27181 N3: DAG.getUNDEF(VT: ContainerDstVT));
27182 return convertFromScalableVector(DAG, VT, V: Val);
27183 } else {
27184 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27185 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
27186
27187 // Safe to use a larger than specified result since an fp_to_int where the
27188 // result doesn't fit into the destination is undefined.
27189 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
27190 Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT));
27191 Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val);
27192
27193 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Val);
27194 }
27195}
27196
27197static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
27198 ArrayRef<int> ShuffleMask, EVT VT,
27199 EVT ContainerVT, SelectionDAG &DAG) {
27200 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27201 SDLoc DL(Op);
27202 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27203 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27204 bool IsSingleOp =
27205 ShuffleVectorInst::isSingleSourceMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size());
27206
27207 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27208 MinSVESize = 128;
27209
27210 // Ignore the two-operand case if there is no SVE2 or if not all index
27211 // values can be represented.
27212 if (!IsSingleOp && !Subtarget.hasSVE2())
27213 return SDValue();
27214
27215 EVT VTOp1 = Op.getOperand(i: 0).getValueType();
27216 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27217 unsigned IndexLen = MinSVESize / BitsPerElt;
27218 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27219 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27220 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27221 EVT MaskType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MaskEltType, NumElements: IndexLen);
27222 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27223 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27224 "Incorrectly legalised shuffle operation");
27225
27226 SmallVector<SDValue, 8> TBLMask;
27227 // If MinSVESize is not equal to MaxSVESize then we need to know which
27228 // TBL mask element needs adjustment.
27229 SmallVector<SDValue, 8> AddRuntimeVLMask;
27230
27231 // Bail out for 8-bit element types, because with a 2048-bit SVE register
27232 // size, 8 bits are only sufficient to index into the first source vector.
27233 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27234 return SDValue();
27235
27236 for (int Index : ShuffleMask) {
27237 // Handle poison index values.
27238 if (Index < 0)
27239 Index = 0;
27240 // If the mask refers to elements in the second operand, then we have to
27241 // offset the index by the number of elements in a vector. If this number
27242 // is not known at compile-time, we need to maintain a mask with 'VL' values
27243 // to add at runtime.
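    // e.g. with v4i32 operands and a 256-bit minimum SVE size (IndexLen == 8),
    // mask index 5 refers to element 1 of the second operand: it becomes
    // 5 + (8 - 4) == 9 when the register size is known exactly, or 1 plus one
    // runtime VL otherwise.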
27244 if ((unsigned)Index >= ElementsPerVectorReg) {
27245 if (MinMaxEqual) {
27246 Index += IndexLen - ElementsPerVectorReg;
27247 } else {
27248 Index = Index - ElementsPerVectorReg;
27249 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27250 }
27251 } else if (!MinMaxEqual)
27252 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27253 // For 8-bit elements with 1024-bit SVE registers, where MaxOffset equals
27254 // 255, this might point to the last element in the second operand
27255 // of the shufflevector, so reject this transform.
27256 if ((unsigned)Index >= MaxOffset)
27257 return SDValue();
27258 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27259 }
27260
27261 // Choosing an out-of-range index leads to the lane being zeroed, whereas an
27262 // index of zero would instead duplicate the first lane into the padding
27263 // elements. For i8 elements an out-of-range index could still be valid
27264 // for a 2048-bit vector register size.
27265 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27266 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27267 if (!MinMaxEqual)
27268 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27269 }
27270
27271 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskType);
27272 SDValue VecMask =
27273 DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen));
27274 SDValue SVEMask = convertToScalableVector(DAG, VT: MaskContainerVT, V: VecMask);
27275
27276 SDValue Shuffle;
27277 if (IsSingleOp)
27278 Shuffle =
27279 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27280 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27281 Op1, SVEMask);
27282 else if (Subtarget.hasSVE2()) {
27283 if (!MinMaxEqual) {
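      // TBL indices that refer to the second operand were recorded relative to
      // the start of that operand; add the runtime element count
      // (vscale * MinNumElts) to exactly those lanes, selected by the 0/1
      // entries of AddRuntimeVLMask.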
27284 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27285 SDValue VScale = (BitsPerElt == 64)
27286 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27287 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27288 SDValue VecMask =
27289 DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen));
27290 SDValue MulByMask = DAG.getNode(
27291 Opcode: ISD::MUL, DL, VT: MaskType,
27292 N1: DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MaskType, Operand: VScale),
27293 N2: DAG.getBuildVector(VT: MaskType, DL,
27294 Ops: ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27295 SDValue UpdatedVecMask =
27296 DAG.getNode(Opcode: ISD::ADD, DL, VT: MaskType, N1: VecMask, N2: MulByMask);
27297 SVEMask = convertToScalableVector(
27298 DAG, VT: getContainerForFixedLengthVector(DAG, VT: MaskType), V: UpdatedVecMask);
27299 }
27300 Shuffle =
27301 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27302 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27303 Op1, Op2, SVEMask);
27304 }
27305 Shuffle = convertFromScalableVector(DAG, VT, V: Shuffle);
27306 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle);
27307}
27308
27309SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27310 SDValue Op, SelectionDAG &DAG) const {
27311 EVT VT = Op.getValueType();
27312 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27313
27314 auto *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
27315 auto ShuffleMask = SVN->getMask();
27316
27317 SDLoc DL(Op);
27318 SDValue Op1 = Op.getOperand(i: 0);
27319 SDValue Op2 = Op.getOperand(i: 1);
27320
27321 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27322 Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op1);
27323 Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op2);
27324
27325 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27326 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27327 return MVT::i32;
27328 return ScalarTy;
27329 };
27330
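  // Lower splat shuffles by extracting the splatted lane as a scalar and
  // re-broadcasting it with SPLAT_VECTOR.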
27331 if (SVN->isSplat()) {
27332 unsigned Lane = std::max(a: 0, b: SVN->getSplatIndex());
27333 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27334 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27335 DAG.getConstant(Lane, DL, MVT::i64));
27336 Op = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: ContainerVT, Operand: SplatEl);
27337 return convertFromScalableVector(DAG, VT, V: Op);
27338 }
27339
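  // An EXT-style shuffle whose split point is the last element produces the
  // final element of one operand followed by the leading elements of the
  // other; INSR implements this by shifting the second operand up one lane
  // and inserting the extracted scalar at lane 0.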
27340 bool ReverseEXT = false;
27341 unsigned Imm;
27342 if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm) &&
27343 Imm == VT.getVectorNumElements() - 1) {
27344 if (ReverseEXT)
27345 std::swap(a&: Op1, b&: Op2);
27346 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27347 SDValue Scalar = DAG.getNode(
27348 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27349 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27350 Op = DAG.getNode(Opcode: AArch64ISD::INSR, DL, VT: ContainerVT, N1: Op2, N2: Scalar);
27351 return convertFromScalableVector(DAG, VT, V: Op);
27352 }
27353
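  // A shuffle that reverses the order of elements within each LaneSize-bit
  // block maps onto REVB/REVH/REVW: bitcast to a vector of LaneSize-bit
  // elements and reverse the sub-elements inside each one.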
27354 for (unsigned LaneSize : {64U, 32U, 16U}) {
27355 if (isREVMask(M: ShuffleMask, VT, BlockSize: LaneSize)) {
27356 EVT NewVT =
27357 getPackedSVEVectorVT(VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: LaneSize));
27358 unsigned RevOp;
27359 unsigned EltSz = VT.getScalarSizeInBits();
27360 if (EltSz == 8)
27361 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
27362 else if (EltSz == 16)
27363 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
27364 else
27365 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
27366
27367 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: NewVT, Operand: Op1);
27368 Op = LowerToPredicatedOp(Op, DAG, NewOp: RevOp);
27369 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Op);
27370 return convertFromScalableVector(DAG, VT, V: Op);
27371 }
27372 }
27373
27374 if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
27375 isREVMask(M: ShuffleMask, VT, BlockSize: 128)) {
27376 if (!VT.isFloatingPoint())
27377 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::REVD_MERGE_PASSTHRU);
27378
27379 EVT NewVT = getPackedSVEVectorVT(VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: 64));
27380 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: NewVT, Operand: Op1);
27381 Op = LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::REVD_MERGE_PASSTHRU);
27382 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Op);
27383 return convertFromScalableVector(DAG, VT, V: Op);
27384 }
27385
27386 unsigned WhichResult;
27387 if (isZIPMask(M: ShuffleMask, VT, WhichResultOut&: WhichResult) && WhichResult == 0)
27388 return convertFromScalableVector(
27389 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op2));
27390
27391 if (isTRNMask(M: ShuffleMask, VT, WhichResult)) {
27392 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27393 return convertFromScalableVector(
27394 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2));
27395 }
27396
27397 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult == 0)
27398 return convertFromScalableVector(
27399 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op1));
27400
27401 if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
27402 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27403 return convertFromScalableVector(
27404 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1));
27405 }
27406
27407 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
27408 // represents the same logical operation as performed by a ZIP instruction. In
27409 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
27410 // equivalent to an AArch64 instruction. There's the extra component of
27411 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
27412 // only operated on 64/128bit vector types that have a direct mapping to a
27413 // target register and so an exact mapping is implied.
27414 // However, when using SVE for fixed length vectors, most legal vector types
27415 // are actually sub-vectors of a larger SVE register. When mapping
27416 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
27417 // how the mask's indices translate. Specifically, when the mapping requires
27418 // an exact meaning for a specific vector index (e.g. Index X is the last
27419 // vector element in the register) then such mappings are often only safe when
27420 // the exact SVE register size is known. The main exception to this is when
27421 // indices are logically relative to the first element of either
27422 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
27423 // when converting from fixed-length to scalable vector types (i.e. the start
27424 // of a fixed length vector is always the start of a scalable vector).
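  // For example, a mask selecting the last element of a fixed-length vector
  // only maps to the last element of the underlying SVE register when the
  // register is exactly VT.getSizeInBits() wide; with a larger register that
  // element sits somewhere in the middle of the register.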
27425 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
27426 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
27427 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
27428 if (ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size()) &&
27429 Op2.isUndef()) {
27430 Op = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: ContainerVT, Operand: Op1);
27431 return convertFromScalableVector(DAG, VT, V: Op);
27432 }
27433
27434 if (isZIPMask(M: ShuffleMask, VT, WhichResultOut&: WhichResult) && WhichResult != 0)
27435 return convertFromScalableVector(
27436 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op2));
27437
27438 if (isUZPMask(M: ShuffleMask, VT, WhichResultOut&: WhichResult)) {
27439 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27440 return convertFromScalableVector(
27441 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2));
27442 }
27443
27444 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult != 0)
27445 return convertFromScalableVector(
27446 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op1));
27447
27448 if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
27449 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27450 return convertFromScalableVector(
27451 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1));
27452 }
27453 }
27454
27455 // Avoid producing a TBL instruction if we don't know the minimum SVE register
27456 // size, unless NEON is not available and we can assume the minimum SVE
27457 // register size is 128 bits.
27458 if (MinSVESize || !Subtarget->isNeonAvailable())
27459 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
27460 DAG);
27461
27462 return SDValue();
27463}
27464
27465SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
27466 SelectionDAG &DAG) const {
27467 SDLoc DL(Op);
27468 EVT InVT = Op.getValueType();
27469
27470 assert(VT.isScalableVector() && isTypeLegal(VT) &&
27471 InVT.isScalableVector() && isTypeLegal(InVT) &&
27472 "Only expect to cast between legal scalable vector types!");
27473 assert(VT.getVectorElementType() != MVT::i1 &&
27474 InVT.getVectorElementType() != MVT::i1 &&
27475 "For predicate bitcasts, use getSVEPredicateBitCast");
27476
27477 if (InVT == VT)
27478 return Op;
27479
27480 EVT PackedVT = getPackedSVEVectorVT(VT: VT.getVectorElementType());
27481 EVT PackedInVT = getPackedSVEVectorVT(VT: InVT.getVectorElementType());
27482
27483 // Safe bitcasting between unpacked vector types of different element counts
27484 // is currently unsupported because the following is missing the necessary
27485 // work to ensure the result's elements live where they're supposed to within
27486 // an SVE register.
27487 // 01234567
27488 // e.g. nxv2i32 = XX??XX??
27489 // nxv4f16 = X?X?X?X?
27490 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
27491 VT == PackedVT || InVT == PackedInVT) &&
27492 "Unexpected bitcast!");
27493
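  // For example, an nxv2f32 -> nxv2i64 cast packs the input first:
  //   nxv2f32 --REINTERPRET_CAST--> nxv4f32 --BITCAST--> nxv2i64
  // with no unpacking step because nxv2i64 is already packed.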
27494 // Pack input if required.
27495 if (InVT != PackedInVT)
27496 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: PackedInVT, Operand: Op);
27497
27498 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: PackedVT, Operand: Op);
27499
27500 // Unpack result if required.
27501 if (VT != PackedVT)
27502 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op);
27503
27504 return Op;
27505}
27506
27507bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
27508 SDValue N) const {
27509 return ::isAllActivePredicate(DAG, N);
27510}
27511
27512EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
27513 return ::getPromotedVTForPredicate(VT);
27514}
27515
27516bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
27517 SDValue Op, const APInt &OriginalDemandedBits,
27518 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
27519 unsigned Depth) const {
27520
27521 unsigned Opc = Op.getOpcode();
27522 switch (Opc) {
27523 case AArch64ISD::VSHL: {
27524 // Match (VSHL (VLSHR Val X) X)
27525 SDValue ShiftL = Op;
27526 SDValue ShiftR = Op->getOperand(Num: 0);
27527 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
27528 return false;
27529
27530 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
27531 return false;
27532
27533 unsigned ShiftLBits = ShiftL->getConstantOperandVal(Num: 1);
27534 unsigned ShiftRBits = ShiftR->getConstantOperandVal(Num: 1);
27535
27536 // Other cases can be handled as well, but this is not
27537 // implemented.
27538 if (ShiftRBits != ShiftLBits)
27539 return false;
27540
27541 unsigned ScalarSize = Op.getScalarValueSizeInBits();
27542 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
27543
27544 APInt ZeroBits = APInt::getLowBitsSet(numBits: ScalarSize, loBitsSet: ShiftLBits);
27545 APInt UnusedBits = ~OriginalDemandedBits;
27546
27547 if ((ZeroBits & UnusedBits) != ZeroBits)
27548 return false;
27549
27550 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
27551 // used - simplify to just Val.
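    // For example, with 32-bit elements and X == 8 the shift pair clears the
    // low 8 bits; if none of those bits are demanded, Val already provides all
    // the demanded bits.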
27552 return TLO.CombineTo(O: Op, N: ShiftR->getOperand(Num: 0));
27553 }
27554 case AArch64ISD::BICi: {
27555 // Fold BICi if all destination bits are already known to be zero.
27556 SDValue Op0 = Op.getOperand(i: 0);
27557 KnownBits KnownOp0 =
27558 TLO.DAG.computeKnownBits(Op: Op0, DemandedElts: OriginalDemandedElts, Depth: Depth + 1);
27559 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
27560 uint64_t BitsToClear = Op->getConstantOperandVal(Num: 1)
27561 << Op->getConstantOperandVal(Num: 2);
27562 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
27563 if (APInt(Known.getBitWidth(), BitsToClear)
27564 .isSubsetOf(RHS: AlreadyZeroedBitsToClear))
27565 return TLO.CombineTo(O: Op, N: Op0);
27566
27567 Known = KnownOp0 &
27568 KnownBits::makeConstant(C: APInt(Known.getBitWidth(), ~BitsToClear));
27569
27570 return false;
27571 }
27572 case ISD::INTRINSIC_WO_CHAIN: {
27573 if (auto ElementSize = IsSVECntIntrinsic(S: Op)) {
27574 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
27575 if (!MaxSVEVectorSizeInBits)
27576 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
27577 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
27578 // The SVE count intrinsics don't support the multiplier immediate so we
27579 // don't have to account for that here. The value returned may be slightly
27580 // over the true required bits, as this is based on the "ALL" pattern. The
27581 // other patterns are also exposed by these intrinsics, but they all
27582 // return a value that's strictly less than "ALL".
27583 unsigned RequiredBits = llvm::bit_width(Value: MaxElements);
27584 unsigned BitWidth = Known.Zero.getBitWidth();
27585 if (RequiredBits < BitWidth)
27586 Known.Zero.setHighBits(BitWidth - RequiredBits);
27587 return false;
27588 }
27589 }
27590 }
27591
27592 return TargetLowering::SimplifyDemandedBitsForTargetNode(
27593 Op, DemandedBits: OriginalDemandedBits, DemandedElts: OriginalDemandedElts, Known, TLO, Depth);
27594}
27595
27596bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
27597 return Op.getOpcode() == AArch64ISD::DUP ||
27598 Op.getOpcode() == AArch64ISD::MOVI ||
27599 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
27600 Op.getOperand(i: 0).getOpcode() == AArch64ISD::DUP) ||
27601 TargetLowering::isTargetCanonicalConstantNode(Op);
27602}
27603
27604bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
27605 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
27606 Subtarget->hasComplxNum();
27607}
27608
27609bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
27610 ComplexDeinterleavingOperation Operation, Type *Ty) const {
27611 auto *VTy = dyn_cast<VectorType>(Val: Ty);
27612 if (!VTy)
27613 return false;
27614
27615 // If the vector is scalable, SVE is enabled, implying support for complex
27616 // numbers. Otherwise, we need to ensure complex number support is available.
27617 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
27618 return false;
27619
27620 auto *ScalarTy = VTy->getScalarType();
27621 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
27622
27623 // We can only process vectors that have a bit size of 128 or higher (with an
27624 // additional 64 bits for Neon). Additionally, these vectors must have a
27625 // power-of-2 size, as we later split them into the smallest supported size
27626 // and merge them back together after applying the complex operation.
27627 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
27628 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
27629 !llvm::isPowerOf2_32(Value: VTyWidth))
27630 return false;
27631
27632 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
27633 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
27634 return 8 <= ScalarWidth && ScalarWidth <= 64;
27635 }
27636
27637 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
27638 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
27639}
27640
27641Value *AArch64TargetLowering::createComplexDeinterleavingIR(
27642 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
27643 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
27644 Value *Accumulator) const {
27645 VectorType *Ty = cast<VectorType>(Val: InputA->getType());
27646 bool IsScalable = Ty->isScalableTy();
27647 bool IsInt = Ty->getElementType()->isIntegerTy();
27648
27649 unsigned TyWidth =
27650 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
27651
27652 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
27653 "Vector type must be either 64 or a power of 2 that is at least 128");
27654
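  // Vectors wider than 128 bits are split in half and handled recursively,
  // e.g. a 256-bit <8 x float> becomes two <4 x float> halves whose partial
  // results are re-inserted into a 256-bit result.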
27655 if (TyWidth > 128) {
27656 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
27657 auto *HalfTy = VectorType::getHalfElementsVectorType(VTy: Ty);
27658 auto *LowerSplitA = B.CreateExtractVector(DstType: HalfTy, SrcVec: InputA, Idx: B.getInt64(C: 0));
27659 auto *LowerSplitB = B.CreateExtractVector(DstType: HalfTy, SrcVec: InputB, Idx: B.getInt64(C: 0));
27660 auto *UpperSplitA =
27661 B.CreateExtractVector(DstType: HalfTy, SrcVec: InputA, Idx: B.getInt64(C: Stride));
27662 auto *UpperSplitB =
27663 B.CreateExtractVector(DstType: HalfTy, SrcVec: InputB, Idx: B.getInt64(C: Stride));
27664 Value *LowerSplitAcc = nullptr;
27665 Value *UpperSplitAcc = nullptr;
27666 if (Accumulator) {
27667 LowerSplitAcc = B.CreateExtractVector(DstType: HalfTy, SrcVec: Accumulator, Idx: B.getInt64(C: 0));
27668 UpperSplitAcc =
27669 B.CreateExtractVector(DstType: HalfTy, SrcVec: Accumulator, Idx: B.getInt64(C: Stride));
27670 }
27671 auto *LowerSplitInt = createComplexDeinterleavingIR(
27672 B, OperationType, Rotation, InputA: LowerSplitA, InputB: LowerSplitB, Accumulator: LowerSplitAcc);
27673 auto *UpperSplitInt = createComplexDeinterleavingIR(
27674 B, OperationType, Rotation, InputA: UpperSplitA, InputB: UpperSplitB, Accumulator: UpperSplitAcc);
27675
27676 auto *Result = B.CreateInsertVector(DstType: Ty, SrcVec: PoisonValue::get(T: Ty), SubVec: LowerSplitInt,
27677 Idx: B.getInt64(C: 0));
27678 return B.CreateInsertVector(DstType: Ty, SrcVec: Result, SubVec: UpperSplitInt, Idx: B.getInt64(C: Stride));
27679 }
27680
27681 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
27682 if (Accumulator == nullptr)
27683 Accumulator = Constant::getNullValue(Ty);
27684
27685 if (IsScalable) {
27686 if (IsInt)
27687 return B.CreateIntrinsic(
27688 Intrinsic::aarch64_sve_cmla_x, Ty,
27689 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27690
27691 auto *Mask = B.getAllOnesMask(NumElts: Ty->getElementCount());
27692 return B.CreateIntrinsic(
27693 Intrinsic::aarch64_sve_fcmla, Ty,
27694 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27695 }
27696
27697 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
27698 Intrinsic::aarch64_neon_vcmla_rot90,
27699 Intrinsic::aarch64_neon_vcmla_rot180,
27700 Intrinsic::aarch64_neon_vcmla_rot270};
27701
27702
27703 return B.CreateIntrinsic(ID: IdMap[(int)Rotation], Types: Ty,
27704 Args: {Accumulator, InputA, InputB});
27705 }
27706
27707 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
27708 if (IsScalable) {
27709 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
27710 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
27711 if (IsInt)
27712 return B.CreateIntrinsic(
27713 Intrinsic::aarch64_sve_cadd_x, Ty,
27714 {InputA, InputB, B.getInt32((int)Rotation * 90)});
27715
27716 auto *Mask = B.getAllOnesMask(NumElts: Ty->getElementCount());
27717 return B.CreateIntrinsic(
27718 Intrinsic::aarch64_sve_fcadd, Ty,
27719 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
27720 }
27721 return nullptr;
27722 }
27723
27724 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
27725 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
27726 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
27727 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
27728 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
27729
27730 if (IntId == Intrinsic::not_intrinsic)
27731 return nullptr;
27732
27733 return B.CreateIntrinsic(ID: IntId, Types: Ty, Args: {InputA, InputB});
27734 }
27735
27736 return nullptr;
27737}
27738
27739bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
27740 unsigned Opc = N->getOpcode();
27741 if (ISD::isExtOpcode(Opcode: Opc)) {
27742 if (any_of(Range: N->uses(),
27743 P: [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
27744 return false;
27745 }
27746 return true;
27747}
27748
27749unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
27750 return Subtarget->getMinimumJumpTableEntries();
27751}
27752
27753MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
27754 CallingConv::ID CC,
27755 EVT VT) const {
27756 bool NonUnitFixedLengthVector =
27757 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27758 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27759 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
27760
27761 EVT VT1;
27762 MVT RegisterVT;
27763 unsigned NumIntermediates;
27764 getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT&: VT1, NumIntermediates,
27765 RegisterVT);
27766 return RegisterVT;
27767}
27768
27769unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
27770 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
27771 bool NonUnitFixedLengthVector =
27772 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27773 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27774 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
27775
27776 EVT VT1;
27777 MVT VT2;
27778 unsigned NumIntermediates;
27779 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT&: VT1,
27780 NumIntermediates, RegisterVT&: VT2);
27781}
27782
27783unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
27784 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
27785 unsigned &NumIntermediates, MVT &RegisterVT) const {
27786 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
27787 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
27788 if (!RegisterVT.isFixedLengthVector() ||
27789 RegisterVT.getFixedSizeInBits() <= 128)
27790 return NumRegs;
27791
27792 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
27793 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
27794 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
27795
27796 // A size mismatch here implies either type promotion or widening and would
27797 // have resulted in scalarisation if larger vectors had not been available.
27798 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
27799 EVT EltTy = VT.getVectorElementType();
27800 EVT NewVT = EVT::getVectorVT(Context, VT: EltTy, EC: ElementCount::getFixed(MinVal: 1));
27801 if (!isTypeLegal(VT: NewVT))
27802 NewVT = EltTy;
27803
27804 IntermediateVT = NewVT;
27805 NumIntermediates = VT.getVectorNumElements();
27806 RegisterVT = getRegisterType(Context, VT: NewVT);
27807 return NumIntermediates;
27808 }
27809
27810 // SVE VLS support does not introduce a new ABI so we should use NEON sized
27811 // types for vector arguments and returns.
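  // For example, with 512-bit SVE registers a legal v8i64 (RegisterVT v8i64)
  // is instead broken down into four v2i64 parts, matching the NEON-based ABI.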
27812
27813 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
27814 NumIntermediates *= NumSubRegs;
27815 NumRegs *= NumSubRegs;
27816
27817 switch (RegisterVT.getVectorElementType().SimpleTy) {
27818 default:
27819 llvm_unreachable("unexpected element type for vector");
27820 case MVT::i8:
27821 IntermediateVT = RegisterVT = MVT::v16i8;
27822 break;
27823 case MVT::i16:
27824 IntermediateVT = RegisterVT = MVT::v8i16;
27825 break;
27826 case MVT::i32:
27827 IntermediateVT = RegisterVT = MVT::v4i32;
27828 break;
27829 case MVT::i64:
27830 IntermediateVT = RegisterVT = MVT::v2i64;
27831 break;
27832 case MVT::f16:
27833 IntermediateVT = RegisterVT = MVT::v8f16;
27834 break;
27835 case MVT::f32:
27836 IntermediateVT = RegisterVT = MVT::v4f32;
27837 break;
27838 case MVT::f64:
27839 IntermediateVT = RegisterVT = MVT::v2f64;
27840 break;
27841 case MVT::bf16:
27842 IntermediateVT = RegisterVT = MVT::v8bf16;
27843 break;
27844 }
27845
27846 return NumRegs;
27847}
27848
27849bool AArch64TargetLowering::hasInlineStackProbe(
27850 const MachineFunction &MF) const {
27851 return !Subtarget->isTargetWindows() &&
27852 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
27853}
27854
27855#ifndef NDEBUG
27856void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
27857 switch (N->getOpcode()) {
27858 default:
27859 break;
27860 case AArch64ISD::SUNPKLO:
27861 case AArch64ISD::SUNPKHI:
27862 case AArch64ISD::UUNPKLO:
27863 case AArch64ISD::UUNPKHI: {
27864 assert(N->getNumValues() == 1 && "Expected one result!");
27865 assert(N->getNumOperands() == 1 && "Expected one operand!");
27866 EVT VT = N->getValueType(ResNo: 0);
27867 EVT OpVT = N->getOperand(Num: 0).getValueType();
27868 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
27869 VT.isInteger() && "Expected integer vectors!");
27870 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
27871 "Expected vectors of equal size!");
27872 // TODO: Enable assert once bogus creations have been fixed.
27873 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
27874 // "Expected result vector with half the lanes of its input!");
27875 break;
27876 }
27877 case AArch64ISD::TRN1:
27878 case AArch64ISD::TRN2:
27879 case AArch64ISD::UZP1:
27880 case AArch64ISD::UZP2:
27881 case AArch64ISD::ZIP1:
27882 case AArch64ISD::ZIP2: {
27883 assert(N->getNumValues() == 1 && "Expected one result!");
27884 assert(N->getNumOperands() == 2 && "Expected two operands!");
27885 EVT VT = N->getValueType(ResNo: 0);
27886 EVT Op0VT = N->getOperand(Num: 0).getValueType();
27887 EVT Op1VT = N->getOperand(Num: 1).getValueType();
27888 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
27889 "Expected vectors!");
27890 // TODO: Enable assert once bogus creations have been fixed.
27891 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
27892 break;
27893 }
27894 }
27895}
27896#endif
27897
