1//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AArch64.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AArch64GlobalISelUtils.h"
15#include "AArch64InstrInfo.h"
16#include "AArch64MachineFunctionInfo.h"
17#include "AArch64RegisterBankInfo.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
20#include "AArch64TargetMachine.h"
21#include "MCTargetDesc/AArch64AddressingModes.h"
22#include "MCTargetDesc/AArch64MCTargetDesc.h"
23#include "llvm/BinaryFormat/Dwarf.h"
24#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
25#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
26#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
27#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
28#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29#include "llvm/CodeGen/GlobalISel/Utils.h"
30#include "llvm/CodeGen/MachineBasicBlock.h"
31#include "llvm/CodeGen/MachineConstantPool.h"
32#include "llvm/CodeGen/MachineFrameInfo.h"
33#include "llvm/CodeGen/MachineFunction.h"
34#include "llvm/CodeGen/MachineInstr.h"
35#include "llvm/CodeGen/MachineInstrBuilder.h"
36#include "llvm/CodeGen/MachineMemOperand.h"
37#include "llvm/CodeGen/MachineOperand.h"
38#include "llvm/CodeGen/MachineRegisterInfo.h"
39#include "llvm/CodeGen/TargetOpcodes.h"
40#include "llvm/CodeGen/TargetRegisterInfo.h"
41#include "llvm/IR/Constants.h"
42#include "llvm/IR/DerivedTypes.h"
43#include "llvm/IR/Instructions.h"
44#include "llvm/IR/IntrinsicsAArch64.h"
45#include "llvm/IR/PatternMatch.h"
46#include "llvm/IR/Type.h"
47#include "llvm/Pass.h"
48#include "llvm/Support/Debug.h"
49#include "llvm/Support/raw_ostream.h"
50#include <optional>
51
52#define DEBUG_TYPE "aarch64-isel"
53
54using namespace llvm;
55using namespace MIPatternMatch;
56using namespace AArch64GISelUtils;
57
58namespace llvm {
59class BlockFrequencyInfo;
60class ProfileSummaryInfo;
61}
62
63namespace {
64
65#define GET_GLOBALISEL_PREDICATE_BITSET
66#include "AArch64GenGlobalISel.inc"
67#undef GET_GLOBALISEL_PREDICATE_BITSET
68
70class AArch64InstructionSelector : public InstructionSelector {
71public:
72 AArch64InstructionSelector(const AArch64TargetMachine &TM,
73 const AArch64Subtarget &STI,
74 const AArch64RegisterBankInfo &RBI);
75
76 bool select(MachineInstr &I) override;
77 static const char *getName() { return DEBUG_TYPE; }
78
79 void setupMF(MachineFunction &MF, GISelKnownBits *KB,
80 CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
81 BlockFrequencyInfo *BFI) override {
82 InstructionSelector::setupMF(mf&: MF, kb: KB, covinfo: CoverageInfo, psi: PSI, bfi: BFI);
83 MIB.setMF(MF);
84
85 // hasFnAttribute() is expensive to call on every BRCOND selection, so
86 // cache it here for each run of the selector.
87 ProduceNonFlagSettingCondBr =
88 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
89 MFReturnAddr = Register();
90
91 processPHIs(MF);
92 }
93
94private:
95 /// tblgen-erated 'select' implementation, used as the initial selector for
96 /// the patterns that don't require complex C++.
97 bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
98
99 // A lowering phase that runs before any selection attempts.
100 // Returns true if the instruction was modified.
101 bool preISelLower(MachineInstr &I);
102
103 // An early selection function that runs before the selectImpl() call.
104 bool earlySelect(MachineInstr &I);
105
106 /// Save state that is shared between select calls, call select on \p I and
107 /// then restore the saved state. This can be used to recursively call select
108 /// within a select call.
109 bool selectAndRestoreState(MachineInstr &I);
110
111 // Do some preprocessing of G_PHIs before we begin selection.
112 void processPHIs(MachineFunction &MF);
113
114 bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
115
116 /// Eliminate same-sized cross-bank copies into stores before selectImpl().
117 bool contractCrossBankCopyIntoStore(MachineInstr &I,
118 MachineRegisterInfo &MRI);
119
120 bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
121
122 bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
123 MachineRegisterInfo &MRI) const;
124 bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
125 MachineRegisterInfo &MRI) const;
126
127 ///@{
128 /// Helper functions for selectCompareBranch.
129 bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
130 MachineIRBuilder &MIB) const;
131 bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
132 MachineIRBuilder &MIB) const;
133 bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
134 MachineIRBuilder &MIB) const;
135 bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
136 MachineBasicBlock *DstMBB,
137 MachineIRBuilder &MIB) const;
138 ///@}
139
140 bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
141 MachineRegisterInfo &MRI);
142
143 bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
144 bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
145
146 // Helper to generate an equivalent of scalar_to_vector into a new register,
147 // returned via 'Dst'.
148 MachineInstr *emitScalarToVector(unsigned EltSize,
149 const TargetRegisterClass *DstRC,
150 Register Scalar,
151 MachineIRBuilder &MIRBuilder) const;
152 /// Helper to narrow vector that was widened by emitScalarToVector.
153 /// Copy the lowest part of a 128-bit or 64-bit vector to a 64-bit or 32-bit
154 /// vector, respectively.
155 MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
156 MachineIRBuilder &MIRBuilder,
157 MachineRegisterInfo &MRI) const;
158
159 /// Emit a lane insert into \p DstReg, or a new vector register if
160 /// std::nullopt is provided.
161 ///
162 /// The lane inserted into is defined by \p LaneIdx. The vector source
163 /// register is given by \p SrcReg. The register containing the element is
164 /// given by \p EltReg.
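  ///
  /// A hedged usage sketch (the register names below are hypothetical):
  /// \code
  ///   // Insert EltReg into lane 1 of SrcReg, writing a fresh vector register.
  ///   MachineInstr *Ins =
  ///       emitLaneInsert(std::nullopt, SrcReg, EltReg, /*LaneIdx=*/1, RB, MIB);
  /// \endcode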
165 MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
166 Register EltReg, unsigned LaneIdx,
167 const RegisterBank &RB,
168 MachineIRBuilder &MIRBuilder) const;
169
170 /// Emit a sequence of instructions representing a constant \p CV for a
171 /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
172 ///
173 /// \returns the last instruction in the sequence on success, and nullptr
174 /// otherwise.
175 MachineInstr *emitConstantVector(Register Dst, Constant *CV,
176 MachineIRBuilder &MIRBuilder,
177 MachineRegisterInfo &MRI);
178
179 MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
180 MachineIRBuilder &MIRBuilder);
181
182 MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
183 MachineIRBuilder &MIRBuilder, bool Inv);
184
185 MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
186 MachineIRBuilder &MIRBuilder, bool Inv);
187 MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
188 MachineIRBuilder &MIRBuilder);
189 MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
190 MachineIRBuilder &MIRBuilder, bool Inv);
191 MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
192 MachineIRBuilder &MIRBuilder);
193
194 bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
195 bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
196 MachineRegisterInfo &MRI);
197 /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
198 /// SUBREG_TO_REG.
199 bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
200 bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
201 bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
202 bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
203
204 bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
205 bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
206 bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
207 bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
208
209 /// Helper function to select vector load intrinsics like
210 /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
211 /// \p Opc is the opcode that the selected instruction should use.
212 /// \p NumVecs is the number of vector destinations for the instruction.
213 /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
214 bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
215 MachineInstr &I);
216 bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
217 MachineInstr &I);
218 void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
219 unsigned Opc);
220 bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
221 unsigned Opc);
222 bool selectIntrinsicWithSideEffects(MachineInstr &I,
223 MachineRegisterInfo &MRI);
224 bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
225 bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
226 bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
227 bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
228 bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
229 bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
230 bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
231 bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
232
233 bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
234 bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
235 bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);
236
237 unsigned emitConstantPoolEntry(const Constant *CPVal,
238 MachineFunction &MF) const;
239 MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
240 MachineIRBuilder &MIRBuilder) const;
241
242 // Emit a vector concat operation.
243 MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
244 Register Op2,
245 MachineIRBuilder &MIRBuilder) const;
246
247 // Emit an integer compare between LHS and RHS, which checks for Predicate.
248 MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
249 MachineOperand &Predicate,
250 MachineIRBuilder &MIRBuilder) const;
251
252 /// Emit a floating point comparison between \p LHS and \p RHS.
253 /// \p Pred if given is the intended predicate to use.
254 MachineInstr *
255 emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
256 std::optional<CmpInst::Predicate> = std::nullopt) const;
257
258 MachineInstr *
259 emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
260 std::initializer_list<llvm::SrcOp> SrcOps,
261 MachineIRBuilder &MIRBuilder,
262 const ComplexRendererFns &RenderFns = std::nullopt) const;
263 /// Helper function to emit an add or sub instruction.
264 ///
265 /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants listed
266 /// below, in a specific order.
267 ///
268 /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
269 ///
270 /// \code
271 /// const std::array<std::array<unsigned, 2>, 5> Table {
272 /// {{AArch64::ADDXri, AArch64::ADDWri},
273 /// {AArch64::ADDXrs, AArch64::ADDWrs},
274 /// {AArch64::ADDXrr, AArch64::ADDWrr},
275 /// {AArch64::SUBXri, AArch64::SUBWri},
276 /// {AArch64::ADDXrx, AArch64::ADDWrx}}};
277 /// \endcode
278 ///
279 /// Each row in the table corresponds to a different addressing mode. Each
280 /// column corresponds to a different register size.
281 ///
282 /// \attention Rows must be structured as follows:
283 /// - Row 0: The ri opcode variants
284 /// - Row 1: The rs opcode variants
285 /// - Row 2: The rr opcode variants
286 /// - Row 3: The ri opcode variants for negative immediates
287 /// - Row 4: The rx opcode variants
288 ///
289 /// \attention Columns must be structured as follows:
290 /// - Column 0: The 64-bit opcode variants
291 /// - Column 1: The 32-bit opcode variants
292 ///
293 /// \p Dst is the destination register of the binop to emit.
294 /// \p LHS is the left-hand operand of the binop to emit.
295 /// \p RHS is the right-hand operand of the binop to emit.
296 MachineInstr *emitAddSub(
297 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
298 Register Dst, MachineOperand &LHS, MachineOperand &RHS,
299 MachineIRBuilder &MIRBuilder) const;
300 MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
301 MachineOperand &RHS,
302 MachineIRBuilder &MIRBuilder) const;
303 MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
304 MachineIRBuilder &MIRBuilder) const;
305 MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
306 MachineIRBuilder &MIRBuilder) const;
307 MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
308 MachineIRBuilder &MIRBuilder) const;
309 MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
310 MachineIRBuilder &MIRBuilder) const;
311 MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
312 MachineIRBuilder &MIRBuilder) const;
313 MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
314 MachineIRBuilder &MIRBuilder) const;
315 MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
316 AArch64CC::CondCode CC,
317 MachineIRBuilder &MIRBuilder) const;
318 MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
319 const RegisterBank &DstRB, LLT ScalarTy,
320 Register VecReg, unsigned LaneIdx,
321 MachineIRBuilder &MIRBuilder) const;
322 MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
323 AArch64CC::CondCode Pred,
324 MachineIRBuilder &MIRBuilder) const;
325 /// Emit a CSet for a FP compare.
326 ///
327 /// \p Dst is expected to be a 32-bit scalar register.
328 MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
329 MachineIRBuilder &MIRBuilder) const;
330
331 /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
332 /// Might elide the instruction if the previous instruction already sets NZCV
333 /// correctly.
334 MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);
335
336 /// Emit the overflow op for \p Opcode.
337 ///
338 /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
339 /// G_USUBO, etc.
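  ///
  /// A hedged usage sketch (the opcode/condition pairing shown is illustrative):
  /// \code
  ///   // Unsigned add overflow is typically read back from the carry flag.
  ///   auto [AddsMI, CC] = emitOverflowOp(TargetOpcode::G_UADDO, Dst, LHS, RHS, MIB);
  ///   // CC would be AArch64CC::HS in that case.
  /// \endcode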
340 std::pair<MachineInstr *, AArch64CC::CondCode>
341 emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
342 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
343
344 bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);
345
346 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
347 /// In some cases this is even possible with OR operations in the expression.
348 MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
349 MachineIRBuilder &MIB) const;
350 MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
351 CmpInst::Predicate CC,
352 AArch64CC::CondCode Predicate,
353 AArch64CC::CondCode OutCC,
354 MachineIRBuilder &MIB) const;
355 MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
356 bool Negate, Register CCOp,
357 AArch64CC::CondCode Predicate,
358 MachineIRBuilder &MIB) const;
359
360 /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
361 /// \p IsNegative is true if the test should be "not zero".
362 /// This will also optimize the test bit instruction when possible.
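  ///
  /// A hedged usage sketch (names are hypothetical):
  /// \code
  ///   // Branch to DstMBB if bit 3 of TestReg is set (IsNegative == true).
  ///   emitTestBit(TestReg, /*Bit=*/3, /*IsNegative=*/true, DstMBB, MIB); // -> TBNZ
  /// \endcode
  /// With IsNegative == false this would emit a TBZ instead.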
363 MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
364 MachineBasicBlock *DstMBB,
365 MachineIRBuilder &MIB) const;
366
367 /// Emit a CB(N)Z instruction which branches to \p DestMBB.
368 MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
369 MachineBasicBlock *DestMBB,
370 MachineIRBuilder &MIB) const;
371
372 // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
373 // We use these manually instead of using the importer since it doesn't
374 // support SDNodeXForm.
375 ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
376 ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
377 ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
378 ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
379
380 ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
381 ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
382 ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
383
384 ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
385 unsigned Size) const;
386
387 ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
388 return selectAddrModeUnscaled(Root, Size: 1);
389 }
390 ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
391 return selectAddrModeUnscaled(Root, Size: 2);
392 }
393 ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
394 return selectAddrModeUnscaled(Root, Size: 4);
395 }
396 ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
397 return selectAddrModeUnscaled(Root, Size: 8);
398 }
399 ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
400 return selectAddrModeUnscaled(Root, Size: 16);
401 }
402
403 /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
404 /// from complex pattern matchers like selectAddrModeIndexed().
405 ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
406 MachineRegisterInfo &MRI) const;
407
408 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
409 unsigned Size) const;
410 template <int Width>
411 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
412 return selectAddrModeIndexed(Root, Size: Width / 8);
413 }
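  // For example (illustrative): selectAddrModeIndexed<64> corresponds to the
  // scaled unsigned-immediate addressing mode used by 8-byte accesses such as
  // LDRXui.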
414
415 bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
416 const MachineRegisterInfo &MRI) const;
417 ComplexRendererFns
418 selectAddrModeShiftedExtendXReg(MachineOperand &Root,
419 unsigned SizeInBytes) const;
420
421 /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
422 /// or not a shift + extend should be folded into an addressing mode. Returns
423 /// std::nullopt when this is not profitable or possible.
424 ComplexRendererFns
425 selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
426 MachineOperand &Offset, unsigned SizeInBytes,
427 bool WantsExt) const;
428 ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
429 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
430 unsigned SizeInBytes) const;
431 template <int Width>
432 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
433 return selectAddrModeXRO(Root, SizeInBytes: Width / 8);
434 }
435
436 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
437 unsigned SizeInBytes) const;
438 template <int Width>
439 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
440 return selectAddrModeWRO(Root, SizeInBytes: Width / 8);
441 }
442
443 ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
444 bool AllowROR = false) const;
445
446 ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
447 return selectShiftedRegister(Root);
448 }
449
450 ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
451 return selectShiftedRegister(Root, AllowROR: true);
452 }
453
454 /// Given an extend instruction, determine the correct shift-extend type for
455 /// that instruction.
456 ///
457 /// If the instruction is going to be used in a load or store, pass
458 /// \p IsLoadStore = true.
459 AArch64_AM::ShiftExtendType
460 getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
461 bool IsLoadStore = false) const;
462
463 /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
464 ///
465 /// \returns Either \p Reg if no change was necessary, or the new register
466 /// created by moving \p Reg.
467 ///
468 /// Note: This uses emitCopy right now.
469 Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
470 MachineIRBuilder &MIB) const;
471
472 ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
473
474 ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;
475
476 void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
477 int OpIdx = -1) const;
478 void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
479 int OpIdx = -1) const;
480 void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
481 int OpIdx = -1) const;
482 void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
483 int OpIdx = -1) const;
484 void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
485 int OpIdx = -1) const;
486 void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
487 int OpIdx = -1) const;
488 void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
489 const MachineInstr &MI,
490 int OpIdx = -1) const;
491
492 // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
493 void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
494
495 // Optimization methods.
496 bool tryOptSelect(GSelect &Sel);
497 bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
498 MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
499 MachineOperand &Predicate,
500 MachineIRBuilder &MIRBuilder) const;
501
502 /// Return true if \p MI is a load or store of \p NumBytes bytes.
503 bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
504
505 /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
506 /// register zeroed out. In other words, the result of MI has been explicitly
507 /// zero extended.
508 bool isDef32(const MachineInstr &MI) const;
509
510 const AArch64TargetMachine &TM;
511 const AArch64Subtarget &STI;
512 const AArch64InstrInfo &TII;
513 const AArch64RegisterInfo &TRI;
514 const AArch64RegisterBankInfo &RBI;
515
516 bool ProduceNonFlagSettingCondBr = false;
517
518 // Some cached values used during selection.
519 // We use LR as a live-in register, and we keep track of it here as it can be
520 // clobbered by calls.
521 Register MFReturnAddr;
522
523 MachineIRBuilder MIB;
524
525#define GET_GLOBALISEL_PREDICATES_DECL
526#include "AArch64GenGlobalISel.inc"
527#undef GET_GLOBALISEL_PREDICATES_DECL
528
529// We declare the temporaries used by selectImpl() in the class to minimize the
530// cost of constructing placeholder values.
531#define GET_GLOBALISEL_TEMPORARIES_DECL
532#include "AArch64GenGlobalISel.inc"
533#undef GET_GLOBALISEL_TEMPORARIES_DECL
534};
535
536} // end anonymous namespace
537
538#define GET_GLOBALISEL_IMPL
539#include "AArch64GenGlobalISel.inc"
540#undef GET_GLOBALISEL_IMPL
541
542AArch64InstructionSelector::AArch64InstructionSelector(
543 const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
544 const AArch64RegisterBankInfo &RBI)
545 : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
546 RBI(RBI),
547#define GET_GLOBALISEL_PREDICATES_INIT
548#include "AArch64GenGlobalISel.inc"
549#undef GET_GLOBALISEL_PREDICATES_INIT
550#define GET_GLOBALISEL_TEMPORARIES_INIT
551#include "AArch64GenGlobalISel.inc"
552#undef GET_GLOBALISEL_TEMPORARIES_INIT
553{
554}
555
556// FIXME: This should be target-independent, inferred from the types declared
557// for each class in the bank.
558//
559/// Given a register bank, and a type, return the smallest register class that
560/// can represent that combination.
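/// For example (illustrative; GPRBank/FPRBank stand in for the corresponding
/// register-bank objects):
/// \code
///   getRegClassForTypeOnBank(LLT::scalar(64), GPRBank);          // &AArch64::GPR64RegClass
///   getRegClassForTypeOnBank(LLT::fixed_vector(2, 32), FPRBank); // &AArch64::FPR64RegClass
/// \endcode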
561static const TargetRegisterClass *
562getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
563 bool GetAllRegSet = false) {
564 if (RB.getID() == AArch64::GPRRegBankID) {
565 if (Ty.getSizeInBits() <= 32)
566 return GetAllRegSet ? &AArch64::GPR32allRegClass
567 : &AArch64::GPR32RegClass;
568 if (Ty.getSizeInBits() == 64)
569 return GetAllRegSet ? &AArch64::GPR64allRegClass
570 : &AArch64::GPR64RegClass;
571 if (Ty.getSizeInBits() == 128)
572 return &AArch64::XSeqPairsClassRegClass;
573 return nullptr;
574 }
575
576 if (RB.getID() == AArch64::FPRRegBankID) {
577 switch (Ty.getSizeInBits()) {
578 case 8:
579 return &AArch64::FPR8RegClass;
580 case 16:
581 return &AArch64::FPR16RegClass;
582 case 32:
583 return &AArch64::FPR32RegClass;
584 case 64:
585 return &AArch64::FPR64RegClass;
586 case 128:
587 return &AArch64::FPR128RegClass;
588 }
589 return nullptr;
590 }
591
592 return nullptr;
593}
594
595/// Given a register bank, and size in bits, return the smallest register class
596/// that can represent that combination.
597static const TargetRegisterClass *
598getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
599 bool GetAllRegSet = false) {
600 unsigned RegBankID = RB.getID();
601
602 if (RegBankID == AArch64::GPRRegBankID) {
603 if (SizeInBits <= 32)
604 return GetAllRegSet ? &AArch64::GPR32allRegClass
605 : &AArch64::GPR32RegClass;
606 if (SizeInBits == 64)
607 return GetAllRegSet ? &AArch64::GPR64allRegClass
608 : &AArch64::GPR64RegClass;
609 if (SizeInBits == 128)
610 return &AArch64::XSeqPairsClassRegClass;
611 }
612
613 if (RegBankID == AArch64::FPRRegBankID) {
614 switch (SizeInBits) {
615 default:
616 return nullptr;
617 case 8:
618 return &AArch64::FPR8RegClass;
619 case 16:
620 return &AArch64::FPR16RegClass;
621 case 32:
622 return &AArch64::FPR32RegClass;
623 case 64:
624 return &AArch64::FPR64RegClass;
625 case 128:
626 return &AArch64::FPR128RegClass;
627 }
628 }
629
630 return nullptr;
631}
632
633/// Returns the correct subregister to use for a given register class.
634static bool getSubRegForClass(const TargetRegisterClass *RC,
635 const TargetRegisterInfo &TRI, unsigned &SubReg) {
636 switch (TRI.getRegSizeInBits(RC: *RC)) {
637 case 8:
638 SubReg = AArch64::bsub;
639 break;
640 case 16:
641 SubReg = AArch64::hsub;
642 break;
643 case 32:
644 if (RC != &AArch64::FPR32RegClass)
645 SubReg = AArch64::sub_32;
646 else
647 SubReg = AArch64::ssub;
648 break;
649 case 64:
650 SubReg = AArch64::dsub;
651 break;
652 default:
653 LLVM_DEBUG(
654 dbgs() << "Couldn't find appropriate subregister for register class.");
655 return false;
656 }
657
658 return true;
659}
660
661/// Returns the minimum size the given register bank can hold.
662static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
663 switch (RB.getID()) {
664 case AArch64::GPRRegBankID:
665 return 32;
666 case AArch64::FPRRegBankID:
667 return 8;
668 default:
669 llvm_unreachable("Tried to get minimum size for unknown register bank.");
670 }
671}
672
673/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
674/// Helper function for functions like createDTuple and createQTuple.
675///
676/// \p RegClassIDs - The list of register class IDs available for some tuple of
677/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
678/// expected to contain between 2 and 4 tuple classes.
679///
680/// \p SubRegs - The list of subregister classes associated with each register
681/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
682/// subregister class. The index of each subregister class is expected to
683/// correspond with the index of each register class.
684///
685/// \returns Either the destination register of REG_SEQUENCE instruction that
686/// was created, or the 0th element of \p Regs if \p Regs contains a single
687/// element.
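/// For example (illustrative), createDTuple below passes the DD/DDD/DDDD class
/// IDs with the dsub0..dsub3 subregister indices, so three D registers become
/// a single DDD-class register defined by one REG_SEQUENCE.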
688static Register createTuple(ArrayRef<Register> Regs,
689 const unsigned RegClassIDs[],
690 const unsigned SubRegs[], MachineIRBuilder &MIB) {
691 unsigned NumRegs = Regs.size();
692 if (NumRegs == 1)
693 return Regs[0];
694 assert(NumRegs >= 2 && NumRegs <= 4 &&
695 "Only support between two and 4 registers in a tuple!");
696 const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
697 auto *DesiredClass = TRI->getRegClass(i: RegClassIDs[NumRegs - 2]);
698 auto RegSequence =
699 MIB.buildInstr(Opc: TargetOpcode::REG_SEQUENCE, DstOps: {DesiredClass}, SrcOps: {});
700 for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
701 RegSequence.addUse(RegNo: Regs[I]);
702 RegSequence.addImm(Val: SubRegs[I]);
703 }
704 return RegSequence.getReg(Idx: 0);
705}
706
707/// Create a tuple of D-registers using the registers in \p Regs.
708static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
709 static const unsigned RegClassIDs[] = {
710 AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
711 static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
712 AArch64::dsub2, AArch64::dsub3};
713 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
714}
715
716/// Create a tuple of Q-registers using the registers in \p Regs.
717static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
718 static const unsigned RegClassIDs[] = {
719 AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
720 static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
721 AArch64::qsub2, AArch64::qsub3};
722 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
723}
724
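/// Extract a 64-bit immediate from \p Root, which may be an immediate operand,
/// a ConstantInt operand, or a register that can be traced back to a constant.
/// \returns std::nullopt if no immediate could be found.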
725static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
726 auto &MI = *Root.getParent();
727 auto &MBB = *MI.getParent();
728 auto &MF = *MBB.getParent();
729 auto &MRI = MF.getRegInfo();
730 uint64_t Immed;
731 if (Root.isImm())
732 Immed = Root.getImm();
733 else if (Root.isCImm())
734 Immed = Root.getCImm()->getZExtValue();
735 else if (Root.isReg()) {
736 auto ValAndVReg =
737 getIConstantVRegValWithLookThrough(VReg: Root.getReg(), MRI, LookThroughInstrs: true);
738 if (!ValAndVReg)
739 return std::nullopt;
740 Immed = ValAndVReg->Value.getSExtValue();
741 } else
742 return std::nullopt;
743 return Immed;
744}
745
746/// Check whether \p I is a currently unsupported binary operation:
747/// - it has an unsized type
748/// - an operand is not a vreg
749/// - not all of its operands are in the same bank
750/// These are checks that should someday live in the verifier, but right now,
751/// these are mostly limitations of the aarch64 selector.
752static bool unsupportedBinOp(const MachineInstr &I,
753 const AArch64RegisterBankInfo &RBI,
754 const MachineRegisterInfo &MRI,
755 const AArch64RegisterInfo &TRI) {
756 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
757 if (!Ty.isValid()) {
758 LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
759 return true;
760 }
761
762 const RegisterBank *PrevOpBank = nullptr;
763 for (auto &MO : I.operands()) {
764 // FIXME: Support non-register operands.
765 if (!MO.isReg()) {
766 LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
767 return true;
768 }
769
770 // FIXME: Can generic operations have physical registers operands? If
771 // so, this will need to be taught about that, and we'll need to get the
772 // bank out of the minimal class for the register.
773 // Either way, this needs to be documented (and possibly verified).
774 if (!MO.getReg().isVirtual()) {
775 LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
776 return true;
777 }
778
779 const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
780 if (!OpBank) {
781 LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
782 return true;
783 }
784
785 if (PrevOpBank && OpBank != PrevOpBank) {
786 LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
787 return true;
788 }
789 PrevOpBank = OpBank;
790 }
791 return false;
792}
793
794/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
795/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
796/// and of size \p OpSize.
797/// \returns \p GenericOpc if the combination is unsupported.
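/// For example (illustrative):
/// \code
///   selectBinaryOp(TargetOpcode::G_SHL, AArch64::GPRRegBankID, 32);  // AArch64::LSLVWr
///   selectBinaryOp(TargetOpcode::G_FADD, AArch64::FPRRegBankID, 64); // AArch64::FADDDrr
/// \endcode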
798static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
799 unsigned OpSize) {
800 switch (RegBankID) {
801 case AArch64::GPRRegBankID:
802 if (OpSize == 32) {
803 switch (GenericOpc) {
804 case TargetOpcode::G_SHL:
805 return AArch64::LSLVWr;
806 case TargetOpcode::G_LSHR:
807 return AArch64::LSRVWr;
808 case TargetOpcode::G_ASHR:
809 return AArch64::ASRVWr;
810 default:
811 return GenericOpc;
812 }
813 } else if (OpSize == 64) {
814 switch (GenericOpc) {
815 case TargetOpcode::G_PTR_ADD:
816 return AArch64::ADDXrr;
817 case TargetOpcode::G_SHL:
818 return AArch64::LSLVXr;
819 case TargetOpcode::G_LSHR:
820 return AArch64::LSRVXr;
821 case TargetOpcode::G_ASHR:
822 return AArch64::ASRVXr;
823 default:
824 return GenericOpc;
825 }
826 }
827 break;
828 case AArch64::FPRRegBankID:
829 switch (OpSize) {
830 case 32:
831 switch (GenericOpc) {
832 case TargetOpcode::G_FADD:
833 return AArch64::FADDSrr;
834 case TargetOpcode::G_FSUB:
835 return AArch64::FSUBSrr;
836 case TargetOpcode::G_FMUL:
837 return AArch64::FMULSrr;
838 case TargetOpcode::G_FDIV:
839 return AArch64::FDIVSrr;
840 default:
841 return GenericOpc;
842 }
843 case 64:
844 switch (GenericOpc) {
845 case TargetOpcode::G_FADD:
846 return AArch64::FADDDrr;
847 case TargetOpcode::G_FSUB:
848 return AArch64::FSUBDrr;
849 case TargetOpcode::G_FMUL:
850 return AArch64::FMULDrr;
851 case TargetOpcode::G_FDIV:
852 return AArch64::FDIVDrr;
853 case TargetOpcode::G_OR:
854 return AArch64::ORRv8i8;
855 default:
856 return GenericOpc;
857 }
858 }
859 break;
860 }
861 return GenericOpc;
862}
863
864/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
865/// appropriate for the (value) register bank \p RegBankID and of memory access
866/// size \p OpSize. This returns the variant with the base+unsigned-immediate
867/// addressing mode (e.g., LDRXui).
868/// \returns \p GenericOpc if the combination is unsupported.
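/// For example (illustrative):
/// \code
///   selectLoadStoreUIOp(TargetOpcode::G_LOAD, AArch64::GPRRegBankID, 32);   // AArch64::LDRWui
///   selectLoadStoreUIOp(TargetOpcode::G_STORE, AArch64::FPRRegBankID, 128); // AArch64::STRQui
/// \endcode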
869static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
870 unsigned OpSize) {
871 const bool isStore = GenericOpc == TargetOpcode::G_STORE;
872 switch (RegBankID) {
873 case AArch64::GPRRegBankID:
874 switch (OpSize) {
875 case 8:
876 return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
877 case 16:
878 return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
879 case 32:
880 return isStore ? AArch64::STRWui : AArch64::LDRWui;
881 case 64:
882 return isStore ? AArch64::STRXui : AArch64::LDRXui;
883 }
884 break;
885 case AArch64::FPRRegBankID:
886 switch (OpSize) {
887 case 8:
888 return isStore ? AArch64::STRBui : AArch64::LDRBui;
889 case 16:
890 return isStore ? AArch64::STRHui : AArch64::LDRHui;
891 case 32:
892 return isStore ? AArch64::STRSui : AArch64::LDRSui;
893 case 64:
894 return isStore ? AArch64::STRDui : AArch64::LDRDui;
895 case 128:
896 return isStore ? AArch64::STRQui : AArch64::LDRQui;
897 }
898 break;
899 }
900 return GenericOpc;
901}
902
903/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
904/// to \p *To.
905///
906/// E.g "To = COPY SrcReg:SubReg"
907static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
908 const RegisterBankInfo &RBI, Register SrcReg,
909 const TargetRegisterClass *To, unsigned SubReg) {
910 assert(SrcReg.isValid() && "Expected a valid source register?");
911 assert(To && "Destination register class cannot be null");
912 assert(SubReg && "Expected a valid subregister");
913
914 MachineIRBuilder MIB(I);
915 auto SubRegCopy =
916 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {To}, SrcOps: {}).addReg(RegNo: SrcReg, flags: 0, SubReg);
917 MachineOperand &RegOp = I.getOperand(i: 1);
918 RegOp.setReg(SubRegCopy.getReg(Idx: 0));
919
920 // It's possible that the destination register won't be constrained. Make
921 // sure that happens.
922 if (!I.getOperand(i: 0).getReg().isPhysical())
923 RBI.constrainGenericRegister(Reg: I.getOperand(i: 0).getReg(), RC: *To, MRI);
924
925 return true;
926}
927
928/// Helper function to get the source and destination register classes for a
929/// copy. Returns a std::pair containing the source register class for the
930/// copy, and the destination register class for the copy. If a register class
931/// cannot be determined, then it will be nullptr.
932static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
933getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
934 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
935 const RegisterBankInfo &RBI) {
936 Register DstReg = I.getOperand(i: 0).getReg();
937 Register SrcReg = I.getOperand(i: 1).getReg();
938 const RegisterBank &DstRegBank = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
939 const RegisterBank &SrcRegBank = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
940 unsigned DstSize = RBI.getSizeInBits(Reg: DstReg, MRI, TRI);
941 unsigned SrcSize = RBI.getSizeInBits(Reg: SrcReg, MRI, TRI);
942
943 // Special casing for cross-bank copies of s1s. We can technically represent
944 // a 1-bit value with any size of register. The minimum size for a GPR is 32
945 // bits. So, we need to put the FPR on 32 bits as well.
946 //
947 // FIXME: I'm not sure if this case holds true outside of copies. If it does,
948 // then we can pull it into the helpers that get the appropriate class for a
949 // register bank. Or make a new helper that carries along some constraint
950 // information.
951 if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
952 SrcSize = DstSize = 32;
953
954 return {getMinClassForRegBank(RB: SrcRegBank, SizeInBits: SrcSize, GetAllRegSet: true),
955 getMinClassForRegBank(RB: DstRegBank, SizeInBits: DstSize, GetAllRegSet: true)};
956}
957
958// FIXME: We need some sort of API in RBI/TRI to allow generic code to
959// constrain operands of simple instructions given a TargetRegisterClass
960// and LLT
961static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
962 const RegisterBankInfo &RBI) {
963 for (MachineOperand &MO : I.operands()) {
964 if (!MO.isReg())
965 continue;
966 Register Reg = MO.getReg();
967 if (!Reg)
968 continue;
969 if (Reg.isPhysical())
970 continue;
971 LLT Ty = MRI.getType(Reg);
972 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
973 const TargetRegisterClass *RC =
974 RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
975 if (!RC) {
976 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
977 RC = getRegClassForTypeOnBank(Ty, RB);
978 if (!RC) {
979 LLVM_DEBUG(
980 dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
981 break;
982 }
983 }
984 RBI.constrainGenericRegister(Reg, RC: *RC, MRI);
985 }
986
987 return true;
988}
989
990static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
991 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
992 const RegisterBankInfo &RBI) {
993 Register DstReg = I.getOperand(i: 0).getReg();
994 Register SrcReg = I.getOperand(i: 1).getReg();
995 const RegisterBank &DstRegBank = *RBI.getRegBank(Reg: DstReg, MRI, TRI);
996 const RegisterBank &SrcRegBank = *RBI.getRegBank(Reg: SrcReg, MRI, TRI);
997
998 // Find the correct register classes for the source and destination registers.
999 const TargetRegisterClass *SrcRC;
1000 const TargetRegisterClass *DstRC;
1001 std::tie(args&: SrcRC, args&: DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
1002
1003 if (!DstRC) {
1004 LLVM_DEBUG(dbgs() << "Unexpected dest size "
1005 << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
1006 return false;
1007 }
1008
1009 // Is this a copy? If so, then we may need to insert a subregister copy.
1010 if (I.isCopy()) {
1011 // Yes. Check if there's anything to fix up.
1012 if (!SrcRC) {
1013 LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
1014 return false;
1015 }
1016
1017 unsigned SrcSize = TRI.getRegSizeInBits(RC: *SrcRC);
1018 unsigned DstSize = TRI.getRegSizeInBits(RC: *DstRC);
1019 unsigned SubReg;
1020
1021 // If the source bank doesn't support a subregister copy small enough,
1022 // then we first need to copy to the destination bank.
1023 if (getMinSizeForRegBank(RB: SrcRegBank) > DstSize) {
1024 const TargetRegisterClass *DstTempRC =
1025 getMinClassForRegBank(RB: DstRegBank, SizeInBits: SrcSize, /* GetAllRegSet */ true);
1026 getSubRegForClass(RC: DstRC, TRI, SubReg);
1027
1028 MachineIRBuilder MIB(I);
1029 auto Copy = MIB.buildCopy(Res: {DstTempRC}, Op: {SrcReg});
1030 copySubReg(I, MRI, RBI, SrcReg: Copy.getReg(Idx: 0), To: DstRC, SubReg);
1031 } else if (SrcSize > DstSize) {
1032 // If the source register is bigger than the destination we need to
1033 // perform a subregister copy.
1034 const TargetRegisterClass *SubRegRC =
1035 getMinClassForRegBank(RB: SrcRegBank, SizeInBits: DstSize, /* GetAllRegSet */ true);
1036 getSubRegForClass(RC: SubRegRC, TRI, SubReg);
1037 copySubReg(I, MRI, RBI, SrcReg, To: DstRC, SubReg);
1038 } else if (DstSize > SrcSize) {
1039 // If the destination register is bigger than the source we need to do
1040 // a promotion using SUBREG_TO_REG.
1041 const TargetRegisterClass *PromotionRC =
1042 getMinClassForRegBank(RB: SrcRegBank, SizeInBits: DstSize, /* GetAllRegSet */ true);
1043 getSubRegForClass(RC: SrcRC, TRI, SubReg);
1044
1045 Register PromoteReg = MRI.createVirtualRegister(RegClass: PromotionRC);
1046 BuildMI(*I.getParent(), I, I.getDebugLoc(),
1047 TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
1048 .addImm(0)
1049 .addUse(SrcReg)
1050 .addImm(SubReg);
1051 MachineOperand &RegOp = I.getOperand(i: 1);
1052 RegOp.setReg(PromoteReg);
1053 }
1054
1055 // If the destination is a physical register, then there's nothing to
1056 // change, so we're done.
1057 if (DstReg.isPhysical())
1058 return true;
1059 }
1060
1061 // No need to constrain SrcReg. It will get constrained when we hit another
1062 // of its uses or its defs. Copies do not have constraints.
1063 if (!RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI)) {
1064 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1065 << " operand\n");
1066 return false;
1067 }
1068
1069 // If this is a GPR ZEXT, we just want to reduce it down into a copy.
1070 // The sizes will be mismatched with the source < 32b, but that's ok.
1071 if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1072 I.setDesc(TII.get(AArch64::COPY));
1073 assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1074 return selectCopy(I, TII, MRI, TRI, RBI);
1075 }
1076
1077 I.setDesc(TII.get(AArch64::COPY));
1078 return true;
1079}
1080
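/// Select the AArch64 opcode for a scalar integer <-> floating point
/// conversion \p GenericOpc (G_SITOFP, G_UITOFP, G_FPTOSI or G_FPTOUI) given
/// the destination type \p DstTy and source type \p SrcTy.
/// \returns \p GenericOpc if the combination is unsupported.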
1081static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1082 if (!DstTy.isScalar() || !SrcTy.isScalar())
1083 return GenericOpc;
1084
1085 const unsigned DstSize = DstTy.getSizeInBits();
1086 const unsigned SrcSize = SrcTy.getSizeInBits();
1087
1088 switch (DstSize) {
1089 case 32:
1090 switch (SrcSize) {
1091 case 32:
1092 switch (GenericOpc) {
1093 case TargetOpcode::G_SITOFP:
1094 return AArch64::SCVTFUWSri;
1095 case TargetOpcode::G_UITOFP:
1096 return AArch64::UCVTFUWSri;
1097 case TargetOpcode::G_FPTOSI:
1098 return AArch64::FCVTZSUWSr;
1099 case TargetOpcode::G_FPTOUI:
1100 return AArch64::FCVTZUUWSr;
1101 default:
1102 return GenericOpc;
1103 }
1104 case 64:
1105 switch (GenericOpc) {
1106 case TargetOpcode::G_SITOFP:
1107 return AArch64::SCVTFUXSri;
1108 case TargetOpcode::G_UITOFP:
1109 return AArch64::UCVTFUXSri;
1110 case TargetOpcode::G_FPTOSI:
1111 return AArch64::FCVTZSUWDr;
1112 case TargetOpcode::G_FPTOUI:
1113 return AArch64::FCVTZUUWDr;
1114 default:
1115 return GenericOpc;
1116 }
1117 default:
1118 return GenericOpc;
1119 }
1120 case 64:
1121 switch (SrcSize) {
1122 case 32:
1123 switch (GenericOpc) {
1124 case TargetOpcode::G_SITOFP:
1125 return AArch64::SCVTFUWDri;
1126 case TargetOpcode::G_UITOFP:
1127 return AArch64::UCVTFUWDri;
1128 case TargetOpcode::G_FPTOSI:
1129 return AArch64::FCVTZSUXSr;
1130 case TargetOpcode::G_FPTOUI:
1131 return AArch64::FCVTZUUXSr;
1132 default:
1133 return GenericOpc;
1134 }
1135 case 64:
1136 switch (GenericOpc) {
1137 case TargetOpcode::G_SITOFP:
1138 return AArch64::SCVTFUXDri;
1139 case TargetOpcode::G_UITOFP:
1140 return AArch64::UCVTFUXDri;
1141 case TargetOpcode::G_FPTOSI:
1142 return AArch64::FCVTZSUXDr;
1143 case TargetOpcode::G_FPTOUI:
1144 return AArch64::FCVTZUUXDr;
1145 default:
1146 return GenericOpc;
1147 }
1148 default:
1149 return GenericOpc;
1150 }
1151 default:
1152 return GenericOpc;
1153 }
1154 return GenericOpc;
1155}
1156
1157MachineInstr *
1158AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1159 Register False, AArch64CC::CondCode CC,
1160 MachineIRBuilder &MIB) const {
1161 MachineRegisterInfo &MRI = *MIB.getMRI();
1162 assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1163 RBI.getRegBank(True, MRI, TRI)->getID() &&
1164 "Expected both select operands to have the same regbank?");
1165 LLT Ty = MRI.getType(Reg: True);
1166 if (Ty.isVector())
1167 return nullptr;
1168 const unsigned Size = Ty.getSizeInBits();
1169 assert((Size == 32 || Size == 64) &&
1170 "Expected 32 bit or 64 bit select only?");
1171 const bool Is32Bit = Size == 32;
1172 if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1173 unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1174 auto FCSel = MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {True, False}).addImm(Val: CC);
1175 constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1176 return &*FCSel;
1177 }
1178
1179 // By default, we'll try and emit a CSEL.
1180 unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1181 bool Optimized = false;
1182 auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1183 &Optimized](Register &Reg, Register &OtherReg,
1184 bool Invert) {
1185 if (Optimized)
1186 return false;
1187
1188 // Attempt to fold:
1189 //
1190 // %sub = G_SUB 0, %x
1191 // %select = G_SELECT cc, %reg, %sub
1192 //
1193 // Into:
1194 // %select = CSNEG %reg, %x, cc
1195 Register MatchReg;
1196 if (mi_match(R: Reg, MRI, P: m_Neg(Src: m_Reg(R&: MatchReg)))) {
1197 Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1198 Reg = MatchReg;
1199 if (Invert) {
1200 CC = AArch64CC::getInvertedCondCode(Code: CC);
1201 std::swap(a&: Reg, b&: OtherReg);
1202 }
1203 return true;
1204 }
1205
1206 // Attempt to fold:
1207 //
1208 // %xor = G_XOR %x, -1
1209 // %select = G_SELECT cc, %reg, %xor
1210 //
1211 // Into:
1212 // %select = CSINV %reg, %x, cc
1213 if (mi_match(R: Reg, MRI, P: m_Not(Src: m_Reg(R&: MatchReg)))) {
1214 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1215 Reg = MatchReg;
1216 if (Invert) {
1217 CC = AArch64CC::getInvertedCondCode(Code: CC);
1218 std::swap(a&: Reg, b&: OtherReg);
1219 }
1220 return true;
1221 }
1222
1223 // Attempt to fold:
1224 //
1225 // %add = G_ADD %x, 1
1226 // %select = G_SELECT cc, %reg, %add
1227 //
1228 // Into:
1229 // %select = CSINC %reg, %x, cc
1230 if (mi_match(R: Reg, MRI,
1231 P: m_any_of(preds: m_GAdd(L: m_Reg(R&: MatchReg), R: m_SpecificICst(RequestedValue: 1)),
1232 preds: m_GPtrAdd(L: m_Reg(R&: MatchReg), R: m_SpecificICst(RequestedValue: 1))))) {
1233 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1234 Reg = MatchReg;
1235 if (Invert) {
1236 CC = AArch64CC::getInvertedCondCode(Code: CC);
1237 std::swap(a&: Reg, b&: OtherReg);
1238 }
1239 return true;
1240 }
1241
1242 return false;
1243 };
1244
1245 // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1246 // true/false values are constants.
1247 // FIXME: All of these patterns already exist in tablegen. We should be
1248 // able to import these.
1249 auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1250 &Optimized]() {
1251 if (Optimized)
1252 return false;
1253 auto TrueCst = getIConstantVRegValWithLookThrough(VReg: True, MRI);
1254 auto FalseCst = getIConstantVRegValWithLookThrough(VReg: False, MRI);
1255 if (!TrueCst && !FalseCst)
1256 return false;
1257
1258 Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1259 if (TrueCst && FalseCst) {
1260 int64_t T = TrueCst->Value.getSExtValue();
1261 int64_t F = FalseCst->Value.getSExtValue();
1262
1263 if (T == 0 && F == 1) {
1264 // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1265 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1266 True = ZReg;
1267 False = ZReg;
1268 return true;
1269 }
1270
1271 if (T == 0 && F == -1) {
1272 // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
1273 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1274 True = ZReg;
1275 False = ZReg;
1276 return true;
1277 }
1278 }
1279
1280 if (TrueCst) {
1281 int64_t T = TrueCst->Value.getSExtValue();
1282 if (T == 1) {
1283 // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1284 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1285 True = False;
1286 False = ZReg;
1287 CC = AArch64CC::getInvertedCondCode(Code: CC);
1288 return true;
1289 }
1290
1291 if (T == -1) {
1292 // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1293 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1294 True = False;
1295 False = ZReg;
1296 CC = AArch64CC::getInvertedCondCode(Code: CC);
1297 return true;
1298 }
1299 }
1300
1301 if (FalseCst) {
1302 int64_t F = FalseCst->Value.getSExtValue();
1303 if (F == 1) {
1304 // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1305 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1306 False = ZReg;
1307 return true;
1308 }
1309
1310 if (F == -1) {
1311 // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
1312 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1313 False = ZReg;
1314 return true;
1315 }
1316 }
1317 return false;
1318 };
1319
1320 Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1321 Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1322 Optimized |= TryOptSelectCst();
1323 auto SelectInst = MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {True, False}).addImm(Val: CC);
1324 constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1325 return &*SelectInst;
1326}
1327
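/// Convert an IR integer condition code to an AArch64 CC.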
1328static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1329 switch (P) {
1330 default:
1331 llvm_unreachable("Unknown condition code!");
1332 case CmpInst::ICMP_NE:
1333 return AArch64CC::NE;
1334 case CmpInst::ICMP_EQ:
1335 return AArch64CC::EQ;
1336 case CmpInst::ICMP_SGT:
1337 return AArch64CC::GT;
1338 case CmpInst::ICMP_SGE:
1339 return AArch64CC::GE;
1340 case CmpInst::ICMP_SLT:
1341 return AArch64CC::LT;
1342 case CmpInst::ICMP_SLE:
1343 return AArch64CC::LE;
1344 case CmpInst::ICMP_UGT:
1345 return AArch64CC::HI;
1346 case CmpInst::ICMP_UGE:
1347 return AArch64CC::HS;
1348 case CmpInst::ICMP_ULT:
1349 return AArch64CC::LO;
1350 case CmpInst::ICMP_ULE:
1351 return AArch64CC::LS;
1352 }
1353}
1354
1355/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
1356static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
1357 AArch64CC::CondCode &CondCode,
1358 AArch64CC::CondCode &CondCode2) {
1359 CondCode2 = AArch64CC::AL;
1360 switch (CC) {
1361 default:
1362 llvm_unreachable("Unknown FP condition!");
1363 case CmpInst::FCMP_OEQ:
1364 CondCode = AArch64CC::EQ;
1365 break;
1366 case CmpInst::FCMP_OGT:
1367 CondCode = AArch64CC::GT;
1368 break;
1369 case CmpInst::FCMP_OGE:
1370 CondCode = AArch64CC::GE;
1371 break;
1372 case CmpInst::FCMP_OLT:
1373 CondCode = AArch64CC::MI;
1374 break;
1375 case CmpInst::FCMP_OLE:
1376 CondCode = AArch64CC::LS;
1377 break;
1378 case CmpInst::FCMP_ONE:
1379 CondCode = AArch64CC::MI;
1380 CondCode2 = AArch64CC::GT;
1381 break;
1382 case CmpInst::FCMP_ORD:
1383 CondCode = AArch64CC::VC;
1384 break;
1385 case CmpInst::FCMP_UNO:
1386 CondCode = AArch64CC::VS;
1387 break;
1388 case CmpInst::FCMP_UEQ:
1389 CondCode = AArch64CC::EQ;
1390 CondCode2 = AArch64CC::VS;
1391 break;
1392 case CmpInst::FCMP_UGT:
1393 CondCode = AArch64CC::HI;
1394 break;
1395 case CmpInst::FCMP_UGE:
1396 CondCode = AArch64CC::PL;
1397 break;
1398 case CmpInst::FCMP_ULT:
1399 CondCode = AArch64CC::LT;
1400 break;
1401 case CmpInst::FCMP_ULE:
1402 CondCode = AArch64CC::LE;
1403 break;
1404 case CmpInst::FCMP_UNE:
1405 CondCode = AArch64CC::NE;
1406 break;
1407 }
1408}
1409
1410/// Convert an IR fp condition code to an AArch64 CC.
1411/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1412/// should be AND'ed instead of OR'ed.
1413static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1414 AArch64CC::CondCode &CondCode,
1415 AArch64CC::CondCode &CondCode2) {
1416 CondCode2 = AArch64CC::AL;
1417 switch (CC) {
1418 default:
1419 changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1420 assert(CondCode2 == AArch64CC::AL);
1421 break;
1422 case CmpInst::FCMP_ONE:
1423 // (a one b)
1424 // == ((a olt b) || (a ogt b))
1425 // == ((a ord b) && (a une b))
1426 CondCode = AArch64CC::VC;
1427 CondCode2 = AArch64CC::NE;
1428 break;
1429 case CmpInst::FCMP_UEQ:
1430 // (a ueq b)
1431 // == ((a uno b) || (a oeq b))
1432 // == ((a ule b) && (a uge b))
1433 CondCode = AArch64CC::PL;
1434 CondCode2 = AArch64CC::LE;
1435 break;
1436 }
1437}
1438
1439/// Return a register which can be used as a bit to test in a TB(N)Z.
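/// For example (an illustrative MIR walk): if \p Reg is defined by
/// \code
///   %two:_(s64) = G_CONSTANT i64 2
///   %shifted:_(s64) = G_SHL %x, %two(s64)
/// \endcode
/// and \p Bit is 5, this returns %x and updates \p Bit to 3, since testing bit
/// 5 of (x << 2) is the same as testing bit 3 of x.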
1440static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1441 MachineRegisterInfo &MRI) {
1442 assert(Reg.isValid() && "Expected valid register!");
1443 bool HasZext = false;
1444 while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1445 unsigned Opc = MI->getOpcode();
1446
1447 if (!MI->getOperand(i: 0).isReg() ||
1448 !MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()))
1449 break;
1450
1451 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1452 //
1453 // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1454 // on the truncated x is the same as the bit number on x.
1455 if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1456 Opc == TargetOpcode::G_TRUNC) {
1457 if (Opc == TargetOpcode::G_ZEXT)
1458 HasZext = true;
1459
1460 Register NextReg = MI->getOperand(i: 1).getReg();
1461 // Did we find something worth folding?
1462 if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(RegNo: NextReg))
1463 break;
1464
1465 // NextReg is worth folding. Keep looking.
1466 Reg = NextReg;
1467 continue;
1468 }
1469
1470 // Attempt to find a suitable operation with a constant on one side.
1471 std::optional<uint64_t> C;
1472 Register TestReg;
1473 switch (Opc) {
1474 default:
1475 break;
1476 case TargetOpcode::G_AND:
1477 case TargetOpcode::G_XOR: {
1478 TestReg = MI->getOperand(i: 1).getReg();
1479 Register ConstantReg = MI->getOperand(i: 2).getReg();
1480 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
1481 if (!VRegAndVal) {
1482 // AND commutes, check the other side for a constant.
1483 // FIXME: Can we canonicalize the constant so that it's always on the
1484 // same side at some point earlier?
1485 std::swap(a&: ConstantReg, b&: TestReg);
1486 VRegAndVal = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
1487 }
1488 if (VRegAndVal) {
1489 if (HasZext)
1490 C = VRegAndVal->Value.getZExtValue();
1491 else
1492 C = VRegAndVal->Value.getSExtValue();
1493 }
1494 break;
1495 }
1496 case TargetOpcode::G_ASHR:
1497 case TargetOpcode::G_LSHR:
1498 case TargetOpcode::G_SHL: {
1499 TestReg = MI->getOperand(i: 1).getReg();
1500 auto VRegAndVal =
1501 getIConstantVRegValWithLookThrough(VReg: MI->getOperand(i: 2).getReg(), MRI);
1502 if (VRegAndVal)
1503 C = VRegAndVal->Value.getSExtValue();
1504 break;
1505 }
1506 }
1507
1508 // Didn't find a constant or viable register. Bail out of the loop.
1509 if (!C || !TestReg.isValid())
1510 break;
1511
1512 // We found a suitable instruction with a constant. Check to see if we can
1513 // walk through the instruction.
1514 Register NextReg;
1515 unsigned TestRegSize = MRI.getType(Reg: TestReg).getSizeInBits();
1516 switch (Opc) {
1517 default:
1518 break;
1519 case TargetOpcode::G_AND:
1520 // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1521 if ((*C >> Bit) & 1)
1522 NextReg = TestReg;
1523 break;
1524 case TargetOpcode::G_SHL:
1525 // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1526 // the type of the register.
1527 if (*C <= Bit && (Bit - *C) < TestRegSize) {
1528 NextReg = TestReg;
1529 Bit = Bit - *C;
1530 }
1531 break;
1532 case TargetOpcode::G_ASHR:
1533 // (tbz (ashr x, c), b) -> (tbz x, b+c), or (tbz x, msb) if b+c is >= the
1534 // number of bits in x.
1535 NextReg = TestReg;
1536 Bit = Bit + *C;
1537 if (Bit >= TestRegSize)
1538 Bit = TestRegSize - 1;
1539 break;
1540 case TargetOpcode::G_LSHR:
1541 // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1542 if ((Bit + *C) < TestRegSize) {
1543 NextReg = TestReg;
1544 Bit = Bit + *C;
1545 }
1546 break;
1547 case TargetOpcode::G_XOR:
1548 // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1549 // appropriate.
1550 //
1551 // e.g. If x' = xor x, c, and the b-th bit is set in c then
1552 //
1553 // tbz x', b -> tbnz x, b
1554 //
1555 // Because x' only has the b-th bit set if x does not.
1556 if ((*C >> Bit) & 1)
1557 Invert = !Invert;
1558 NextReg = TestReg;
1559 break;
1560 }
1561
1562 // Check if we found anything worth folding.
1563 if (!NextReg.isValid())
1564 return Reg;
1565 Reg = NextReg;
1566 }
1567
1568 return Reg;
1569}
1570
1571MachineInstr *AArch64InstructionSelector::emitTestBit(
1572 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1573 MachineIRBuilder &MIB) const {
1574 assert(TestReg.isValid());
1575 assert(ProduceNonFlagSettingCondBr &&
1576 "Cannot emit TB(N)Z with speculation tracking!");
1577 MachineRegisterInfo &MRI = *MIB.getMRI();
1578
1579 // Attempt to optimize the test bit by walking over instructions.
1580 TestReg = getTestBitReg(Reg: TestReg, Bit, Invert&: IsNegative, MRI);
1581 LLT Ty = MRI.getType(Reg: TestReg);
1582 unsigned Size = Ty.getSizeInBits();
1583 assert(!Ty.isVector() && "Expected a scalar!");
1584 assert(Bit < 64 && "Bit is too large!");
1585
  // TB(N)ZW can only encode bit numbers below 32, so tests of low bits use the
  // W-form and need a 32-bit register, while higher bits need the X-form and a
  // 64-bit register. Narrow or widen the test register if its size doesn't
  // match the form we picked.
1588 bool UseWReg = Bit < 32;
1589 unsigned NecessarySize = UseWReg ? 32 : 64;
1590 if (Size != NecessarySize)
1591 TestReg = moveScalarRegClass(
1592 TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1593 MIB);
1594
1595 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1596 {AArch64::TBZW, AArch64::TBNZW}};
1597 unsigned Opc = OpcTable[UseWReg][IsNegative];
1598 auto TestBitMI =
1599 MIB.buildInstr(Opcode: Opc).addReg(RegNo: TestReg).addImm(Val: Bit).addMBB(MBB: DstMBB);
1600 constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1601 return &*TestBitMI;
1602}
1603
1604bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1605 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1606 MachineIRBuilder &MIB) const {
1607 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1608 // Given something like this:
1609 //
1610 // %x = ...Something...
1611 // %one = G_CONSTANT i64 1
1612 // %zero = G_CONSTANT i64 0
1613 // %and = G_AND %x, %one
1614 // %cmp = G_ICMP intpred(ne), %and, %zero
1615 // %cmp_trunc = G_TRUNC %cmp
1616 // G_BRCOND %cmp_trunc, %bb.3
1617 //
1618 // We want to try and fold the AND into the G_BRCOND and produce either a
1619 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1620 //
1621 // In this case, we'd get
1622 //
1623 // TBNZ %x %bb.3
1624 //
1625
1626 // Check if the AND has a constant on its RHS which we can use as a mask.
1627 // If it's a power of 2, then it's the same as checking a specific bit.
  // (e.g., ANDing with 8 == ANDing with 0b1000 == testing if bit 3 is set)
1629 auto MaybeBit = getIConstantVRegValWithLookThrough(
1630 VReg: AndInst.getOperand(i: 2).getReg(), MRI: *MIB.getMRI());
1631 if (!MaybeBit)
1632 return false;
1633
1634 int32_t Bit = MaybeBit->Value.exactLogBase2();
1635 if (Bit < 0)
1636 return false;
1637
1638 Register TestReg = AndInst.getOperand(i: 1).getReg();
1639
1640 // Emit a TB(N)Z.
1641 emitTestBit(TestReg, Bit, IsNegative: Invert, DstMBB, MIB);
1642 return true;
1643}
1644
1645MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1646 bool IsNegative,
1647 MachineBasicBlock *DestMBB,
1648 MachineIRBuilder &MIB) const {
1649 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1650 MachineRegisterInfo &MRI = *MIB.getMRI();
1651 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1652 AArch64::GPRRegBankID &&
1653 "Expected GPRs only?");
1654 auto Ty = MRI.getType(Reg: CompareReg);
1655 unsigned Width = Ty.getSizeInBits();
1656 assert(!Ty.isVector() && "Expected scalar only?");
1657 assert(Width <= 64 && "Expected width to be at most 64?");
1658 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1659 {AArch64::CBNZW, AArch64::CBNZX}};
1660 unsigned Opc = OpcTable[IsNegative][Width == 64];
1661 auto BranchMI = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {CompareReg}).addMBB(MBB: DestMBB);
1662 constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1663 return &*BranchMI;
1664}
1665
1666bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1667 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1668 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1669 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
  // Unfortunately, the mapping of LLVM FP condition codes onto AArch64
  // condition codes isn't totally clean; some of them require two branches to
  // implement.
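  // For example, FCMP_ONE (ordered not-equal) needs both a B.MI and a B.GT,
  // and FCMP_UEQ (unordered equal) needs a B.EQ plus a B.VS; most predicates
  // map to a single condition code.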
1672 auto Pred = (CmpInst::Predicate)FCmp.getOperand(i: 1).getPredicate();
1673 emitFPCompare(LHS: FCmp.getOperand(i: 2).getReg(), RHS: FCmp.getOperand(i: 3).getReg(), MIRBuilder&: MIB,
1674 Pred);
1675 AArch64CC::CondCode CC1, CC2;
1676 changeFCMPPredToAArch64CC(P: static_cast<CmpInst::Predicate>(Pred), CondCode&: CC1, CondCode2&: CC2);
1677 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1678 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1679 if (CC2 != AArch64CC::AL)
1680 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1681 I.eraseFromParent();
1682 return true;
1683}
1684
1685bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1686 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1687 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1688 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1689 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1690 //
1691 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1692 // instructions will not be produced, as they are conditional branch
1693 // instructions that do not set flags.
1694 if (!ProduceNonFlagSettingCondBr)
1695 return false;
1696
1697 MachineRegisterInfo &MRI = *MIB.getMRI();
1698 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1699 auto Pred =
1700 static_cast<CmpInst::Predicate>(ICmp.getOperand(i: 1).getPredicate());
1701 Register LHS = ICmp.getOperand(i: 2).getReg();
1702 Register RHS = ICmp.getOperand(i: 3).getReg();
1703
1704 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1705 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
1706 MachineInstr *AndInst = getOpcodeDef(Opcode: TargetOpcode::G_AND, Reg: LHS, MRI);
1707
1708 // When we can emit a TB(N)Z, prefer that.
1709 //
1710 // Handle non-commutative condition codes first.
1711 // Note that we don't want to do this when we have a G_AND because it can
1712 // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1713 if (VRegAndVal && !AndInst) {
1714 int64_t C = VRegAndVal->Value.getSExtValue();
1715
    // For a signed greater-than comparison against -1 (i.e. x >= 0), we can
    // just test that the msb is zero.
1718 if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1719 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1720 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ false, DstMBB: DestMBB, MIB);
1721 I.eraseFromParent();
1722 return true;
1723 }
1724
    // For a signed less-than comparison against zero (x < 0), we can just test
    // that the msb is not zero.
1727 if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1728 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1729 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ true, DstMBB: DestMBB, MIB);
1730 I.eraseFromParent();
1731 return true;
1732 }
1733
    // Similarly, for a signed greater-than-or-equal comparison against zero,
    // we can test that the msb is zero.
1736 if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1737 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1738 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ false, DstMBB: DestMBB, MIB);
1739 I.eraseFromParent();
1740 return true;
1741 }
1742 }
1743
1744 // Attempt to handle commutative condition codes. Right now, that's only
1745 // eq/ne.
1746 if (ICmpInst::isEquality(P: Pred)) {
1747 if (!VRegAndVal) {
1748 std::swap(a&: RHS, b&: LHS);
1749 VRegAndVal = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
1750 AndInst = getOpcodeDef(Opcode: TargetOpcode::G_AND, Reg: LHS, MRI);
1751 }
1752
1753 if (VRegAndVal && VRegAndVal->Value == 0) {
1754 // If there's a G_AND feeding into this branch, try to fold it away by
1755 // emitting a TB(N)Z instead.
1756 //
1757 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1758 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1759 // would be redundant.
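      // For example, %and = G_AND %x, 4 with an eq-to-zero branch becomes
      // TBZ %x, #2, and with an ne-to-zero branch becomes TBNZ %x, #2.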
1760 if (AndInst &&
1761 tryOptAndIntoCompareBranch(
1762 AndInst&: *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DstMBB: DestMBB, MIB)) {
1763 I.eraseFromParent();
1764 return true;
1765 }
1766
1767 // Otherwise, try to emit a CB(N)Z instead.
1768 auto LHSTy = MRI.getType(Reg: LHS);
1769 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1770 emitCBZ(CompareReg: LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1771 I.eraseFromParent();
1772 return true;
1773 }
1774 }
1775 }
1776
1777 return false;
1778}
1779
1780bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1781 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1782 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1783 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1784 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1785 return true;
1786
1787 // Couldn't optimize. Emit a compare + a Bcc.
1788 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1789 auto PredOp = ICmp.getOperand(i: 1);
1790 emitIntegerCompare(LHS&: ICmp.getOperand(i: 2), RHS&: ICmp.getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
1791 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1792 P: static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1793 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1794 I.eraseFromParent();
1795 return true;
1796}
1797
1798bool AArch64InstructionSelector::selectCompareBranch(
1799 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1800 Register CondReg = I.getOperand(i: 0).getReg();
1801 MachineInstr *CCMI = MRI.getVRegDef(Reg: CondReg);
1802 // Try to select the G_BRCOND using whatever is feeding the condition if
1803 // possible.
1804 unsigned CCMIOpc = CCMI->getOpcode();
1805 if (CCMIOpc == TargetOpcode::G_FCMP)
1806 return selectCompareBranchFedByFCmp(I, FCmp&: *CCMI, MIB);
1807 if (CCMIOpc == TargetOpcode::G_ICMP)
1808 return selectCompareBranchFedByICmp(I, ICmp&: *CCMI, MIB);
1809
1810 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1811 // instructions will not be produced, as they are conditional branch
1812 // instructions that do not set flags.
1813 if (ProduceNonFlagSettingCondBr) {
1814 emitTestBit(TestReg: CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1815 DstMBB: I.getOperand(i: 1).getMBB(), MIB);
1816 I.eraseFromParent();
1817 return true;
1818 }
1819
1820 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1821 auto TstMI =
1822 MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1823 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1824 auto Bcc = MIB.buildInstr(AArch64::Bcc)
1825 .addImm(AArch64CC::NE)
1826 .addMBB(I.getOperand(1).getMBB());
1827 I.eraseFromParent();
1828 return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1829}
1830
1831/// Returns the element immediate value of a vector shift operand if found.
1832/// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1833static std::optional<int64_t> getVectorShiftImm(Register Reg,
1834 MachineRegisterInfo &MRI) {
1835 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1836 MachineInstr *OpMI = MRI.getVRegDef(Reg);
1837 return getAArch64VectorSplatScalar(MI: *OpMI, MRI);
1838}
1839
1840/// Matches and returns the shift immediate value for a SHL instruction given
1841/// a shift operand.
1842static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1843 MachineRegisterInfo &MRI) {
1844 std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1845 if (!ShiftImm)
1846 return std::nullopt;
1847 // Check the immediate is in range for a SHL.
1848 int64_t Imm = *ShiftImm;
1849 if (Imm < 0)
1850 return std::nullopt;
1851 switch (SrcTy.getElementType().getSizeInBits()) {
1852 default:
1853 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1854 return std::nullopt;
1855 case 8:
1856 if (Imm > 7)
1857 return std::nullopt;
1858 break;
1859 case 16:
1860 if (Imm > 15)
1861 return std::nullopt;
1862 break;
1863 case 32:
1864 if (Imm > 31)
1865 return std::nullopt;
1866 break;
1867 case 64:
1868 if (Imm > 63)
1869 return std::nullopt;
1870 break;
1871 }
1872 return Imm;
1873}
1874
1875bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1876 MachineRegisterInfo &MRI) {
1877 assert(I.getOpcode() == TargetOpcode::G_SHL);
1878 Register DstReg = I.getOperand(i: 0).getReg();
1879 const LLT Ty = MRI.getType(Reg: DstReg);
1880 Register Src1Reg = I.getOperand(i: 1).getReg();
1881 Register Src2Reg = I.getOperand(i: 2).getReg();
1882
1883 if (!Ty.isVector())
1884 return false;
1885
1886 // Check if we have a vector of constants on RHS that we can select as the
1887 // immediate form.
1888 std::optional<int64_t> ImmVal = getVectorSHLImm(SrcTy: Ty, Reg: Src2Reg, MRI);
1889
1890 unsigned Opc = 0;
1891 if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
1892 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1893 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
1894 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1895 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
1896 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1897 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16)) {
1898 Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1899 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16)) {
1900 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1901 } else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8)) {
1902 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1903 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8)) {
1904 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1905 } else {
1906 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1907 return false;
1908 }
1909
1910 auto Shl = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Src1Reg});
1911 if (ImmVal)
1912 Shl.addImm(Val: *ImmVal);
1913 else
1914 Shl.addUse(RegNo: Src2Reg);
1915 constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1916 I.eraseFromParent();
1917 return true;
1918}
1919
1920bool AArch64InstructionSelector::selectVectorAshrLshr(
1921 MachineInstr &I, MachineRegisterInfo &MRI) {
1922 assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1923 I.getOpcode() == TargetOpcode::G_LSHR);
1924 Register DstReg = I.getOperand(i: 0).getReg();
1925 const LLT Ty = MRI.getType(Reg: DstReg);
1926 Register Src1Reg = I.getOperand(i: 1).getReg();
1927 Register Src2Reg = I.getOperand(i: 2).getReg();
1928
1929 if (!Ty.isVector())
1930 return false;
1931
1932 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1933
  // We expect the immediate case to have been lowered in the post-legalizer
  // combiner to AArch64ISD::VASHR / AArch64ISD::VLSHR equivalents.
1936
  // There is no vector shift-right-by-register instruction; instead, the
  // shift-left-by-register instruction takes a signed shift amount, where
  // negative values specify a right shift.
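  // For example, a variable logical shift right of a v4s32 is selected as a
  // NEGv4i32 of the shift amounts followed by a USHLv4i32.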
1940
1941 unsigned Opc = 0;
1942 unsigned NegOpc = 0;
1943 const TargetRegisterClass *RC =
1944 getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
1945 if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
1946 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1947 NegOpc = AArch64::NEGv2i64;
1948 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
1949 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1950 NegOpc = AArch64::NEGv4i32;
1951 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
1952 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1953 NegOpc = AArch64::NEGv2i32;
1954 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16)) {
1955 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1956 NegOpc = AArch64::NEGv4i16;
1957 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16)) {
1958 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1959 NegOpc = AArch64::NEGv8i16;
1960 } else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8)) {
1961 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1962 NegOpc = AArch64::NEGv16i8;
1963 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8)) {
1964 Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1965 NegOpc = AArch64::NEGv8i8;
1966 } else {
1967 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1968 return false;
1969 }
1970
1971 auto Neg = MIB.buildInstr(Opc: NegOpc, DstOps: {RC}, SrcOps: {Src2Reg});
1972 constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1973 auto SShl = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Src1Reg, Neg});
1974 constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1975 I.eraseFromParent();
1976 return true;
1977}
1978
1979bool AArch64InstructionSelector::selectVaStartAAPCS(
1980 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1981 return false;
1982}
1983
1984bool AArch64InstructionSelector::selectVaStartDarwin(
1985 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1986 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1987 Register ListReg = I.getOperand(i: 0).getReg();
1988
1989 Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1990
1991 int FrameIdx = FuncInfo->getVarArgsStackIndex();
1992 if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
1993 CC: MF.getFunction().getCallingConv())) {
1994 FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
1995 ? FuncInfo->getVarArgsGPRIndex()
1996 : FuncInfo->getVarArgsStackIndex();
1997 }
1998
1999 auto MIB =
2000 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
2001 .addDef(ArgsAddrReg)
2002 .addFrameIndex(FrameIdx)
2003 .addImm(0)
2004 .addImm(0);
2005
2006 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2007
2008 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
2009 .addUse(ArgsAddrReg)
2010 .addUse(ListReg)
2011 .addImm(0)
2012 .addMemOperand(*I.memoperands_begin());
2013
2014 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2015 I.eraseFromParent();
2016 return true;
2017}
2018
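/// Materialize a 64-bit symbolic address (global or block address) for the
/// large code model as a MOVZ of the low 16-bit chunk (MO_G0) followed by
/// MOVKs of the G1, G2 and G3 chunks, with the final MOVK writing the
/// original destination register.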
2019void AArch64InstructionSelector::materializeLargeCMVal(
2020 MachineInstr &I, const Value *V, unsigned OpFlags) {
2021 MachineBasicBlock &MBB = *I.getParent();
2022 MachineFunction &MF = *MBB.getParent();
2023 MachineRegisterInfo &MRI = MF.getRegInfo();
2024
2025 auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
2026 MovZ->addOperand(MF, I.getOperand(i: 1));
2027 MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
2028 AArch64II::MO_NC);
2029 MovZ->addOperand(MF, MachineOperand::CreateImm(Val: 0));
2030 constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
2031
2032 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
2033 Register ForceDstReg) {
2034 Register DstReg = ForceDstReg
2035 ? ForceDstReg
2036 : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2037 auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
2038 if (auto *GV = dyn_cast<GlobalValue>(Val: V)) {
2039 MovI->addOperand(MF, MachineOperand::CreateGA(
2040 GV, Offset: MovZ->getOperand(1).getOffset(), TargetFlags: Flags));
2041 } else {
2042 MovI->addOperand(
2043 MF, MachineOperand::CreateBA(BA: cast<BlockAddress>(Val: V),
2044 Offset: MovZ->getOperand(1).getOffset(), TargetFlags: Flags));
2045 }
2046 MovI->addOperand(MF, MachineOperand::CreateImm(Val: Offset));
2047 constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
2048 return DstReg;
2049 };
2050 Register DstReg = BuildMovK(MovZ.getReg(0),
2051 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
2052 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
2053 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(i: 0).getReg());
2054}
2055
2056bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2057 MachineBasicBlock &MBB = *I.getParent();
2058 MachineFunction &MF = *MBB.getParent();
2059 MachineRegisterInfo &MRI = MF.getRegInfo();
2060
2061 switch (I.getOpcode()) {
2062 case TargetOpcode::G_STORE: {
2063 bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2064 MachineOperand &SrcOp = I.getOperand(i: 0);
2065 if (MRI.getType(Reg: SrcOp.getReg()).isPointer()) {
2066 // Allow matching with imported patterns for stores of pointers. Unlike
2067 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2068 // and constrain.
2069 auto Copy = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: SrcOp);
2070 Register NewSrc = Copy.getReg(Idx: 0);
2071 SrcOp.setReg(NewSrc);
2072 RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
2073 Changed = true;
2074 }
2075 return Changed;
2076 }
2077 case TargetOpcode::G_PTR_ADD:
2078 return convertPtrAddToAdd(I, MRI);
2079 case TargetOpcode::G_LOAD: {
2080 // For scalar loads of pointers, we try to convert the dest type from p0
2081 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2082 // conversion, this should be ok because all users should have been
2083 // selected already, so the type doesn't matter for them.
2084 Register DstReg = I.getOperand(i: 0).getReg();
2085 const LLT DstTy = MRI.getType(Reg: DstReg);
2086 if (!DstTy.isPointer())
2087 return false;
2088 MRI.setType(VReg: DstReg, Ty: LLT::scalar(SizeInBits: 64));
2089 return true;
2090 }
2091 case AArch64::G_DUP: {
2092 // Convert the type from p0 to s64 to help selection.
2093 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2094 if (!DstTy.isPointerVector())
2095 return false;
2096 auto NewSrc = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: I.getOperand(i: 1).getReg());
2097 MRI.setType(VReg: I.getOperand(i: 0).getReg(),
2098 Ty: DstTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: 64)));
2099 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2100 I.getOperand(i: 1).setReg(NewSrc.getReg(Idx: 0));
2101 return true;
2102 }
2103 case TargetOpcode::G_UITOFP:
2104 case TargetOpcode::G_SITOFP: {
    // If both source and destination regbanks are FPR, then convert the opcode
    // to G_SITOF / G_UITOF so that the importer can select it to an fpr
    // variant. Otherwise, it ends up matching an fpr/gpr variant and adding a
    // cross-bank copy.
2109 Register SrcReg = I.getOperand(i: 1).getReg();
2110 LLT SrcTy = MRI.getType(Reg: SrcReg);
2111 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2112 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2113 return false;
2114
2115 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2116 if (I.getOpcode() == TargetOpcode::G_SITOFP)
2117 I.setDesc(TII.get(AArch64::G_SITOF));
2118 else
2119 I.setDesc(TII.get(AArch64::G_UITOF));
2120 return true;
2121 }
2122 return false;
2123 }
2124 default:
2125 return false;
2126 }
2127}
2128
2129/// This lowering tries to look for G_PTR_ADD instructions and then converts
2130/// them to a standard G_ADD with a COPY on the source.
2131///
2132/// The motivation behind this is to expose the add semantics to the imported
2133/// tablegen patterns. We shouldn't need to check for uses being loads/stores,
/// because the selector works bottom up, uses before defs. By the time we end
/// up trying to select a G_PTR_ADD, we have already attempted to fold it into
/// the users' addressing modes and failed, so nothing is lost by converting.
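///
/// For example (illustrative MIR):
///   %dst:gpr(p0) = G_PTR_ADD %base, %off
/// becomes
///   %intbase:gpr(s64) = G_PTRTOINT %base
///   %dst:gpr(s64) = G_ADD %intbase, %off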
2137bool AArch64InstructionSelector::convertPtrAddToAdd(
2138 MachineInstr &I, MachineRegisterInfo &MRI) {
2139 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2140 Register DstReg = I.getOperand(i: 0).getReg();
2141 Register AddOp1Reg = I.getOperand(i: 1).getReg();
2142 const LLT PtrTy = MRI.getType(Reg: DstReg);
2143 if (PtrTy.getAddressSpace() != 0)
2144 return false;
2145
2146 const LLT CastPtrTy =
2147 PtrTy.isVector() ? LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) : LLT::scalar(SizeInBits: 64);
2148 auto PtrToInt = MIB.buildPtrToInt(Dst: CastPtrTy, Src: AddOp1Reg);
2149 // Set regbanks on the registers.
2150 if (PtrTy.isVector())
2151 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2152 else
2153 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2154
2155 // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2156 // %dst(intty) = G_ADD %intbase, off
2157 I.setDesc(TII.get(TargetOpcode::G_ADD));
2158 MRI.setType(VReg: DstReg, Ty: CastPtrTy);
2159 I.getOperand(i: 1).setReg(PtrToInt.getReg(Idx: 0));
2160 if (!select(I&: *PtrToInt)) {
2161 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2162 return false;
2163 }
2164
2165 // Also take the opportunity here to try to do some optimization.
2166 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2167 Register NegatedReg;
2168 if (!mi_match(R: I.getOperand(i: 2).getReg(), MRI, P: m_Neg(Src: m_Reg(R&: NegatedReg))))
2169 return true;
2170 I.getOperand(i: 2).setReg(NegatedReg);
2171 I.setDesc(TII.get(TargetOpcode::G_SUB));
2172 return true;
2173}
2174
2175bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2176 MachineRegisterInfo &MRI) {
2177 // We try to match the immediate variant of LSL, which is actually an alias
2178 // for a special case of UBFM. Otherwise, we fall back to the imported
2179 // selector which will match the register variant.
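  // For example, LSL x0, x1, #3 is UBFMXri x0, x1, #61, #60, since the LSL
  // alias encodes a left shift of N as immr = (64 - N) % 64 and imms = 63 - N.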
2180 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2181 const auto &MO = I.getOperand(i: 2);
2182 auto VRegAndVal = getIConstantVRegVal(VReg: MO.getReg(), MRI);
2183 if (!VRegAndVal)
2184 return false;
2185
2186 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2187 if (DstTy.isVector())
2188 return false;
2189 bool Is64Bit = DstTy.getSizeInBits() == 64;
2190 auto Imm1Fn = Is64Bit ? selectShiftA_64(Root: MO) : selectShiftA_32(Root: MO);
2191 auto Imm2Fn = Is64Bit ? selectShiftB_64(Root: MO) : selectShiftB_32(Root: MO);
2192
2193 if (!Imm1Fn || !Imm2Fn)
2194 return false;
2195
2196 auto NewI =
2197 MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2198 {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2199
2200 for (auto &RenderFn : *Imm1Fn)
2201 RenderFn(NewI);
2202 for (auto &RenderFn : *Imm2Fn)
2203 RenderFn(NewI);
2204
2205 I.eraseFromParent();
2206 return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2207}
2208
2209bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2210 MachineInstr &I, MachineRegisterInfo &MRI) {
2211 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2212 // If we're storing a scalar, it doesn't matter what register bank that
2213 // scalar is on. All that matters is the size.
2214 //
2215 // So, if we see something like this (with a 32-bit scalar as an example):
2216 //
2217 // %x:gpr(s32) = ... something ...
2218 // %y:fpr(s32) = COPY %x:gpr(s32)
2219 // G_STORE %y:fpr(s32)
2220 //
2221 // We can fix this up into something like this:
2222 //
2223 // G_STORE %x:gpr(s32)
2224 //
2225 // And then continue the selection process normally.
2226 Register DefDstReg = getSrcRegIgnoringCopies(Reg: I.getOperand(i: 0).getReg(), MRI);
2227 if (!DefDstReg.isValid())
2228 return false;
2229 LLT DefDstTy = MRI.getType(Reg: DefDstReg);
2230 Register StoreSrcReg = I.getOperand(i: 0).getReg();
2231 LLT StoreSrcTy = MRI.getType(Reg: StoreSrcReg);
2232
2233 // If we get something strange like a physical register, then we shouldn't
2234 // go any further.
2235 if (!DefDstTy.isValid())
2236 return false;
2237
2238 // Are the source and dst types the same size?
2239 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2240 return false;
2241
2242 if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2243 RBI.getRegBank(DefDstReg, MRI, TRI))
2244 return false;
2245
2246 // We have a cross-bank copy, which is entering a store. Let's fold it.
2247 I.getOperand(i: 0).setReg(DefDstReg);
2248 return true;
2249}
2250
2251bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2252 assert(I.getParent() && "Instruction should be in a basic block!");
2253 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2254
2255 MachineBasicBlock &MBB = *I.getParent();
2256 MachineFunction &MF = *MBB.getParent();
2257 MachineRegisterInfo &MRI = MF.getRegInfo();
2258
2259 switch (I.getOpcode()) {
2260 case AArch64::G_DUP: {
2261 // Before selecting a DUP instruction, check if it is better selected as a
2262 // MOV or load from a constant pool.
2263 Register Src = I.getOperand(i: 1).getReg();
2264 auto ValAndVReg = getAnyConstantVRegValWithLookThrough(VReg: Src, MRI);
2265 if (!ValAndVReg)
2266 return false;
2267 LLVMContext &Ctx = MF.getFunction().getContext();
2268 Register Dst = I.getOperand(i: 0).getReg();
2269 auto *CV = ConstantDataVector::getSplat(
2270 NumElts: MRI.getType(Reg: Dst).getNumElements(),
2271 Elt: ConstantInt::get(Ty: Type::getIntNTy(C&: Ctx, N: MRI.getType(Reg: Src).getSizeInBits()),
2272 V: ValAndVReg->Value));
2273 if (!emitConstantVector(Dst, CV, MIRBuilder&: MIB, MRI))
2274 return false;
2275 I.eraseFromParent();
2276 return true;
2277 }
2278 case TargetOpcode::G_SEXT:
2279 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2280 // over a normal extend.
2281 if (selectUSMovFromExtend(I, MRI))
2282 return true;
2283 return false;
2284 case TargetOpcode::G_BR:
2285 return false;
2286 case TargetOpcode::G_SHL:
2287 return earlySelectSHL(I, MRI);
2288 case TargetOpcode::G_CONSTANT: {
2289 bool IsZero = false;
2290 if (I.getOperand(i: 1).isCImm())
2291 IsZero = I.getOperand(i: 1).getCImm()->isZero();
2292 else if (I.getOperand(i: 1).isImm())
2293 IsZero = I.getOperand(i: 1).getImm() == 0;
2294
2295 if (!IsZero)
2296 return false;
2297
2298 Register DefReg = I.getOperand(i: 0).getReg();
2299 LLT Ty = MRI.getType(Reg: DefReg);
2300 if (Ty.getSizeInBits() == 64) {
2301 I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2302 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2303 } else if (Ty.getSizeInBits() == 32) {
2304 I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2305 RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2306 } else
2307 return false;
2308
2309 I.setDesc(TII.get(TargetOpcode::COPY));
2310 return true;
2311 }
2312
2313 case TargetOpcode::G_ADD: {
2314 // Check if this is being fed by a G_ICMP on either side.
2315 //
2316 // (cmp pred, x, y) + z
2317 //
2318 // In the above case, when the cmp is true, we increment z by 1. So, we can
2319 // fold the add into the cset for the cmp by using cinc.
2320 //
2321 // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
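    // For example, %a = G_ADD %z, (G_ICMP eq %x, %y) is selected as a compare
    // followed by CSINC %a, %z, %z, ne, which is the CINC %a, %z, eq alias.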
2322 Register AddDst = I.getOperand(i: 0).getReg();
2323 Register AddLHS = I.getOperand(i: 1).getReg();
2324 Register AddRHS = I.getOperand(i: 2).getReg();
2325 // Only handle scalars.
2326 LLT Ty = MRI.getType(Reg: AddLHS);
2327 if (Ty.isVector())
2328 return false;
2329 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2330 // bits.
2331 unsigned Size = Ty.getSizeInBits();
2332 if (Size != 32 && Size != 64)
2333 return false;
2334 auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2335 if (!MRI.hasOneNonDBGUse(RegNo: Reg))
2336 return nullptr;
2337 // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2338 // compare.
2339 if (Size == 32)
2340 return getOpcodeDef(Opcode: TargetOpcode::G_ICMP, Reg, MRI);
2341 // We model scalar compares using 32-bit destinations right now.
2342 // If it's a 64-bit compare, it'll have 64-bit sources.
2343 Register ZExt;
2344 if (!mi_match(R: Reg, MRI,
2345 P: m_OneNonDBGUse(SP: m_GZExt(Src: m_OneNonDBGUse(SP: m_Reg(R&: ZExt))))))
2346 return nullptr;
2347 auto *Cmp = getOpcodeDef(Opcode: TargetOpcode::G_ICMP, Reg: ZExt, MRI);
2348 if (!Cmp ||
2349 MRI.getType(Reg: Cmp->getOperand(i: 2).getReg()).getSizeInBits() != 64)
2350 return nullptr;
2351 return Cmp;
2352 };
2353 // Try to match
2354 // z + (cmp pred, x, y)
2355 MachineInstr *Cmp = MatchCmp(AddRHS);
2356 if (!Cmp) {
2357 // (cmp pred, x, y) + z
2358 std::swap(a&: AddLHS, b&: AddRHS);
2359 Cmp = MatchCmp(AddRHS);
2360 if (!Cmp)
2361 return false;
2362 }
2363 auto &PredOp = Cmp->getOperand(i: 1);
2364 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2365 const AArch64CC::CondCode InvCC =
2366 changeICMPPredToAArch64CC(P: CmpInst::getInversePredicate(pred: Pred));
2367 MIB.setInstrAndDebugLoc(I);
2368 emitIntegerCompare(/*LHS=*/Cmp->getOperand(i: 2),
2369 /*RHS=*/Cmp->getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
2370 emitCSINC(/*Dst=*/AddDst, /*Src =*/Src1: AddLHS, /*Src2=*/AddLHS, Pred: InvCC, MIRBuilder&: MIB);
2371 I.eraseFromParent();
2372 return true;
2373 }
2374 case TargetOpcode::G_OR: {
2375 // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2376 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
    // shifting and masking, which we can replace with a BFI (encoded as a BFM).
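    // For example, with Size = 32 and ShiftImm = 8,
    //   %dst = G_OR (G_SHL %shiftsrc, 8), (G_AND %masksrc, 0xff)
    // is selected as a 32-bit BFM with immr = 24 and imms = 23, i.e. the
    // BFI %dst, %shiftsrc, #8, #24 alias.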
2378 Register Dst = I.getOperand(i: 0).getReg();
2379 LLT Ty = MRI.getType(Reg: Dst);
2380
2381 if (!Ty.isScalar())
2382 return false;
2383
2384 unsigned Size = Ty.getSizeInBits();
2385 if (Size != 32 && Size != 64)
2386 return false;
2387
2388 Register ShiftSrc;
2389 int64_t ShiftImm;
2390 Register MaskSrc;
2391 int64_t MaskImm;
2392 if (!mi_match(
2393 R: Dst, MRI,
2394 P: m_GOr(L: m_OneNonDBGUse(SP: m_GShl(L: m_Reg(R&: ShiftSrc), R: m_ICst(Cst&: ShiftImm))),
2395 R: m_OneNonDBGUse(SP: m_GAnd(L: m_Reg(R&: MaskSrc), R: m_ICst(Cst&: MaskImm))))))
2396 return false;
2397
2398 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2399 return false;
2400
2401 int64_t Immr = Size - ShiftImm;
2402 int64_t Imms = Size - ShiftImm - 1;
2403 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2404 emitInstr(Opcode: Opc, DstOps: {Dst}, SrcOps: {MaskSrc, ShiftSrc, Immr, Imms}, MIRBuilder&: MIB);
2405 I.eraseFromParent();
2406 return true;
2407 }
2408 case TargetOpcode::G_FENCE: {
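    // A fence in the singlethread scope (operand 1 == 0) only needs a compiler
    // barrier. Otherwise, an acquire-only fence (ordering value 4) can use a
    // load-only barrier, DMB ISHLD (CRm = 0x9); anything stronger gets a full
    // DMB ISH (CRm = 0xb).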
2409 if (I.getOperand(i: 1).getImm() == 0)
2410 BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER));
2411 else
2412 BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB))
2413 .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
2414 I.eraseFromParent();
2415 return true;
2416 }
2417 default:
2418 return false;
2419 }
2420}
2421
2422bool AArch64InstructionSelector::select(MachineInstr &I) {
2423 assert(I.getParent() && "Instruction should be in a basic block!");
2424 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2425
2426 MachineBasicBlock &MBB = *I.getParent();
2427 MachineFunction &MF = *MBB.getParent();
2428 MachineRegisterInfo &MRI = MF.getRegInfo();
2429
2430 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2431 if (Subtarget->requiresStrictAlign()) {
2432 // We don't support this feature yet.
2433 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2434 return false;
2435 }
2436
2437 MIB.setInstrAndDebugLoc(I);
2438
2439 unsigned Opcode = I.getOpcode();
  // G_PHI requires the same handling as PHI.
2441 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2442 // Certain non-generic instructions also need some special handling.
2443
2444 if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
2445 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2446
2447 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2448 const Register DefReg = I.getOperand(i: 0).getReg();
2449 const LLT DefTy = MRI.getType(Reg: DefReg);
2450
2451 const RegClassOrRegBank &RegClassOrBank =
2452 MRI.getRegClassOrRegBank(Reg: DefReg);
2453
2454 const TargetRegisterClass *DefRC
2455 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2456 if (!DefRC) {
2457 if (!DefTy.isValid()) {
2458 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2459 return false;
2460 }
2461 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2462 DefRC = getRegClassForTypeOnBank(Ty: DefTy, RB);
2463 if (!DefRC) {
2464 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2465 return false;
2466 }
2467 }
2468
2469 I.setDesc(TII.get(TargetOpcode::PHI));
2470
2471 return RBI.constrainGenericRegister(Reg: DefReg, RC: *DefRC, MRI);
2472 }
2473
2474 if (I.isCopy())
2475 return selectCopy(I, TII, MRI, TRI, RBI);
2476
2477 if (I.isDebugInstr())
2478 return selectDebugInstr(I, MRI, RBI);
2479
2480 return true;
2481 }
2482
2483
2484 if (I.getNumOperands() != I.getNumExplicitOperands()) {
2485 LLVM_DEBUG(
2486 dbgs() << "Generic instruction has unexpected implicit operands\n");
2487 return false;
2488 }
2489
2490 // Try to do some lowering before we start instruction selecting. These
2491 // lowerings are purely transformations on the input G_MIR and so selection
2492 // must continue after any modification of the instruction.
2493 if (preISelLower(I)) {
2494 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2495 }
2496
  // There may be patterns that the imported (tablegen-erated) selector handles
  // suboptimally: it still selects them, so our custom C++ selection code later
  // never gets a chance to work on them. Therefore, attempt early selection
  // here to give certain selection routines priority over the imported ones.
2502 if (earlySelect(I))
2503 return true;
2504
2505 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
2506 return true;
2507
2508 LLT Ty =
2509 I.getOperand(i: 0).isReg() ? MRI.getType(Reg: I.getOperand(i: 0).getReg()) : LLT{};
2510
2511 switch (Opcode) {
2512 case TargetOpcode::G_SBFX:
2513 case TargetOpcode::G_UBFX: {
2514 static const unsigned OpcTable[2][2] = {
2515 {AArch64::UBFMWri, AArch64::UBFMXri},
2516 {AArch64::SBFMWri, AArch64::SBFMXri}};
2517 bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2518 unsigned Size = Ty.getSizeInBits();
2519 unsigned Opc = OpcTable[IsSigned][Size == 64];
2520 auto Cst1 =
2521 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 2).getReg(), MRI);
2522 assert(Cst1 && "Should have gotten a constant for src 1?");
2523 auto Cst2 =
2524 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 3).getReg(), MRI);
2525 assert(Cst2 && "Should have gotten a constant for src 2?");
2526 auto LSB = Cst1->Value.getZExtValue();
2527 auto Width = Cst2->Value.getZExtValue();
2528 auto BitfieldInst =
2529 MIB.buildInstr(Opc, DstOps: {I.getOperand(i: 0)}, SrcOps: {I.getOperand(i: 1)})
2530 .addImm(Val: LSB)
2531 .addImm(Val: LSB + Width - 1);
2532 I.eraseFromParent();
2533 return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2534 }
2535 case TargetOpcode::G_BRCOND:
2536 return selectCompareBranch(I, MF, MRI);
2537
2538 case TargetOpcode::G_BRINDIRECT: {
2539 I.setDesc(TII.get(AArch64::BR));
2540 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2541 }
2542
2543 case TargetOpcode::G_BRJT:
2544 return selectBrJT(I, MRI);
2545
2546 case AArch64::G_ADD_LOW: {
    // This op may have been separated from its ADRP companion by the localizer
    // or some other code motion pass. Given that many CPUs will try to
    // macro-fuse these operations anyway, select this into a MOVaddr pseudo
    // which will later be expanded into an ADRP + ADD pair after scheduling.
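    // A MOVaddr ultimately expands to, e.g., ADRP x0, sym followed by
    // ADD x0, x0, :lo12:sym.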
2551 MachineInstr *BaseMI = MRI.getVRegDef(Reg: I.getOperand(i: 1).getReg());
2552 if (BaseMI->getOpcode() != AArch64::ADRP) {
2553 I.setDesc(TII.get(AArch64::ADDXri));
2554 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2555 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2556 }
2557 assert(TM.getCodeModel() == CodeModel::Small &&
2558 "Expected small code model");
2559 auto Op1 = BaseMI->getOperand(i: 1);
2560 auto Op2 = I.getOperand(i: 2);
2561 auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2562 .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2563 Op1.getTargetFlags())
2564 .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2565 Op2.getTargetFlags());
2566 I.eraseFromParent();
2567 return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2568 }
2569
2570 case TargetOpcode::G_FCONSTANT:
2571 case TargetOpcode::G_CONSTANT: {
2572 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2573
2574 const LLT s8 = LLT::scalar(SizeInBits: 8);
2575 const LLT s16 = LLT::scalar(SizeInBits: 16);
2576 const LLT s32 = LLT::scalar(SizeInBits: 32);
2577 const LLT s64 = LLT::scalar(SizeInBits: 64);
2578 const LLT s128 = LLT::scalar(SizeInBits: 128);
2579 const LLT p0 = LLT::pointer(AddressSpace: 0, SizeInBits: 64);
2580
2581 const Register DefReg = I.getOperand(i: 0).getReg();
2582 const LLT DefTy = MRI.getType(Reg: DefReg);
2583 const unsigned DefSize = DefTy.getSizeInBits();
2584 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2585
2586 // FIXME: Redundant check, but even less readable when factored out.
2587 if (isFP) {
2588 if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2589 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2590 << " constant, expected: " << s16 << " or " << s32
2591 << " or " << s64 << " or " << s128 << '\n');
2592 return false;
2593 }
2594
2595 if (RB.getID() != AArch64::FPRRegBankID) {
2596 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2597 << " constant on bank: " << RB
2598 << ", expected: FPR\n");
2599 return false;
2600 }
2601
      // The 0.0 case is covered by tablegen. Reject it here so we can be sure
      // tablegen works correctly and isn't rescued by this code. However, 0.0
      // is not covered by tablegen for FP128, so we handle that case here.
2606 if (DefSize != 128 && I.getOperand(i: 1).getFPImm()->isExactlyValue(V: 0.0))
2607 return false;
2608 } else {
2609 // s32 and s64 are covered by tablegen.
2610 if (Ty != p0 && Ty != s8 && Ty != s16) {
2611 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2612 << " constant, expected: " << s32 << ", " << s64
2613 << ", or " << p0 << '\n');
2614 return false;
2615 }
2616
2617 if (RB.getID() != AArch64::GPRRegBankID) {
2618 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2619 << " constant on bank: " << RB
2620 << ", expected: GPR\n");
2621 return false;
2622 }
2623 }
2624
2625 if (isFP) {
2626 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(Ty: DefTy, RB);
      // For 16b and 128b values, and for 32b/64b immediates that aren't legal
      // fpimms, emit a constant pool load.
2628 switch (DefSize) {
2629 default:
2630 llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2631 case 32:
2632 case 64: {
2633 bool OptForSize = shouldOptForSize(MF: &MF);
2634 const auto &TLI = MF.getSubtarget().getTargetLowering();
2635 // If TLI says that this fpimm is illegal, then we'll expand to a
2636 // constant pool load.
2637 if (TLI->isFPImmLegal(I.getOperand(i: 1).getFPImm()->getValueAPF(),
2638 EVT::getFloatingPointVT(BitWidth: DefSize), ForCodeSize: OptForSize))
2639 break;
2640 [[fallthrough]];
2641 }
2642 case 16:
2643 case 128: {
2644 auto *FPImm = I.getOperand(i: 1).getFPImm();
2645 auto *LoadMI = emitLoadFromConstantPool(CPVal: FPImm, MIRBuilder&: MIB);
2646 if (!LoadMI) {
2647 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2648 return false;
2649 }
2650 MIB.buildCopy(Res: {DefReg}, Op: {LoadMI->getOperand(i: 0).getReg()});
2651 I.eraseFromParent();
2652 return RBI.constrainGenericRegister(Reg: DefReg, RC: FPRRC, MRI);
2653 }
2654 }
2655
2656 assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size");
      // Materialize the constant into a GPR with a normal MOV, then copy it to
      // the FPR destination (the copy may be selected to an FMOV).
2658 const Register DefGPRReg = MRI.createVirtualRegister(
2659 DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
2660 MachineOperand &RegOp = I.getOperand(i: 0);
2661 RegOp.setReg(DefGPRReg);
2662 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: I.getIterator()));
2663 MIB.buildCopy(Res: {DefReg}, Op: {DefGPRReg});
2664
2665 if (!RBI.constrainGenericRegister(Reg: DefReg, RC: FPRRC, MRI)) {
2666 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2667 return false;
2668 }
2669
2670 MachineOperand &ImmOp = I.getOperand(i: 1);
2671 // FIXME: Is going through int64_t always correct?
2672 ImmOp.ChangeToImmediate(
2673 ImmVal: ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2674 } else if (I.getOperand(i: 1).isCImm()) {
2675 uint64_t Val = I.getOperand(i: 1).getCImm()->getZExtValue();
2676 I.getOperand(i: 1).ChangeToImmediate(ImmVal: Val);
2677 } else if (I.getOperand(i: 1).isImm()) {
2678 uint64_t Val = I.getOperand(i: 1).getImm();
2679 I.getOperand(i: 1).ChangeToImmediate(ImmVal: Val);
2680 }
2681
2682 const unsigned MovOpc =
2683 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2684 I.setDesc(TII.get(MovOpc));
2685 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2686 return true;
2687 }
2688 case TargetOpcode::G_EXTRACT: {
2689 Register DstReg = I.getOperand(i: 0).getReg();
2690 Register SrcReg = I.getOperand(i: 1).getReg();
2691 LLT SrcTy = MRI.getType(Reg: SrcReg);
2692 LLT DstTy = MRI.getType(Reg: DstReg);
2693 (void)DstTy;
2694 unsigned SrcSize = SrcTy.getSizeInBits();
2695
2696 if (SrcTy.getSizeInBits() > 64) {
2697 // This should be an extract of an s128, which is like a vector extract.
2698 if (SrcTy.getSizeInBits() != 128)
2699 return false;
2700 // Only support extracting 64 bits from an s128 at the moment.
2701 if (DstTy.getSizeInBits() != 64)
2702 return false;
2703
2704 unsigned Offset = I.getOperand(i: 2).getImm();
2705 if (Offset % 64 != 0)
2706 return false;
2707
2708 // Check we have the right regbank always.
2709 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2710 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2711 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2712
2713 if (SrcRB.getID() == AArch64::GPRRegBankID) {
2714 auto NewI =
2715 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2716 .addUse(SrcReg, 0,
2717 Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2718 constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI,
2719 AArch64::GPR64RegClass, NewI->getOperand(0));
2720 I.eraseFromParent();
2721 return true;
2722 }
2723
2724 // Emit the same code as a vector extract.
2725 // Offset must be a multiple of 64.
2726 unsigned LaneIdx = Offset / 64;
2727 MachineInstr *Extract = emitExtractVectorElt(
2728 DstReg, DstRB, ScalarTy: LLT::scalar(SizeInBits: 64), VecReg: SrcReg, LaneIdx, MIRBuilder&: MIB);
2729 if (!Extract)
2730 return false;
2731 I.eraseFromParent();
2732 return true;
2733 }
2734
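    // A scalar G_EXTRACT is selected as a UBFM of the source; e.g. extracting
    // the s16 at bit offset 8 from an s32 becomes UBFMWri %dst, %src, 8, 23.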
2735 I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2736 MachineInstrBuilder(MF, I).addImm(Val: I.getOperand(i: 2).getImm() +
2737 Ty.getSizeInBits() - 1);
2738
2739 if (SrcSize < 64) {
2740 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2741 "unexpected G_EXTRACT types");
2742 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2743 }
2744
2745 DstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2746 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: I.getIterator()));
2747 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2748 .addReg(DstReg, 0, AArch64::sub_32);
2749 RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2750 AArch64::GPR32RegClass, MRI);
2751 I.getOperand(i: 0).setReg(DstReg);
2752
2753 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2754 }
2755
2756 case TargetOpcode::G_INSERT: {
2757 LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 2).getReg());
2758 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2759 unsigned DstSize = DstTy.getSizeInBits();
    // Larger inserts are vectors; same-size ones should be something else by
    // now (split up or turned into COPYs).
2762 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2763 return false;
2764
2765 I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2766 unsigned LSB = I.getOperand(i: 3).getImm();
2767 unsigned Width = MRI.getType(Reg: I.getOperand(i: 2).getReg()).getSizeInBits();
2768 I.getOperand(i: 3).setImm((DstSize - LSB) % DstSize);
2769 MachineInstrBuilder(MF, I).addImm(Val: Width - 1);
2770
2771 if (DstSize < 64) {
2772 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2773 "unexpected G_INSERT types");
2774 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2775 }
2776
2777 Register SrcReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2778 BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2779 TII.get(AArch64::SUBREG_TO_REG))
2780 .addDef(SrcReg)
2781 .addImm(0)
2782 .addUse(I.getOperand(2).getReg())
2783 .addImm(AArch64::sub_32);
2784 RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2785 AArch64::GPR32RegClass, MRI);
2786 I.getOperand(i: 2).setReg(SrcReg);
2787
2788 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2789 }
2790 case TargetOpcode::G_FRAME_INDEX: {
2791 // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2792 if (Ty != LLT::pointer(AddressSpace: 0, SizeInBits: 64)) {
2793 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2794 << ", expected: " << LLT::pointer(0, 64) << '\n');
2795 return false;
2796 }
2797 I.setDesc(TII.get(AArch64::ADDXri));
2798
2799 // MOs for a #0 shifted immediate.
2800 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2801 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2802
2803 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2804 }
2805
2806 case TargetOpcode::G_GLOBAL_VALUE: {
2807 auto GV = I.getOperand(i: 1).getGlobal();
2808 if (GV->isThreadLocal())
2809 return selectTLSGlobalValue(I, MRI);
2810
2811 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
2812 if (OpFlags & AArch64II::MO_GOT) {
2813 I.setDesc(TII.get(AArch64::LOADgot));
2814 I.getOperand(i: 1).setTargetFlags(OpFlags);
2815 } else if (TM.getCodeModel() == CodeModel::Large &&
2816 !TM.isPositionIndependent()) {
2817 // Materialize the global using movz/movk instructions.
2818 materializeLargeCMVal(I, V: GV, OpFlags);
2819 I.eraseFromParent();
2820 return true;
2821 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2822 I.setDesc(TII.get(AArch64::ADR));
2823 I.getOperand(i: 1).setTargetFlags(OpFlags);
2824 } else {
2825 I.setDesc(TII.get(AArch64::MOVaddr));
2826 I.getOperand(i: 1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2827 MachineInstrBuilder MIB(MF, I);
2828 MIB.addGlobalAddress(GV, Offset: I.getOperand(i: 1).getOffset(),
2829 TargetFlags: OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2830 }
2831 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2832 }
2833
2834 case TargetOpcode::G_ZEXTLOAD:
2835 case TargetOpcode::G_LOAD:
2836 case TargetOpcode::G_STORE: {
2837 GLoadStore &LdSt = cast<GLoadStore>(Val&: I);
2838 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2839 LLT PtrTy = MRI.getType(Reg: LdSt.getPointerReg());
2840
2841 if (PtrTy != LLT::pointer(AddressSpace: 0, SizeInBits: 64)) {
2842 LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2843 << ", expected: " << LLT::pointer(0, 64) << '\n');
2844 return false;
2845 }
2846
2847 uint64_t MemSizeInBytes = LdSt.getMemSize();
2848 unsigned MemSizeInBits = LdSt.getMemSizeInBits();
2849 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2850
2851 // Need special instructions for atomics that affect ordering.
2852 if (Order != AtomicOrdering::NotAtomic &&
2853 Order != AtomicOrdering::Unordered &&
2854 Order != AtomicOrdering::Monotonic) {
2855 assert(!isa<GZExtLoad>(LdSt));
2856 if (MemSizeInBytes > 64)
2857 return false;
2858
2859 if (isa<GLoad>(Val: LdSt)) {
2860 static constexpr unsigned LDAPROpcodes[] = {
2861 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
2862 static constexpr unsigned LDAROpcodes[] = {
2863 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
2864 ArrayRef<unsigned> Opcodes =
2865 STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
2866 ? LDAPROpcodes
2867 : LDAROpcodes;
2868 I.setDesc(TII.get(Opcodes[Log2_32(Value: MemSizeInBytes)]));
2869 } else {
2870 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2871 AArch64::STLRW, AArch64::STLRX};
2872 Register ValReg = LdSt.getReg(Idx: 0);
2873 if (MRI.getType(Reg: ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2874 // Emit a subreg copy of 32 bits.
2875 Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2876 MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
2877 .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
2878 I.getOperand(i: 0).setReg(NewVal);
2879 }
2880 I.setDesc(TII.get(Opcodes[Log2_32(Value: MemSizeInBytes)]));
2881 }
2882 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2883 return true;
2884 }
2885
2886#ifndef NDEBUG
2887 const Register PtrReg = LdSt.getPointerReg();
2888 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2889 // Check that the pointer register is valid.
2890 assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2891 "Load/Store pointer operand isn't a GPR");
2892 assert(MRI.getType(PtrReg).isPointer() &&
2893 "Load/Store pointer operand isn't a pointer");
2894#endif
2895
2896 const Register ValReg = LdSt.getReg(Idx: 0);
2897 const LLT ValTy = MRI.getType(Reg: ValReg);
2898 const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2899
2900 // The code below doesn't support truncating stores, so we need to split it
2901 // again.
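    // For example, when storing 32 bits of an s64 value, copy the 32-bit
    // subregister (sub_32 on GPR, ssub on FPR) and store the copy instead.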
2902 if (isa<GStore>(Val: LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2903 unsigned SubReg;
2904 LLT MemTy = LdSt.getMMO().getMemoryType();
2905 auto *RC = getRegClassForTypeOnBank(Ty: MemTy, RB);
2906 if (!getSubRegForClass(RC, TRI, SubReg))
2907 return false;
2908
2909 // Generate a subreg copy.
2910 auto Copy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {MemTy}, SrcOps: {})
2911 .addReg(RegNo: ValReg, flags: 0, SubReg)
2912 .getReg(Idx: 0);
2913 RBI.constrainGenericRegister(Reg: Copy, RC: *RC, MRI);
2914 LdSt.getOperand(i: 0).setReg(Copy);
2915 } else if (isa<GLoad>(Val: LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
      // If this is an any-extending load from the FPR bank, split it into a
      // regular load + extend.
2918 if (RB.getID() == AArch64::FPRRegBankID) {
2919 unsigned SubReg;
2920 LLT MemTy = LdSt.getMMO().getMemoryType();
2921 auto *RC = getRegClassForTypeOnBank(Ty: MemTy, RB);
2922 if (!getSubRegForClass(RC, TRI, SubReg))
2923 return false;
2924 Register OldDst = LdSt.getReg(Idx: 0);
2925 Register NewDst =
2926 MRI.createGenericVirtualRegister(Ty: LdSt.getMMO().getMemoryType());
2927 LdSt.getOperand(i: 0).setReg(NewDst);
2928 MRI.setRegBank(Reg: NewDst, RegBank: RB);
2929 // Generate a SUBREG_TO_REG to extend it.
2930 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: LdSt.getIterator()));
2931 MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
2932 .addImm(0)
2933 .addUse(NewDst)
2934 .addImm(SubReg);
2935 auto SubRegRC = getRegClassForTypeOnBank(Ty: MRI.getType(Reg: OldDst), RB);
2936 RBI.constrainGenericRegister(Reg: OldDst, RC: *SubRegRC, MRI);
2937 MIB.setInstr(LdSt);
2938 }
2939 }
2940
2941 // Helper lambda for partially selecting I. Either returns the original
2942 // instruction with an updated opcode, or a new instruction.
2943 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2944 bool IsStore = isa<GStore>(Val: I);
2945 const unsigned NewOpc =
2946 selectLoadStoreUIOp(GenericOpc: I.getOpcode(), RegBankID: RB.getID(), OpSize: MemSizeInBits);
2947 if (NewOpc == I.getOpcode())
2948 return nullptr;
2949 // Check if we can fold anything into the addressing mode.
2950 auto AddrModeFns =
2951 selectAddrModeIndexed(Root&: I.getOperand(i: 1), Size: MemSizeInBytes);
2952 if (!AddrModeFns) {
2953 // Can't fold anything. Use the original instruction.
2954 I.setDesc(TII.get(NewOpc));
2955 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2956 return &I;
2957 }
2958
2959 // Folded something. Create a new instruction and return it.
2960 auto NewInst = MIB.buildInstr(Opc: NewOpc, DstOps: {}, SrcOps: {}, Flags: I.getFlags());
2961 Register CurValReg = I.getOperand(i: 0).getReg();
2962 IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
2963 NewInst.cloneMemRefs(I);
2964 for (auto &Fn : *AddrModeFns)
2965 Fn(NewInst);
2966 I.eraseFromParent();
2967 return &*NewInst;
2968 };
2969
2970 MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
2971 if (!LoadStore)
2972 return false;
2973
2974 // If we're storing a 0, use WZR/XZR.
2975 if (Opcode == TargetOpcode::G_STORE) {
2976 auto CVal = getIConstantVRegValWithLookThrough(
2977 VReg: LoadStore->getOperand(i: 0).getReg(), MRI);
2978 if (CVal && CVal->Value == 0) {
2979 switch (LoadStore->getOpcode()) {
2980 case AArch64::STRWui:
2981 case AArch64::STRHHui:
2982 case AArch64::STRBBui:
2983 LoadStore->getOperand(0).setReg(AArch64::WZR);
2984 break;
2985 case AArch64::STRXui:
2986 LoadStore->getOperand(0).setReg(AArch64::XZR);
2987 break;
2988 }
2989 }
2990 }
2991
2992 if (IsZExtLoad) {
2993 // The zextload from a smaller type to i32 should be handled by the
2994 // importer.
2995 if (MRI.getType(Reg: LoadStore->getOperand(i: 0).getReg()).getSizeInBits() != 64)
2996 return false;
2997 // If we have a ZEXTLOAD then change the load's type to be a narrower reg
2998 // and zero_extend with SUBREG_TO_REG.
2999 Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3000 Register DstReg = LoadStore->getOperand(i: 0).getReg();
3001 LoadStore->getOperand(i: 0).setReg(LdReg);
3002
3003 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: LoadStore->getIterator()));
3004 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
3005 .addImm(0)
3006 .addUse(LdReg)
3007 .addImm(AArch64::sub_32);
3008 constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3009 return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
3010 MRI);
3011 }
3012 return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3013 }
3014
3015 case TargetOpcode::G_INDEXED_ZEXTLOAD:
3016 case TargetOpcode::G_INDEXED_SEXTLOAD:
3017 return selectIndexedExtLoad(I, MRI);
3018 case TargetOpcode::G_INDEXED_LOAD:
3019 return selectIndexedLoad(I, MRI);
3020 case TargetOpcode::G_INDEXED_STORE:
3021 return selectIndexedStore(I&: cast<GIndexedStore>(Val&: I), MRI);
3022
3023 case TargetOpcode::G_LSHR:
3024 case TargetOpcode::G_ASHR:
3025 if (MRI.getType(Reg: I.getOperand(i: 0).getReg()).isVector())
3026 return selectVectorAshrLshr(I, MRI);
3027 [[fallthrough]];
3028 case TargetOpcode::G_SHL:
3029 if (Opcode == TargetOpcode::G_SHL &&
3030 MRI.getType(Reg: I.getOperand(i: 0).getReg()).isVector())
3031 return selectVectorSHL(I, MRI);
3032
3033 // These shifts were legalized to have 64-bit shift amounts because we
3034 // want to take advantage of the selection patterns that assume the
3035 // immediates are s64s. However, selectBinaryOp will assume both operands
3036 // have the same bit size.
3037 {
3038 Register SrcReg = I.getOperand(i: 1).getReg();
3039 Register ShiftReg = I.getOperand(i: 2).getReg();
3040 const LLT ShiftTy = MRI.getType(Reg: ShiftReg);
3041 const LLT SrcTy = MRI.getType(Reg: SrcReg);
3042 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3043 ShiftTy.getSizeInBits() == 64) {
3044 assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3045 // Insert a subregister copy to implement a 64->32 trunc
3046 auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
3047 .addReg(ShiftReg, 0, AArch64::sub_32);
3048 MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
3049 I.getOperand(i: 2).setReg(Trunc.getReg(0));
3050 }
3051 }
3052 [[fallthrough]];
3053 case TargetOpcode::G_OR: {
3054 // Reject the various things we don't support yet.
3055 if (unsupportedBinOp(I, RBI, MRI, TRI))
3056 return false;
3057
3058 const unsigned OpSize = Ty.getSizeInBits();
3059
3060 const Register DefReg = I.getOperand(i: 0).getReg();
3061 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3062
3063 const unsigned NewOpc = selectBinaryOp(GenericOpc: I.getOpcode(), RegBankID: RB.getID(), OpSize);
3064 if (NewOpc == I.getOpcode())
3065 return false;
3066
3067 I.setDesc(TII.get(NewOpc));
3068 // FIXME: Should the type be always reset in setDesc?
3069
3070 // Now that we selected an opcode, we need to constrain the register
3071 // operands to use appropriate classes.
3072 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3073 }
3074
3075 case TargetOpcode::G_PTR_ADD: {
3076 emitADD(DefReg: I.getOperand(i: 0).getReg(), LHS&: I.getOperand(i: 1), RHS&: I.getOperand(i: 2), MIRBuilder&: MIB);
3077 I.eraseFromParent();
3078 return true;
3079 }
3080
3081 case TargetOpcode::G_SADDE:
3082 case TargetOpcode::G_UADDE:
3083 case TargetOpcode::G_SSUBE:
3084 case TargetOpcode::G_USUBE:
3085 case TargetOpcode::G_SADDO:
3086 case TargetOpcode::G_UADDO:
3087 case TargetOpcode::G_SSUBO:
3088 case TargetOpcode::G_USUBO:
3089 return selectOverflowOp(I, MRI);
3090
3091 case TargetOpcode::G_PTRMASK: {
3092 Register MaskReg = I.getOperand(i: 2).getReg();
3093 std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(VReg: MaskReg, MRI);
3094 // TODO: Implement arbitrary cases
3095 if (!MaskVal || !isShiftedMask_64(Value: *MaskVal))
3096 return false;
3097
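 // Select the shifted-mask constant as ANDXri, encoding the mask as a 64-bit
 // logical immediate.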
3098 uint64_t Mask = *MaskVal;
3099 I.setDesc(TII.get(AArch64::ANDXri));
3100 I.getOperand(i: 2).ChangeToImmediate(
3101 ImmVal: AArch64_AM::encodeLogicalImmediate(imm: Mask, regSize: 64));
3102
3103 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3104 }
3105 case TargetOpcode::G_PTRTOINT:
3106 case TargetOpcode::G_TRUNC: {
3107 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3108 const LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
3109
3110 const Register DstReg = I.getOperand(i: 0).getReg();
3111 const Register SrcReg = I.getOperand(i: 1).getReg();
3112
3113 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3114 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3115
3116 if (DstRB.getID() != SrcRB.getID()) {
3117 LLVM_DEBUG(
3118 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3119 return false;
3120 }
3121
3122 if (DstRB.getID() == AArch64::GPRRegBankID) {
3123 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(Ty: DstTy, RB: DstRB);
3124 if (!DstRC)
3125 return false;
3126
3127 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(Ty: SrcTy, RB: SrcRB);
3128 if (!SrcRC)
3129 return false;
3130
3131 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI) ||
3132 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI)) {
3133 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3134 return false;
3135 }
3136
3137 if (DstRC == SrcRC) {
3138 // Nothing to be done
3139 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(SizeInBits: 32) &&
3140 SrcTy == LLT::scalar(SizeInBits: 64)) {
3141 llvm_unreachable("TableGen can import this case");
3142 return false;
3143 } else if (DstRC == &AArch64::GPR32RegClass &&
3144 SrcRC == &AArch64::GPR64RegClass) {
3145 I.getOperand(1).setSubReg(AArch64::sub_32);
3146 } else {
3147 LLVM_DEBUG(
3148 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3149 return false;
3150 }
3151
3152 I.setDesc(TII.get(TargetOpcode::COPY));
3153 return true;
3154 } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3155 if (DstTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16) &&
3156 SrcTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
3157 I.setDesc(TII.get(AArch64::XTNv4i16));
3158 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3159 return true;
3160 }
3161
3162 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3163 MachineInstr *Extract = emitExtractVectorElt(
3164 DstReg, DstRB, ScalarTy: LLT::scalar(SizeInBits: DstTy.getSizeInBits()), VecReg: SrcReg, LaneIdx: 0, MIRBuilder&: MIB);
3165 if (!Extract)
3166 return false;
3167 I.eraseFromParent();
3168 return true;
3169 }
3170
3171 // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3172 if (Opcode == TargetOpcode::G_PTRTOINT) {
3173 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3174 I.setDesc(TII.get(TargetOpcode::COPY));
3175 return selectCopy(I, TII, MRI, TRI, RBI);
3176 }
3177 }
3178
3179 return false;
3180 }
3181
3182 case TargetOpcode::G_ANYEXT: {
3183 if (selectUSMovFromExtend(I, MRI))
3184 return true;
3185
3186 const Register DstReg = I.getOperand(i: 0).getReg();
3187 const Register SrcReg = I.getOperand(i: 1).getReg();
3188
3189 const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3190 if (RBDst.getID() != AArch64::GPRRegBankID) {
3191 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3192 << ", expected: GPR\n");
3193 return false;
3194 }
3195
3196 const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3197 if (RBSrc.getID() != AArch64::GPRRegBankID) {
3198 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3199 << ", expected: GPR\n");
3200 return false;
3201 }
3202
3203 const unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits();
3204
3205 if (DstSize == 0) {
3206 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3207 return false;
3208 }
3209
3210 if (DstSize != 64 && DstSize > 32) {
3211 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3212 << ", expected: 32 or 64\n");
3213 return false;
3214 }
3215 // At this point G_ANYEXT is just like a plain COPY, but we need to
3216 // explicitly form the 64-bit value when extending to 64 bits.
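 // SUBREG_TO_REG wraps the 32-bit source into a 64-bit register through the
 // sub_32 subregister, so the anyext needs no extra instruction.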
3217 if (DstSize > 32) {
3218 Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3219 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3220 .addDef(ExtSrc)
3221 .addImm(0)
3222 .addUse(SrcReg)
3223 .addImm(AArch64::sub_32);
3224 I.getOperand(i: 1).setReg(ExtSrc);
3225 }
3226 return selectCopy(I, TII, MRI, TRI, RBI);
3227 }
3228
3229 case TargetOpcode::G_ZEXT:
3230 case TargetOpcode::G_SEXT_INREG:
3231 case TargetOpcode::G_SEXT: {
3232 if (selectUSMovFromExtend(I, MRI))
3233 return true;
3234
3235 unsigned Opcode = I.getOpcode();
3236 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3237 const Register DefReg = I.getOperand(i: 0).getReg();
3238 Register SrcReg = I.getOperand(i: 1).getReg();
3239 const LLT DstTy = MRI.getType(Reg: DefReg);
3240 const LLT SrcTy = MRI.getType(Reg: SrcReg);
3241 unsigned DstSize = DstTy.getSizeInBits();
3242 unsigned SrcSize = SrcTy.getSizeInBits();
3243
3244 // SEXT_INREG has the same src reg size as dst, the size of the value to be
3245 // extended is encoded in the imm.
3246 if (Opcode == TargetOpcode::G_SEXT_INREG)
3247 SrcSize = I.getOperand(i: 2).getImm();
3248
3249 if (DstTy.isVector())
3250 return false; // Should be handled by imported patterns.
3251
3252 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3253 AArch64::GPRRegBankID &&
3254 "Unexpected ext regbank");
3255
3256 MachineInstr *ExtI;
3257
3258 // If we're extending the result of a load whose destination type is
3259 // smaller than 32 bits, then this zext is redundant: GPR32 is the smallest
3260 // GPR register on AArch64, and all smaller loads automatically zero-extend
3261 // the upper bits. E.g.
3262 // %v(s8) = G_LOAD %p, :: (load 1)
3263 // %v2(s32) = G_ZEXT %v(s8)
3264 if (!IsSigned) {
3265 auto *LoadMI = getOpcodeDef(Opcode: TargetOpcode::G_LOAD, Reg: SrcReg, MRI);
3266 bool IsGPR =
3267 RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3268 if (LoadMI && IsGPR) {
3269 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3270 unsigned BytesLoaded = MemOp->getSize();
3271 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3272 return selectCopy(I, TII, MRI, TRI, RBI);
3273 }
3274
3275 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3276 // + SUBREG_TO_REG.
3277 if (IsGPR && SrcSize == 32 && DstSize == 64) {
3278 Register SubregToRegSrc =
3279 MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3280 const Register ZReg = AArch64::WZR;
3281 MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg})
3282 .addImm(0);
3283
3284 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3285 .addImm(0)
3286 .addUse(SubregToRegSrc)
3287 .addImm(AArch64::sub_32);
3288
3289 if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3290 MRI)) {
3291 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3292 return false;
3293 }
3294
3295 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3296 MRI)) {
3297 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3298 return false;
3299 }
3300
3301 I.eraseFromParent();
3302 return true;
3303 }
3304 }
3305
3306 if (DstSize == 64) {
3307 if (Opcode != TargetOpcode::G_SEXT_INREG) {
3308 // FIXME: Can we avoid manually doing this?
3309 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3310 MRI)) {
3311 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3312 << " operand\n");
3313 return false;
3314 }
3315 SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3316 {&AArch64::GPR64RegClass}, {})
3317 .addImm(0)
3318 .addUse(SrcReg)
3319 .addImm(AArch64::sub_32)
3320 .getReg(0);
3321 }
3322
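 // A [SU]BFM with immr = 0 and imms = SrcSize - 1 copies the low SrcSize bits
 // and sign/zero-fills the rest, i.e. the SXTB/SXTH/SXTW-style extend.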
3323 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3324 {DefReg}, {SrcReg})
3325 .addImm(0)
3326 .addImm(SrcSize - 1);
3327 } else if (DstSize <= 32) {
3328 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3329 {DefReg}, {SrcReg})
3330 .addImm(0)
3331 .addImm(SrcSize - 1);
3332 } else {
3333 return false;
3334 }
3335
3336 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3337 I.eraseFromParent();
3338 return true;
3339 }
3340
3341 case TargetOpcode::G_SITOFP:
3342 case TargetOpcode::G_UITOFP:
3343 case TargetOpcode::G_FPTOSI:
3344 case TargetOpcode::G_FPTOUI: {
3345 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg()),
3346 SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
3347 const unsigned NewOpc = selectFPConvOpc(GenericOpc: Opcode, DstTy, SrcTy);
3348 if (NewOpc == Opcode)
3349 return false;
3350
3351 I.setDesc(TII.get(NewOpc));
3352 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3353 I.setFlags(MachineInstr::NoFPExcept);
3354
3355 return true;
3356 }
3357
3358 case TargetOpcode::G_FREEZE:
3359 return selectCopy(I, TII, MRI, TRI, RBI);
3360
3361 case TargetOpcode::G_INTTOPTR:
3362 // The importer is currently unable to import pointer types since they
3363 // didn't exist in SelectionDAG.
3364 return selectCopy(I, TII, MRI, TRI, RBI);
3365
3366 case TargetOpcode::G_BITCAST:
3367 // Imported SelectionDAG rules can handle every bitcast except those that
3368 // bitcast from a type to the same type. Ideally, these shouldn't occur
3369 // but we might not run an optimizer that deletes them. The other exception
3370 // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3371 // of them.
3372 return selectCopy(I, TII, MRI, TRI, RBI);
3373
3374 case TargetOpcode::G_SELECT: {
3375 auto &Sel = cast<GSelect>(Val&: I);
3376 const Register CondReg = Sel.getCondReg();
3377 const Register TReg = Sel.getTrueReg();
3378 const Register FReg = Sel.getFalseReg();
3379
3380 if (tryOptSelect(Sel))
3381 return true;
3382
3383 // Make sure to use an unused vreg instead of wzr, so that the peephole
3384 // optimizations will be able to optimize these.
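 // The ANDS of the condition with 1 sets NZCV; emitSelect then keys off NE,
 // i.e. condition != 0.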
3385 Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3386 auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3387 .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3388 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3389 if (!emitSelect(Dst: Sel.getReg(Idx: 0), True: TReg, False: FReg, CC: AArch64CC::NE, MIB))
3390 return false;
3391 Sel.eraseFromParent();
3392 return true;
3393 }
3394 case TargetOpcode::G_ICMP: {
3395 if (Ty.isVector())
3396 return selectVectorICmp(I, MRI);
3397
3398 if (Ty != LLT::scalar(SizeInBits: 32)) {
3399 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3400 << ", expected: " << LLT::scalar(32) << '\n');
3401 return false;
3402 }
3403
3404 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(i: 1).getPredicate());
3405 const AArch64CC::CondCode InvCC =
3406 changeICMPPredToAArch64CC(P: CmpInst::getInversePredicate(pred: Pred));
3407 emitIntegerCompare(LHS&: I.getOperand(i: 2), RHS&: I.getOperand(i: 3), Predicate&: I.getOperand(i: 1), MIRBuilder&: MIB);
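 // CSINC wzr, wzr with the inverted condition is CSET: it produces 1 exactly
 // when the original predicate holds.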
3408 emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3409 /*Src2=*/AArch64::WZR, InvCC, MIB);
3410 I.eraseFromParent();
3411 return true;
3412 }
3413
3414 case TargetOpcode::G_FCMP: {
3415 CmpInst::Predicate Pred =
3416 static_cast<CmpInst::Predicate>(I.getOperand(i: 1).getPredicate());
3417 if (!emitFPCompare(LHS: I.getOperand(i: 2).getReg(), RHS: I.getOperand(i: 3).getReg(), MIRBuilder&: MIB,
3418 Pred) ||
3419 !emitCSetForFCmp(Dst: I.getOperand(i: 0).getReg(), Pred, MIRBuilder&: MIB))
3420 return false;
3421 I.eraseFromParent();
3422 return true;
3423 }
3424 case TargetOpcode::G_VASTART:
3425 return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3426 : selectVaStartAAPCS(I, MF, MRI);
3427 case TargetOpcode::G_INTRINSIC:
3428 return selectIntrinsic(I, MRI);
3429 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3430 return selectIntrinsicWithSideEffects(I, MRI);
3431 case TargetOpcode::G_IMPLICIT_DEF: {
3432 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3433 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3434 const Register DstReg = I.getOperand(i: 0).getReg();
3435 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3436 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(Ty: DstTy, RB: DstRB);
3437 RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI);
3438 return true;
3439 }
3440 case TargetOpcode::G_BLOCK_ADDR: {
3441 if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
3442 materializeLargeCMVal(I, V: I.getOperand(i: 1).getBlockAddress(), OpFlags: 0);
3443 I.eraseFromParent();
3444 return true;
3445 } else {
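 // MOVaddrBA is a pseudo that is later expanded to an ADRP + ADD of the block
 // address's page and page offset.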
3446 I.setDesc(TII.get(AArch64::MOVaddrBA));
3447 auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3448 I.getOperand(0).getReg())
3449 .addBlockAddress(I.getOperand(1).getBlockAddress(),
3450 /* Offset */ 0, AArch64II::MO_PAGE)
3451 .addBlockAddress(
3452 I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3453 AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3454 I.eraseFromParent();
3455 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3456 }
3457 }
3458 case AArch64::G_DUP: {
3459 // When the scalar of G_DUP is an s8/s16 gpr, it can't be selected by the
3460 // imported patterns, so do it manually here. Avoiding generating an s16 gpr
3461 // is difficult because at register bank selection we may end up pessimizing
3462 // the fpr case if we decide to add an anyextend to fix this. Manual
3463 // selection is the most robust solution for now.
3464 if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3465 AArch64::GPRRegBankID)
3466 return false; // We expect the fpr regbank case to be imported.
3467 LLT VecTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3468 if (VecTy == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8))
3469 I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3470 else if (VecTy == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8))
3471 I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3472 else if (VecTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16))
3473 I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3474 else if (VecTy == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16))
3475 I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3476 else
3477 return false;
3478 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3479 }
3480 case TargetOpcode::G_BUILD_VECTOR:
3481 return selectBuildVector(I, MRI);
3482 case TargetOpcode::G_MERGE_VALUES:
3483 return selectMergeValues(I, MRI);
3484 case TargetOpcode::G_UNMERGE_VALUES:
3485 return selectUnmergeValues(I, MRI);
3486 case TargetOpcode::G_SHUFFLE_VECTOR:
3487 return selectShuffleVector(I, MRI);
3488 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3489 return selectExtractElt(I, MRI);
3490 case TargetOpcode::G_INSERT_VECTOR_ELT:
3491 return selectInsertElt(I, MRI);
3492 case TargetOpcode::G_CONCAT_VECTORS:
3493 return selectConcatVectors(I, MRI);
3494 case TargetOpcode::G_JUMP_TABLE:
3495 return selectJumpTable(I, MRI);
3496 case TargetOpcode::G_MEMCPY:
3497 case TargetOpcode::G_MEMCPY_INLINE:
3498 case TargetOpcode::G_MEMMOVE:
3499 case TargetOpcode::G_MEMSET:
3500 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3501 return selectMOPS(I, MRI);
3502 }
3503
3504 return false;
3505}
3506
3507bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) {
3508 MachineIRBuilderState OldMIBState = MIB.getState();
3509 bool Success = select(I);
3510 MIB.setState(OldMIBState);
3511 return Success;
3512}
3513
3514bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3515 MachineRegisterInfo &MRI) {
3516 unsigned Mopcode;
3517 switch (GI.getOpcode()) {
3518 case TargetOpcode::G_MEMCPY:
3519 case TargetOpcode::G_MEMCPY_INLINE:
3520 Mopcode = AArch64::MOPSMemoryCopyPseudo;
3521 break;
3522 case TargetOpcode::G_MEMMOVE:
3523 Mopcode = AArch64::MOPSMemoryMovePseudo;
3524 break;
3525 case TargetOpcode::G_MEMSET:
3526 // For tagged memset see llvm.aarch64.mops.memset.tag
3527 Mopcode = AArch64::MOPSMemorySetPseudo;
3528 break;
3529 }
3530
3531 auto &DstPtr = GI.getOperand(i: 0);
3532 auto &SrcOrVal = GI.getOperand(i: 1);
3533 auto &Size = GI.getOperand(i: 2);
3534
3535 // Create copies of the registers that can be clobbered.
3536 const Register DstPtrCopy = MRI.cloneVirtualRegister(VReg: DstPtr.getReg());
3537 const Register SrcValCopy = MRI.cloneVirtualRegister(VReg: SrcOrVal.getReg());
3538 const Register SizeCopy = MRI.cloneVirtualRegister(VReg: Size.getReg());
3539
3540 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3541 const auto &SrcValRegClass =
3542 IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3543
3544 // Constrain to specific registers
3545 RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3546 RBI.constrainGenericRegister(Reg: SrcValCopy, RC: SrcValRegClass, MRI);
3547 RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3548
3549 MIB.buildCopy(Res: DstPtrCopy, Op: DstPtr);
3550 MIB.buildCopy(Res: SrcValCopy, Op: SrcOrVal);
3551 MIB.buildCopy(Res: SizeCopy, Op: Size);
3552
3553 // The new instruction uses the copied registers because it must update
3554 // them. The defs are not used since they don't exist in G_MEM*, but they
3555 // are still tied.
3556 // Note: the operand order differs from G_MEMSET, G_MEMCPY and G_MEMMOVE.
3557 Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3558 Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3559 if (IsSet) {
3560 MIB.buildInstr(Opc: Mopcode, DstOps: {DefDstPtr, DefSize},
3561 SrcOps: {DstPtrCopy, SizeCopy, SrcValCopy});
3562 } else {
3563 Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3564 MIB.buildInstr(Opc: Mopcode, DstOps: {DefDstPtr, DefSrcPtr, DefSize},
3565 SrcOps: {DstPtrCopy, SrcValCopy, SizeCopy});
3566 }
3567
3568 GI.eraseFromParent();
3569 return true;
3570}
3571
3572bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3573 MachineRegisterInfo &MRI) {
3574 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3575 Register JTAddr = I.getOperand(i: 0).getReg();
3576 unsigned JTI = I.getOperand(i: 1).getIndex();
3577 Register Index = I.getOperand(i: 2).getReg();
3578
3579 Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3580 Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3581
3582 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(Idx: JTI, Size: 4, PCRelSym: nullptr);
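 // JumpTableDest32 loads the 4-byte table entry for Index (matching the entry
 // size recorded above) and computes the branch target; it also defs a scratch
 // register used during expansion.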
3583 auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3584 {TargetReg, ScratchReg}, {JTAddr, Index})
3585 .addJumpTableIndex(JTI);
3586 // Save the jump table info.
3587 MIB.buildInstr(Opc: TargetOpcode::JUMP_TABLE_DEBUG_INFO, DstOps: {},
3588 SrcOps: {static_cast<int64_t>(JTI)});
3589 // Build the indirect branch.
3590 MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3591 I.eraseFromParent();
3592 return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3593}
3594
3595bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3596 MachineRegisterInfo &MRI) {
3597 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3598 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3599
3600 Register DstReg = I.getOperand(i: 0).getReg();
3601 unsigned JTI = I.getOperand(i: 1).getIndex();
3602 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3603 auto MovMI =
3604 MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3605 .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3606 .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3607 I.eraseFromParent();
3608 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3609}
3610
3611bool AArch64InstructionSelector::selectTLSGlobalValue(
3612 MachineInstr &I, MachineRegisterInfo &MRI) {
3613 if (!STI.isTargetMachO())
3614 return false;
3615 MachineFunction &MF = *I.getParent()->getParent();
3616 MF.getFrameInfo().setAdjustsStack(true);
3617
3618 const auto &GlobalOp = I.getOperand(i: 1);
3619 assert(GlobalOp.getOffset() == 0 &&
3620 "Shouldn't have an offset on TLS globals!");
3621 const GlobalValue &GV = *GlobalOp.getGlobal();
3622
3623 auto LoadGOT =
3624 MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3625 .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3626
3627 auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3628 {LoadGOT.getReg(0)})
3629 .addImm(0);
3630
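 // On Darwin the first field of a TLV descriptor is its accessor function;
 // call it with X0 pointing at the descriptor and the variable's address is
 // returned in X0.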
3631 MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3632 // TLS calls preserve all registers except those that absolutely must be
3633 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3634 // silly).
3635 MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3636 .addUse(AArch64::X0, RegState::Implicit)
3637 .addDef(AArch64::X0, RegState::Implicit)
3638 .addRegMask(TRI.getTLSCallPreservedMask());
3639
3640 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3641 RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3642 MRI);
3643 I.eraseFromParent();
3644 return true;
3645}
3646
3647bool AArch64InstructionSelector::selectVectorICmp(
3648 MachineInstr &I, MachineRegisterInfo &MRI) {
3649 Register DstReg = I.getOperand(i: 0).getReg();
3650 LLT DstTy = MRI.getType(Reg: DstReg);
3651 Register SrcReg = I.getOperand(i: 2).getReg();
3652 Register Src2Reg = I.getOperand(i: 3).getReg();
3653 LLT SrcTy = MRI.getType(Reg: SrcReg);
3654
3655 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3656 unsigned NumElts = DstTy.getNumElements();
3657
3658 // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3659 // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3660 // Third index is cc opcode:
3661 // 0 == eq
3662 // 1 == ugt
3663 // 2 == uge
3664 // 3 == ult
3665 // 4 == ule
3666 // 5 == sgt
3667 // 6 == sge
3668 // 7 == slt
3669 // 8 == sle
3670 // ne is done by negating 'eq' result.
3671
3672 // The table below assumes that for some comparisons the operands will be
3673 // commuted.
3674 // ult op == commute + ugt op
3675 // ule op == commute + uge op
3676 // slt op == commute + sgt op
3677 // sle op == commute + sge op
3678 unsigned PredIdx = 0;
3679 bool SwapOperands = false;
3680 CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(i: 1).getPredicate();
3681 switch (Pred) {
3682 case CmpInst::ICMP_NE:
3683 case CmpInst::ICMP_EQ:
3684 PredIdx = 0;
3685 break;
3686 case CmpInst::ICMP_UGT:
3687 PredIdx = 1;
3688 break;
3689 case CmpInst::ICMP_UGE:
3690 PredIdx = 2;
3691 break;
3692 case CmpInst::ICMP_ULT:
3693 PredIdx = 3;
3694 SwapOperands = true;
3695 break;
3696 case CmpInst::ICMP_ULE:
3697 PredIdx = 4;
3698 SwapOperands = true;
3699 break;
3700 case CmpInst::ICMP_SGT:
3701 PredIdx = 5;
3702 break;
3703 case CmpInst::ICMP_SGE:
3704 PredIdx = 6;
3705 break;
3706 case CmpInst::ICMP_SLT:
3707 PredIdx = 7;
3708 SwapOperands = true;
3709 break;
3710 case CmpInst::ICMP_SLE:
3711 PredIdx = 8;
3712 SwapOperands = true;
3713 break;
3714 default:
3715 llvm_unreachable("Unhandled icmp predicate");
3716 return false;
3717 }
3718
3719 // This table obviously should be tablegen'd when we have our GISel native
3720 // tablegen selector.
3721
3722 static const unsigned OpcTable[4][4][9] = {
3723 {
3724 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3725 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3726 0 /* invalid */},
3727 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3728 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3729 0 /* invalid */},
3730 {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3731 AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3732 AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3733 {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3734 AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3735 AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3736 },
3737 {
3738 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3739 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3740 0 /* invalid */},
3741 {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3742 AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3743 AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3744 {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3745 AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3746 AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3747 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3748 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3749 0 /* invalid */}
3750 },
3751 {
3752 {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3753 AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3754 AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3755 {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3756 AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3757 AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3758 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3759 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3760 0 /* invalid */},
3761 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3762 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3763 0 /* invalid */}
3764 },
3765 {
3766 {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3767 AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3768 AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3769 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3770 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3771 0 /* invalid */},
3772 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3773 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3774 0 /* invalid */},
3775 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3776 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3777 0 /* invalid */}
3778 },
3779 };
3780 unsigned EltIdx = Log2_32(Value: SrcEltSize / 8);
3781 unsigned NumEltsIdx = Log2_32(Value: NumElts / 2);
3782 unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3783 if (!Opc) {
3784 LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
3785 return false;
3786 }
3787
3788 const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3789 const TargetRegisterClass *SrcRC =
3790 getRegClassForTypeOnBank(Ty: SrcTy, RB: VecRB, GetAllRegSet: true);
3791 if (!SrcRC) {
3792 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3793 return false;
3794 }
3795
3796 unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3797 if (SrcTy.getSizeInBits() == 128)
3798 NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3799
3800 if (SwapOperands)
3801 std::swap(a&: SrcReg, b&: Src2Reg);
3802
3803 auto Cmp = MIB.buildInstr(Opc, DstOps: {SrcRC}, SrcOps: {SrcReg, Src2Reg});
3804 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3805
3806 // Invert if we had a 'ne' cc.
3807 if (NotOpc) {
3808 Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3809 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3810 } else {
3811 MIB.buildCopy(Res: DstReg, Op: Cmp.getReg(0));
3812 }
3813 RBI.constrainGenericRegister(Reg: DstReg, RC: *SrcRC, MRI);
3814 I.eraseFromParent();
3815 return true;
3816}
3817
3818MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3819 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3820 MachineIRBuilder &MIRBuilder) const {
3821 auto Undef = MIRBuilder.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {DstRC}, SrcOps: {});
3822
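 // Insert the scalar into the low lane of an otherwise-undefined vector by
 // writing it to the bsub/hsub/ssub/dsub subregister with INSERT_SUBREG.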
3823 auto BuildFn = [&](unsigned SubregIndex) {
3824 auto Ins =
3825 MIRBuilder
3826 .buildInstr(Opc: TargetOpcode::INSERT_SUBREG, DstOps: {DstRC}, SrcOps: {Undef, Scalar})
3827 .addImm(Val: SubregIndex);
3828 constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3829 constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3830 return &*Ins;
3831 };
3832
3833 switch (EltSize) {
3834 case 8:
3835 return BuildFn(AArch64::bsub);
3836 case 16:
3837 return BuildFn(AArch64::hsub);
3838 case 32:
3839 return BuildFn(AArch64::ssub);
3840 case 64:
3841 return BuildFn(AArch64::dsub);
3842 default:
3843 return nullptr;
3844 }
3845}
3846
3847MachineInstr *
3848AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
3849 MachineIRBuilder &MIB,
3850 MachineRegisterInfo &MRI) const {
3851 LLT DstTy = MRI.getType(Reg: DstReg);
3852 const TargetRegisterClass *RC =
3853 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
3854 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
3855 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
3856 return nullptr;
3857 }
3858 unsigned SubReg = 0;
3859 if (!getSubRegForClass(RC, TRI, SubReg))
3860 return nullptr;
3861 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
3862 LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
3863 << DstTy.getSizeInBits() << ")\n");
3864 return nullptr;
3865 }
3866 auto Copy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {})
3867 .addReg(RegNo: SrcReg, flags: 0, SubReg);
3868 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
3869 return Copy;
3870}
3871
3872bool AArch64InstructionSelector::selectMergeValues(
3873 MachineInstr &I, MachineRegisterInfo &MRI) {
3874 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3875 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3876 const LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
3877 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3878 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(i: 1).getReg(), MRI, TRI);
3879
3880 if (I.getNumOperands() != 3)
3881 return false;
3882
3883 // Merging 2 s64s into an s128.
3884 if (DstTy == LLT::scalar(SizeInBits: 128)) {
3885 if (SrcTy.getSizeInBits() != 64)
3886 return false;
3887 Register DstReg = I.getOperand(i: 0).getReg();
3888 Register Src1Reg = I.getOperand(i: 1).getReg();
3889 Register Src2Reg = I.getOperand(i: 2).getReg();
3890 auto Tmp = MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {DstTy}, SrcOps: {});
3891 MachineInstr *InsMI = emitLaneInsert(DstReg: std::nullopt, SrcReg: Tmp.getReg(Idx: 0), EltReg: Src1Reg,
3892 /* LaneIdx */ 0, RB, MIRBuilder&: MIB);
3893 if (!InsMI)
3894 return false;
3895 MachineInstr *Ins2MI = emitLaneInsert(DstReg, SrcReg: InsMI->getOperand(i: 0).getReg(),
3896 EltReg: Src2Reg, /* LaneIdx */ 1, RB, MIRBuilder&: MIB);
3897 if (!Ins2MI)
3898 return false;
3899 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3900 constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3901 I.eraseFromParent();
3902 return true;
3903 }
3904
3905 if (RB.getID() != AArch64::GPRRegBankID)
3906 return false;
3907
3908 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3909 return false;
3910
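 // Merge two s32s into an s64: widen both with SUBREG_TO_REG, then BFM the
 // second operand into bits [63:32] of the first (immr = 32, imms = 31).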
3911 auto *DstRC = &AArch64::GPR64RegClass;
3912 Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3913 MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3914 TII.get(TargetOpcode::SUBREG_TO_REG))
3915 .addDef(SubToRegDef)
3916 .addImm(0)
3917 .addUse(I.getOperand(1).getReg())
3918 .addImm(AArch64::sub_32);
3919 Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3920 // Need to anyext the second scalar before we can use bfm
3921 MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3922 TII.get(TargetOpcode::SUBREG_TO_REG))
3923 .addDef(SubToRegDef2)
3924 .addImm(0)
3925 .addUse(I.getOperand(2).getReg())
3926 .addImm(AArch64::sub_32);
3927 MachineInstr &BFM =
3928 *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3929 .addDef(I.getOperand(0).getReg())
3930 .addUse(SubToRegDef)
3931 .addUse(SubToRegDef2)
3932 .addImm(32)
3933 .addImm(31);
3934 constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
3935 constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
3936 constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
3937 I.eraseFromParent();
3938 return true;
3939}
3940
3941static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3942 const unsigned EltSize) {
3943 // Choose a lane copy opcode and subregister based off of the size of the
3944 // vector's elements.
3945 switch (EltSize) {
3946 case 8:
3947 CopyOpc = AArch64::DUPi8;
3948 ExtractSubReg = AArch64::bsub;
3949 break;
3950 case 16:
3951 CopyOpc = AArch64::DUPi16;
3952 ExtractSubReg = AArch64::hsub;
3953 break;
3954 case 32:
3955 CopyOpc = AArch64::DUPi32;
3956 ExtractSubReg = AArch64::ssub;
3957 break;
3958 case 64:
3959 CopyOpc = AArch64::DUPi64;
3960 ExtractSubReg = AArch64::dsub;
3961 break;
3962 default:
3963 // Unknown size, bail out.
3964 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
3965 return false;
3966 }
3967 return true;
3968}
3969
3970MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
3971 std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
3972 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
3973 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3974 unsigned CopyOpc = 0;
3975 unsigned ExtractSubReg = 0;
3976 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, EltSize: ScalarTy.getSizeInBits())) {
3977 LLVM_DEBUG(
3978 dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
3979 return nullptr;
3980 }
3981
3982 const TargetRegisterClass *DstRC =
3983 getRegClassForTypeOnBank(Ty: ScalarTy, RB: DstRB, GetAllRegSet: true);
3984 if (!DstRC) {
3985 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
3986 return nullptr;
3987 }
3988
3989 const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
3990 const LLT &VecTy = MRI.getType(Reg: VecReg);
3991 const TargetRegisterClass *VecRC =
3992 getRegClassForTypeOnBank(Ty: VecTy, RB: VecRB, GetAllRegSet: true);
3993 if (!VecRC) {
3994 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3995 return nullptr;
3996 }
3997
3998 // The register that we're going to copy into.
3999 Register InsertReg = VecReg;
4000 if (!DstReg)
4001 DstReg = MRI.createVirtualRegister(RegClass: DstRC);
4002 // If the lane index is 0, we just use a subregister COPY.
4003 if (LaneIdx == 0) {
4004 auto Copy = MIRBuilder.buildInstr(Opc: TargetOpcode::COPY, DstOps: {*DstReg}, SrcOps: {})
4005 .addReg(RegNo: VecReg, flags: 0, SubReg: ExtractSubReg);
4006 RBI.constrainGenericRegister(Reg: *DstReg, RC: *DstRC, MRI);
4007 return &*Copy;
4008 }
4009
4010 // Lane copies require 128-bit wide registers. If we're dealing with an
4011 // unpacked vector, then we need to move up to that width. Insert an implicit
4012 // def and a subregister insert to get us there.
4013 if (VecTy.getSizeInBits() != 128) {
4014 MachineInstr *ScalarToVector = emitScalarToVector(
4015 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
4016 if (!ScalarToVector)
4017 return nullptr;
4018 InsertReg = ScalarToVector->getOperand(i: 0).getReg();
4019 }
4020
4021 MachineInstr *LaneCopyMI =
4022 MIRBuilder.buildInstr(Opc: CopyOpc, DstOps: {*DstReg}, SrcOps: {InsertReg}).addImm(Val: LaneIdx);
4023 constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
4024
4025 // Make sure that we actually constrain the initial copy.
4026 RBI.constrainGenericRegister(Reg: *DstReg, RC: *DstRC, MRI);
4027 return LaneCopyMI;
4028}
4029
4030bool AArch64InstructionSelector::selectExtractElt(
4031 MachineInstr &I, MachineRegisterInfo &MRI) {
4032 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4033 "unexpected opcode!");
4034 Register DstReg = I.getOperand(i: 0).getReg();
4035 const LLT NarrowTy = MRI.getType(Reg: DstReg);
4036 const Register SrcReg = I.getOperand(i: 1).getReg();
4037 const LLT WideTy = MRI.getType(Reg: SrcReg);
4038 (void)WideTy;
4039 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4040 "source register size too small!");
4041 assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4042
4043 // Need the lane index to determine the correct copy opcode.
4044 MachineOperand &LaneIdxOp = I.getOperand(i: 2);
4045 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4046
4047 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4048 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4049 return false;
4050 }
4051
4052 // Find the index to extract from.
4053 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: LaneIdxOp.getReg(), MRI);
4054 if (!VRegAndVal)
4055 return false;
4056 unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4057
4058
4059 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4060 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, ScalarTy: NarrowTy, VecReg: SrcReg,
4061 LaneIdx, MIRBuilder&: MIB);
4062 if (!Extract)
4063 return false;
4064
4065 I.eraseFromParent();
4066 return true;
4067}
4068
4069bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4070 MachineInstr &I, MachineRegisterInfo &MRI) {
4071 unsigned NumElts = I.getNumOperands() - 1;
4072 Register SrcReg = I.getOperand(i: NumElts).getReg();
4073 const LLT NarrowTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
4074 const LLT SrcTy = MRI.getType(Reg: SrcReg);
4075
4076 assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4077 if (SrcTy.getSizeInBits() > 128) {
4078 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4079 return false;
4080 }
4081
4082 // We implement a split vector operation by treating the sub-vectors as
4083 // scalars and extracting them.
4084 const RegisterBank &DstRB =
4085 *RBI.getRegBank(I.getOperand(i: 0).getReg(), MRI, TRI);
4086 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4087 Register Dst = I.getOperand(i: OpIdx).getReg();
4088 MachineInstr *Extract =
4089 emitExtractVectorElt(DstReg: Dst, DstRB, ScalarTy: NarrowTy, VecReg: SrcReg, LaneIdx: OpIdx, MIRBuilder&: MIB);
4090 if (!Extract)
4091 return false;
4092 }
4093 I.eraseFromParent();
4094 return true;
4095}
4096
4097bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4098 MachineRegisterInfo &MRI) {
4099 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4100 "unexpected opcode");
4101
4102 // TODO: Handle unmerging into GPRs and from scalars to scalars.
4103 if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4104 AArch64::FPRRegBankID ||
4105 RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4106 AArch64::FPRRegBankID) {
4107 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4108 "currently unsupported.\n");
4109 return false;
4110 }
4111
4112 // The last operand is the vector source register, and every other operand is
4113 // a register to unpack into.
4114 unsigned NumElts = I.getNumOperands() - 1;
4115 Register SrcReg = I.getOperand(i: NumElts).getReg();
4116 const LLT NarrowTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
4117 const LLT WideTy = MRI.getType(Reg: SrcReg);
4118 (void)WideTy;
4119 assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4120 "can only unmerge from vector or s128 types!");
4121 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4122 "source register size too small!");
4123
4124 if (!NarrowTy.isScalar())
4125 return selectSplitVectorUnmerge(I, MRI);
4126
4127 // Choose a lane copy opcode and subregister based off of the size of the
4128 // vector's elements.
4129 unsigned CopyOpc = 0;
4130 unsigned ExtractSubReg = 0;
4131 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, EltSize: NarrowTy.getSizeInBits()))
4132 return false;
4133
4134 // Set up for the lane copies.
4135 MachineBasicBlock &MBB = *I.getParent();
4136
4137 // Stores the registers we'll be copying from.
4138 SmallVector<Register, 4> InsertRegs;
4139
4140 // We'll use the first register twice, so we only need NumElts-1 registers.
4141 unsigned NumInsertRegs = NumElts - 1;
4142
4143 // If our elements fit into exactly 128 bits, then we can copy from the source
4144 // directly. Otherwise, we need to do a bit of setup with some subregister
4145 // inserts.
4146 if (NarrowTy.getSizeInBits() * NumElts == 128) {
4147 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4148 } else {
4149 // No. We have to perform subregister inserts. For each insert, create an
4150 // implicit def and a subregister insert, and save the register we create.
4151 const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4152 LLT::fixed_vector(NumElements: NumElts, ScalarSizeInBits: WideTy.getScalarSizeInBits()),
4153 *RBI.getRegBank(SrcReg, MRI, TRI));
4154 unsigned SubReg = 0;
4155 bool Found = getSubRegForClass(RC, TRI, SubReg);
4156 (void)Found;
4157 assert(Found && "expected to find last operand's subreg idx");
4158 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4159 Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4160 MachineInstr &ImpDefMI =
4161 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4162 ImpDefReg);
4163
4164 // Now, create the subregister insert from SrcReg.
4165 Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4166 MachineInstr &InsMI =
4167 *BuildMI(MBB, I, I.getDebugLoc(),
4168 TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4169 .addUse(ImpDefReg)
4170 .addUse(SrcReg)
4171 .addImm(SubReg);
4172
4173 constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4174 constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4175
4176 // Save the register so that we can copy from it after.
4177 InsertRegs.push_back(Elt: InsertReg);
4178 }
4179 }
4180
4181 // Now that we've created any necessary subregister inserts, we can
4182 // create the copies.
4183 //
4184 // Perform the first copy separately as a subregister copy.
4185 Register CopyTo = I.getOperand(i: 0).getReg();
4186 auto FirstCopy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {CopyTo}, SrcOps: {})
4187 .addReg(RegNo: InsertRegs[0], flags: 0, SubReg: ExtractSubReg);
4188 constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4189
4190 // Now, perform the remaining copies as vector lane copies.
4191 unsigned LaneIdx = 1;
4192 for (Register InsReg : InsertRegs) {
4193 Register CopyTo = I.getOperand(i: LaneIdx).getReg();
4194 MachineInstr &CopyInst =
4195 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4196 .addUse(InsReg)
4197 .addImm(LaneIdx);
4198 constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4199 ++LaneIdx;
4200 }
4201
4202 // Separately constrain the first copy's destination. Because of the
4203 // limitation in constrainOperandRegClass, we can't guarantee that this will
4204 // actually be constrained. So, do it ourselves using the second operand.
4205 const TargetRegisterClass *RC =
4206 MRI.getRegClassOrNull(Reg: I.getOperand(i: 1).getReg());
4207 if (!RC) {
4208 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4209 return false;
4210 }
4211
4212 RBI.constrainGenericRegister(Reg: CopyTo, RC: *RC, MRI);
4213 I.eraseFromParent();
4214 return true;
4215}
4216
4217bool AArch64InstructionSelector::selectConcatVectors(
4218 MachineInstr &I, MachineRegisterInfo &MRI) {
4219 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4220 "Unexpected opcode");
4221 Register Dst = I.getOperand(i: 0).getReg();
4222 Register Op1 = I.getOperand(i: 1).getReg();
4223 Register Op2 = I.getOperand(i: 2).getReg();
4224 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder&: MIB);
4225 if (!ConcatMI)
4226 return false;
4227 I.eraseFromParent();
4228 return true;
4229}
4230
4231unsigned
4232AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4233 MachineFunction &MF) const {
4234 Type *CPTy = CPVal->getType();
4235 Align Alignment = MF.getDataLayout().getPrefTypeAlign(Ty: CPTy);
4236
4237 MachineConstantPool *MCP = MF.getConstantPool();
4238 return MCP->getConstantPoolIndex(C: CPVal, Alignment);
4239}
4240
4241MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4242 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4243 const TargetRegisterClass *RC;
4244 unsigned Opc;
4245 bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
4246 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(Ty: CPVal->getType());
4247 switch (Size) {
4248 case 16:
4249 RC = &AArch64::FPR128RegClass;
4250 Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
4251 break;
4252 case 8:
4253 RC = &AArch64::FPR64RegClass;
4254 Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
4255 break;
4256 case 4:
4257 RC = &AArch64::FPR32RegClass;
4258 Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
4259 break;
4260 case 2:
4261 RC = &AArch64::FPR16RegClass;
4262 Opc = AArch64::LDRHui;
4263 break;
4264 default:
4265 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4266 << *CPVal->getType());
4267 return nullptr;
4268 }
4269
4270 MachineInstr *LoadMI = nullptr;
4271 auto &MF = MIRBuilder.getMF();
4272 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4273 if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
4274 // Use load(literal) for tiny code model.
4275 LoadMI = &*MIRBuilder.buildInstr(Opc, DstOps: {RC}, SrcOps: {}).addConstantPoolIndex(Idx: CPIdx);
4276 } else {
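 // Otherwise materialize the address with an ADRP of the constant pool
 // entry's page, followed by a load at the page offset.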
4277 auto Adrp =
4278 MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4279 .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4280
4281 LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp})
4282 .addConstantPoolIndex(
4283 CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4284
4285 constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4286 }
4287
4288 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4289 LoadMI->addMemOperand(MF, MO: MF.getMachineMemOperand(PtrInfo,
4290 f: MachineMemOperand::MOLoad,
4291 s: Size, base_alignment: Align(Size)));
4292 constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4293 return LoadMI;
4294}
4295
4296/// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
4297/// size and RB.
4298static std::pair<unsigned, unsigned>
4299getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4300 unsigned Opc, SubregIdx;
4301 if (RB.getID() == AArch64::GPRRegBankID) {
4302 if (EltSize == 8) {
4303 Opc = AArch64::INSvi8gpr;
4304 SubregIdx = AArch64::bsub;
4305 } else if (EltSize == 16) {
4306 Opc = AArch64::INSvi16gpr;
4307 SubregIdx = AArch64::ssub;
4308 } else if (EltSize == 32) {
4309 Opc = AArch64::INSvi32gpr;
4310 SubregIdx = AArch64::ssub;
4311 } else if (EltSize == 64) {
4312 Opc = AArch64::INSvi64gpr;
4313 SubregIdx = AArch64::dsub;
4314 } else {
4315 llvm_unreachable("invalid elt size!");
4316 }
4317 } else {
4318 if (EltSize == 8) {
4319 Opc = AArch64::INSvi8lane;
4320 SubregIdx = AArch64::bsub;
4321 } else if (EltSize == 16) {
4322 Opc = AArch64::INSvi16lane;
4323 SubregIdx = AArch64::hsub;
4324 } else if (EltSize == 32) {
4325 Opc = AArch64::INSvi32lane;
4326 SubregIdx = AArch64::ssub;
4327 } else if (EltSize == 64) {
4328 Opc = AArch64::INSvi64lane;
4329 SubregIdx = AArch64::dsub;
4330 } else {
4331 llvm_unreachable("invalid elt size!");
4332 }
4333 }
4334 return std::make_pair(x&: Opc, y&: SubregIdx);
4335}
4336
4337MachineInstr *AArch64InstructionSelector::emitInstr(
4338 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4339 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4340 const ComplexRendererFns &RenderFns) const {
4341 assert(Opcode && "Expected an opcode?");
4342 assert(!isPreISelGenericOpcode(Opcode) &&
4343 "Function should only be used to produce selected instructions!");
4344 auto MI = MIRBuilder.buildInstr(Opc: Opcode, DstOps, SrcOps);
4345 if (RenderFns)
4346 for (auto &Fn : *RenderFns)
4347 Fn(MI);
4348 constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4349 return &*MI;
4350}
4351
4352MachineInstr *AArch64InstructionSelector::emitAddSub(
4353 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4354 Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4355 MachineIRBuilder &MIRBuilder) const {
4356 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4357 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4358 auto Ty = MRI.getType(Reg: LHS.getReg());
4359 assert(!Ty.isVector() && "Expected a scalar or pointer?");
4360 unsigned Size = Ty.getSizeInBits();
4361 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4362 bool Is32Bit = Size == 32;
4363
4364 // INSTRri form with positive arithmetic immediate.
4365 if (auto Fns = selectArithImmed(Root&: RHS))
4366 return emitInstr(Opcode: AddrModeAndSizeToOpcode[0][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4367 MIRBuilder, RenderFns: Fns);
4368
4369 // INSTRri form with negative arithmetic immediate.
4370 if (auto Fns = selectNegArithImmed(Root&: RHS))
4371 return emitInstr(Opcode: AddrModeAndSizeToOpcode[3][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4372 MIRBuilder, RenderFns: Fns);
4373
4374 // INSTRrx form.
4375 if (auto Fns = selectArithExtendedRegister(Root&: RHS))
4376 return emitInstr(Opcode: AddrModeAndSizeToOpcode[4][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4377 MIRBuilder, RenderFns: Fns);
4378
4379 // INSTRrs form.
4380 if (auto Fns = selectShiftedRegister(Root&: RHS))
4381 return emitInstr(Opcode: AddrModeAndSizeToOpcode[1][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4382 MIRBuilder, RenderFns: Fns);
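 // INSTRrr form.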
4383 return emitInstr(Opcode: AddrModeAndSizeToOpcode[2][Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS},
4384 MIRBuilder);
4385}
4386
4387MachineInstr *
4388AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4389 MachineOperand &RHS,
4390 MachineIRBuilder &MIRBuilder) const {
4391 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4392 {{AArch64::ADDXri, AArch64::ADDWri},
4393 {AArch64::ADDXrs, AArch64::ADDWrs},
4394 {AArch64::ADDXrr, AArch64::ADDWrr},
4395 {AArch64::SUBXri, AArch64::SUBWri},
4396 {AArch64::ADDXrx, AArch64::ADDWrx}}};
4397 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst: DefReg, LHS, RHS, MIRBuilder);
4398}
4399
4400MachineInstr *
4401AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4402 MachineOperand &RHS,
4403 MachineIRBuilder &MIRBuilder) const {
4404 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4405 {{AArch64::ADDSXri, AArch64::ADDSWri},
4406 {AArch64::ADDSXrs, AArch64::ADDSWrs},
4407 {AArch64::ADDSXrr, AArch64::ADDSWrr},
4408 {AArch64::SUBSXri, AArch64::SUBSWri},
4409 {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4410 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst, LHS, RHS, MIRBuilder);
4411}
4412
4413MachineInstr *
4414AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4415 MachineOperand &RHS,
4416 MachineIRBuilder &MIRBuilder) const {
4417 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4418 {{AArch64::SUBSXri, AArch64::SUBSWri},
4419 {AArch64::SUBSXrs, AArch64::SUBSWrs},
4420 {AArch64::SUBSXrr, AArch64::SUBSWrr},
4421 {AArch64::ADDSXri, AArch64::ADDSWri},
4422 {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4423 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst, LHS, RHS, MIRBuilder);
4424}
4425
4426MachineInstr *
4427AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4428 MachineOperand &RHS,
4429 MachineIRBuilder &MIRBuilder) const {
4430 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4431 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4432 bool Is32Bit = (MRI->getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4433 static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4434 return emitInstr(Opcode: OpcTable[Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS}, MIRBuilder);
4435}
4436
4437MachineInstr *
4438AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4439 MachineOperand &RHS,
4440 MachineIRBuilder &MIRBuilder) const {
4441 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4442 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4443 bool Is32Bit = (MRI->getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4444 static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4445 return emitInstr(Opcode: OpcTable[Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS}, MIRBuilder);
4446}
4447
4448MachineInstr *
4449AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4450 MachineIRBuilder &MIRBuilder) const {
4451 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4452 bool Is32Bit = (MRI.getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4453 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4454 return emitADDS(Dst: MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4455}
4456
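// Rough illustration (operand spellings approximate) of the three ANDS forms
// emitTST chooses between, assuming 32-bit operands:
//   G_AND %x, C  with C a valid logical immediate (e.g. 0xff)
//     -> %t:gpr32 = ANDSWri %x, <encoded 0xff>       (immediate form)
//   G_AND %x, (G_SHL %y, 2)
//     -> %t:gpr32 = ANDSWrs %x, %y, <lsl #2>         (shifted-register form)
//   otherwise
//     -> %t:gpr32 = ANDSWrr %x, %y                   (register form)
// Callers only care about the NZCV side effect; the %t result is typically
// dead, which is what makes this behave like a "tst".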
4457MachineInstr *
4458AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4459 MachineIRBuilder &MIRBuilder) const {
4460 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4461 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4462 LLT Ty = MRI.getType(Reg: LHS.getReg());
4463 unsigned RegSize = Ty.getSizeInBits();
4464 bool Is32Bit = (RegSize == 32);
4465 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4466 {AArch64::ANDSXrs, AArch64::ANDSWrs},
4467 {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4468 // ANDS needs a logical immediate for its immediate form. Check if we can
4469 // fold one in.
4470 if (auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI)) {
4471 int64_t Imm = ValAndVReg->Value.getSExtValue();
4472
4473 if (AArch64_AM::isLogicalImmediate(imm: Imm, regSize: RegSize)) {
4474 auto TstMI = MIRBuilder.buildInstr(Opc: OpcTable[0][Is32Bit], DstOps: {Ty}, SrcOps: {LHS});
4475 TstMI.addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: Imm, regSize: RegSize));
4476 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4477 return &*TstMI;
4478 }
4479 }
4480
4481 if (auto Fns = selectLogicalShiftedRegister(Root&: RHS))
4482 return emitInstr(Opcode: OpcTable[1][Is32Bit], DstOps: {Ty}, SrcOps: {LHS}, MIRBuilder, RenderFns: Fns);
4483 return emitInstr(Opcode: OpcTable[2][Is32Bit], DstOps: {Ty}, SrcOps: {LHS, RHS}, MIRBuilder);
4484}
4485
4486MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4487 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4488 MachineIRBuilder &MIRBuilder) const {
4489 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4490 assert(Predicate.isPredicate() && "Expected predicate?");
4491 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4492 LLT CmpTy = MRI.getType(Reg: LHS.getReg());
4493 assert(!CmpTy.isVector() && "Expected scalar or pointer");
4494 unsigned Size = CmpTy.getSizeInBits();
4495 (void)Size;
4496 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4497 // Fold the compare into a cmn or tst if possible.
4498 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4499 return FoldCmp;
4500 auto Dst = MRI.cloneVirtualRegister(VReg: LHS.getReg());
4501 return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4502}
4503
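// Illustrative sketch: emitCSetForFCmp turns the result of a floating-point
// compare into a 0/1 GPR value. When the IR predicate maps to a single
// AArch64 condition code, one CSINC of WZR/WZR (i.e. a cset) suffices;
// predicates such as FCMP_ONE/FCMP_UEQ need two condition codes (see the note
// in tryOptSelect below), so two csets are emitted and ORed, roughly:
//   %a:gpr32   = CSINCWr wzr, wzr, <inverted CC1>
//   %b:gpr32   = CSINCWr wzr, wzr, <inverted CC2>
//   %dst:gpr32 = ORRWrr %a, %b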
4504MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4505 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4506 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4507#ifndef NDEBUG
4508 LLT Ty = MRI.getType(Reg: Dst);
4509 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4510 "Expected a 32-bit scalar register?");
4511#endif
4512 const Register ZReg = AArch64::WZR;
4513 AArch64CC::CondCode CC1, CC2;
4514 changeFCMPPredToAArch64CC(P: Pred, CondCode&: CC1, CondCode2&: CC2);
4515 auto InvCC1 = AArch64CC::getInvertedCondCode(Code: CC1);
4516 if (CC2 == AArch64CC::AL)
4517 return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC1,
4518 MIRBuilder);
4519 const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4520 Register Def1Reg = MRI.createVirtualRegister(RegClass: RC);
4521 Register Def2Reg = MRI.createVirtualRegister(RegClass: RC);
4522 auto InvCC2 = AArch64CC::getInvertedCondCode(Code: CC2);
4523 emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC1, MIRBuilder);
4524 emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC2, MIRBuilder);
4525 auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4526 constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4527 return &*OrMI;
4528}
4529
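// Illustrative sketch, assuming a 32-bit operand: comparing against +0.0 uses
// the compare-with-zero form so no constant has to be materialized, e.g.
//   G_FCMP oeq, %x:fpr(s32), +0.0  ->  FCMPSri %x       (compare with #0.0)
//   G_FCMP oeq, %x:fpr(s32), %y    ->  FCMPSrr %x, %y
// For equality-like predicates the operands may be commuted first so that a
// +0.0 on the LHS can still take the immediate form.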
4530MachineInstr *AArch64InstructionSelector::emitFPCompare(
4531 Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
4532 std::optional<CmpInst::Predicate> Pred) const {
4533 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4534 LLT Ty = MRI.getType(Reg: LHS);
4535 if (Ty.isVector())
4536 return nullptr;
4537 unsigned OpSize = Ty.getSizeInBits();
4538 assert(OpSize == 16 || OpSize == 32 || OpSize == 64);
4539
4540 // If this is a compare against +0.0, then we don't have
4541 // to explicitly materialize a constant.
4542 const ConstantFP *FPImm = getConstantFPVRegVal(VReg: RHS, MRI);
4543 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4544
4545 auto IsEqualityPred = [](CmpInst::Predicate P) {
4546 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4547 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4548 };
4549 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4550 // Try commutating the operands.
4551 const ConstantFP *LHSImm = getConstantFPVRegVal(VReg: LHS, MRI);
4552 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4553 ShouldUseImm = true;
4554 std::swap(a&: LHS, b&: RHS);
4555 }
4556 }
4557 unsigned CmpOpcTbl[2][3] = {
4558 {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr},
4559 {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}};
4560 unsigned CmpOpc =
4561 CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)];
4562
4563 // Partially build the compare. Decide if we need to add a use for the
4564 // third operand based on whether or not we're comparing against 0.0.
4565 auto CmpMI = MIRBuilder.buildInstr(Opcode: CmpOpc).addUse(RegNo: LHS);
4566 CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4567 if (!ShouldUseImm)
4568 CmpMI.addUse(RegNo: RHS);
4569 constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4570 return &*CmpMI;
4571}
4572
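// Illustrative sketch (opcode and operand spellings approximate), assuming
// two 64-bit vector operands such as <2 x s32>: each operand is widened into
// the low half of a 128-bit FPR via emitScalarToVector (viewing the 64-bit
// vector as a single s64 element), then the second operand is inserted into
// lane 1 of the first with a 64-bit lane insert:
//   %w1:fpr128  = <Op1 widened to 128 bits>
//   %w2:fpr128  = <Op2 widened to 128 bits>
//   %dst:fpr128 = INSvi64lane %w1, 1, %w2, 0
// giving the concatenated <4 x s32> result.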
4573MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4574 std::optional<Register> Dst, Register Op1, Register Op2,
4575 MachineIRBuilder &MIRBuilder) const {
4576 // We implement a vector concat by:
4577 // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4578 // 2. Insert the upper vector into the destination's upper element
4579 // TODO: some of this code is common with G_BUILD_VECTOR handling.
4580 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4581
4582 const LLT Op1Ty = MRI.getType(Reg: Op1);
4583 const LLT Op2Ty = MRI.getType(Reg: Op2);
4584
4585 if (Op1Ty != Op2Ty) {
4586 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4587 return nullptr;
4588 }
4589 assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4590
4591 if (Op1Ty.getSizeInBits() >= 128) {
4592 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4593 return nullptr;
4594 }
4595
4596 // At the moment we only support 64-bit vector concats.
4597 if (Op1Ty.getSizeInBits() != 64) {
4598 LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors");
4599 return nullptr;
4600 }
4601
4602 const LLT ScalarTy = LLT::scalar(SizeInBits: Op1Ty.getSizeInBits());
4603 const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4604 const TargetRegisterClass *DstRC =
4605 getRegClassForTypeOnBank(Ty: Op1Ty.multiplyElements(Factor: 2), RB: FPRBank);
4606
4607 MachineInstr *WidenedOp1 =
4608 emitScalarToVector(EltSize: ScalarTy.getSizeInBits(), DstRC, Scalar: Op1, MIRBuilder);
4609 MachineInstr *WidenedOp2 =
4610 emitScalarToVector(EltSize: ScalarTy.getSizeInBits(), DstRC, Scalar: Op2, MIRBuilder);
4611 if (!WidenedOp1 || !WidenedOp2) {
4612 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4613 return nullptr;
4614 }
4615
4616 // Now do the insert of the upper element.
4617 unsigned InsertOpc, InsSubRegIdx;
4618 std::tie(args&: InsertOpc, args&: InsSubRegIdx) =
4619 getInsertVecEltOpInfo(RB: FPRBank, EltSize: ScalarTy.getSizeInBits());
4620
4621 if (!Dst)
4622 Dst = MRI.createVirtualRegister(RegClass: DstRC);
4623 auto InsElt =
4624 MIRBuilder
4625 .buildInstr(Opc: InsertOpc, DstOps: {*Dst}, SrcOps: {WidenedOp1->getOperand(i: 0).getReg()})
4626 .addImm(Val: 1) /* Lane index */
4627 .addUse(RegNo: WidenedOp2->getOperand(i: 0).getReg())
4628 .addImm(Val: 0);
4629 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4630 return &*InsElt;
4631}
4632
4633MachineInstr *
4634AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4635 Register Src2, AArch64CC::CondCode Pred,
4636 MachineIRBuilder &MIRBuilder) const {
4637 auto &MRI = *MIRBuilder.getMRI();
4638 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg: Dst);
4639 // If we used a register class, then this won't necessarily have an LLT.
4640 // Compute the size based on whether we have a class or a bank.
4641 unsigned Size;
4642 if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
4643 Size = TRI.getRegSizeInBits(*RC);
4644 else
4645 Size = MRI.getType(Reg: Dst).getSizeInBits();
4646 // Some opcodes use s1.
4647 assert(Size <= 64 && "Expected 64 bits or less only!");
4648 static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4649 unsigned Opc = OpcTable[Size == 64];
4650 auto CSINC = MIRBuilder.buildInstr(Opc, DstOps: {Dst}, SrcOps: {Src1, Src2}).addImm(Val: Pred);
4651 constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4652 return &*CSINC;
4653}
4654
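// Illustrative sketch: a wide (e.g. 128-bit) add is legalized into a carry
// chain such as
//   %lo:_(s64), %c:_(s1)  = G_UADDO %a0, %b0
//   %hi:_(s64), %c2:_(s1) = G_UADDE %a1, %b1, %c
// When selecting the G_UADDE, the carry-in %c was just produced by the
// immediately preceding G_UADDO, so selecting that instruction (an ADDS)
// already leaves the C flag in the right state and no extra carry-setting
// instruction is needed before the ADCS for the high half. Otherwise, the
// SUBS sequences below move the carry vreg into NZCV.C.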
4655MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
4656 Register CarryReg) {
4657 MachineRegisterInfo *MRI = MIB.getMRI();
4658 unsigned Opcode = I.getOpcode();
4659
4660 // If the instruction is a SUB, we need to negate the carry,
4661 // because borrowing is indicated by carry-flag == 0.
4662 bool NeedsNegatedCarry =
4663 (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);
4664
4665 // If the previous instruction will already produce the correct carry, do not
4666 // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
4667 // generated during legalization of wide add/sub. This optimization depends on
4668 // these sequences not being interrupted by other instructions.
4669 // We have to select the previous instruction before the carry-using
4670 // instruction is deleted by the calling function, otherwise the previous
4671 // instruction might become dead and would get deleted.
4672 MachineInstr *SrcMI = MRI->getVRegDef(Reg: CarryReg);
4673 if (SrcMI == I.getPrevNode()) {
4674 if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(Val: SrcMI)) {
4675 bool ProducesNegatedCarry = CarrySrcMI->isSub();
4676 if (NeedsNegatedCarry == ProducesNegatedCarry &&
4677 CarrySrcMI->isUnsigned() &&
4678 CarrySrcMI->getCarryOutReg() == CarryReg &&
4679 selectAndRestoreState(I&: *SrcMI))
4680 return nullptr;
4681 }
4682 }
4683
4684 Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass);
4685
4686 if (NeedsNegatedCarry) {
4687 // (0 - Carry) sets !C in NZCV when Carry == 1
4688 Register ZReg = AArch64::WZR;
4689 return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB);
4690 }
4691
4692 // (Carry - 1) sets !C in NZCV when Carry == 0
4693 auto Fns = select12BitValueWithLeftShift(Immed: 1);
4694 return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns);
4695}
4696
4697bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
4698 MachineRegisterInfo &MRI) {
4699 auto &CarryMI = cast<GAddSubCarryOut>(Val&: I);
4700
4701 if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(Val: &I)) {
4702 // Set NZCV carry according to carry-in VReg
4703 emitCarryIn(I, CarryReg: CarryInMI->getCarryInReg());
4704 }
4705
4706 // Emit the operation and get the correct condition code.
4707 auto OpAndCC = emitOverflowOp(Opcode: I.getOpcode(), Dst: CarryMI.getDstReg(),
4708 LHS&: CarryMI.getLHS(), RHS&: CarryMI.getRHS(), MIRBuilder&: MIB);
4709
4710 Register CarryOutReg = CarryMI.getCarryOutReg();
4711
4712 // Don't convert carry-out to VReg if it is never used
4713 if (!MRI.use_nodbg_empty(RegNo: CarryOutReg)) {
4714 // Now, put the overflow result in the carry-out register of the overflow
4715 // op. CSINC increments the result when the predicate is
4716 // false, so to get the increment when it's true, we need to use the
4717 // inverse. In this case, we want to increment when carry is set.
4718 Register ZReg = AArch64::WZR;
4719 emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
4720 Pred: getInvertedCondCode(Code: OpAndCC.second), MIRBuilder&: MIB);
4721 }
4722
4723 I.eraseFromParent();
4724 return true;
4725}
4726
4727std::pair<MachineInstr *, AArch64CC::CondCode>
4728AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4729 MachineOperand &LHS,
4730 MachineOperand &RHS,
4731 MachineIRBuilder &MIRBuilder) const {
4732 switch (Opcode) {
4733 default:
4734 llvm_unreachable("Unexpected opcode!");
4735 case TargetOpcode::G_SADDO:
4736 return std::make_pair(x: emitADDS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4737 case TargetOpcode::G_UADDO:
4738 return std::make_pair(x: emitADDS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::HS);
4739 case TargetOpcode::G_SSUBO:
4740 return std::make_pair(x: emitSUBS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4741 case TargetOpcode::G_USUBO:
4742 return std::make_pair(x: emitSUBS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::LO);
4743 case TargetOpcode::G_SADDE:
4744 return std::make_pair(x: emitADCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4745 case TargetOpcode::G_UADDE:
4746 return std::make_pair(x: emitADCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::HS);
4747 case TargetOpcode::G_SSUBE:
4748 return std::make_pair(x: emitSBCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4749 case TargetOpcode::G_USUBE:
4750 return std::make_pair(x: emitSBCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::LO);
4751 }
4752}
4753
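// Illustrative sketch of the kind of tree the conjunction helpers below
// handle: given single-use compares, something like
//   %c1:_(s1) = G_ICMP slt, %a, %b
//   %c2:_(s1) = G_FCMP olt, %x, %y
//   %c:_(s1)  = G_AND %c1, %c2
// can be emitted as one ordinary compare followed by a conditional-compare
// (CCMP/FCCMP) chain instead of materializing each i1 separately. Leaf
// compares report CanNegate = true (their predicate can simply be inverted);
// a G_AND node can never be negated naturally, and a G_OR node only negates
// naturally when the whole subtree is going to be negated anyway.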
4754/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4755/// expressed as a conjunction.
4756/// \param CanNegate Set to true if we can negate the whole sub-tree just by
4757/// changing the conditions on the CMP tests.
4758/// (this means we can call emitConjunctionRec() with
4759/// Negate==true on this sub-tree)
4760/// \param MustBeFirst Set to true if this subtree needs to be negated and we
4761/// cannot do the negation naturally. We are required to
4762/// emit the subtree first in this case.
4763/// \param WillNegate Is true if we are called when the result of this
4764/// subexpression must be negated. This happens when the
4765/// outer expression is an OR. We can use this fact to know
4766/// that we have a double negation (or (or ...) ...) that
4767/// can be implemented for free.
4768static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4769 bool WillNegate, MachineRegisterInfo &MRI,
4770 unsigned Depth = 0) {
4771 if (!MRI.hasOneNonDBGUse(RegNo: Val))
4772 return false;
4773 MachineInstr *ValDef = MRI.getVRegDef(Reg: Val);
4774 unsigned Opcode = ValDef->getOpcode();
4775 if (isa<GAnyCmp>(Val: ValDef)) {
4776 CanNegate = true;
4777 MustBeFirst = false;
4778 return true;
4779 }
4780 // Protect against exponential runtime and stack overflow.
4781 if (Depth > 6)
4782 return false;
4783 if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4784 bool IsOR = Opcode == TargetOpcode::G_OR;
4785 Register O0 = ValDef->getOperand(i: 1).getReg();
4786 Register O1 = ValDef->getOperand(i: 2).getReg();
4787 bool CanNegateL;
4788 bool MustBeFirstL;
4789 if (!canEmitConjunction(Val: O0, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, MRI, Depth: Depth + 1))
4790 return false;
4791 bool CanNegateR;
4792 bool MustBeFirstR;
4793 if (!canEmitConjunction(Val: O1, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, MRI, Depth: Depth + 1))
4794 return false;
4795
4796 if (MustBeFirstL && MustBeFirstR)
4797 return false;
4798
4799 if (IsOR) {
4800 // For an OR expression we need to be able to naturally negate at least
4801 // one side or we cannot do the transformation at all.
4802 if (!CanNegateL && !CanNegateR)
4803 return false;
4804 // If the result of the OR will be negated and we can naturally negate
4805 // the leaves, then this sub-tree as a whole negates naturally.
4806 CanNegate = WillNegate && CanNegateL && CanNegateR;
4807 // If we cannot naturally negate the whole sub-tree, then this must be
4808 // emitted first.
4809 MustBeFirst = !CanNegate;
4810 } else {
4811 assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4812 // We cannot naturally negate an AND operation.
4813 CanNegate = false;
4814 MustBeFirst = MustBeFirstL || MustBeFirstR;
4815 }
4816 return true;
4817 }
4818 return false;
4819}
4820
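// Illustrative sketch (operand spellings approximate): within a conjunction
// chain, every compare after the first becomes a conditional compare. The
// NZCV immediate is chosen to satisfy the *inverted* out condition, so that
// if the chain's predicate has already failed the overall result stays false.
// Assuming 32-bit integer operands and a small RHS constant:
//   SUBSWrr %a, %b                                ; first compare in the chain
//   CCMPWi  %c, 5, <nzcv>, <cond from previous compare>
// The immediate form is only used when the RHS constant fits in the 5-bit
// field (value < 32); otherwise the register form CCMPWr/CCMPXr is used.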
4821MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4822 Register LHS, Register RHS, CmpInst::Predicate CC,
4823 AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4824 MachineIRBuilder &MIB) const {
4825 // TODO: emit CMN as an optimization.
4826 auto &MRI = *MIB.getMRI();
4827 LLT OpTy = MRI.getType(Reg: LHS);
4828 unsigned CCmpOpc;
4829 std::optional<ValueAndVReg> C;
4830 if (CmpInst::isIntPredicate(P: CC)) {
4831 assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4832 C = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
4833 if (C && C->Value.ult(32))
4834 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4835 else
4836 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4837 } else {
4838 assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
4839 OpTy.getSizeInBits() == 64);
4840 switch (OpTy.getSizeInBits()) {
4841 case 16:
4842 assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
4843 CCmpOpc = AArch64::FCCMPHrr;
4844 break;
4845 case 32:
4846 CCmpOpc = AArch64::FCCMPSrr;
4847 break;
4848 case 64:
4849 CCmpOpc = AArch64::FCCMPDrr;
4850 break;
4851 default:
4852 return nullptr;
4853 }
4854 }
4855 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
4856 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvOutCC);
4857 auto CCmp = MIB.buildInstr(Opc: CCmpOpc, DstOps: {}, SrcOps: {LHS});
4859 if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4860 CCmp.addImm(Val: C->Value.getZExtValue());
4861 else
4862 CCmp.addReg(RegNo: RHS);
4863 CCmp.addImm(Val: NZCV).addImm(Val: Predicate);
4864 constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4865 return &*CCmp;
4866}
4867
4868MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4869 Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4870 AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4871 // We're at a tree leaf, produce a conditional comparison operation.
4872 auto &MRI = *MIB.getMRI();
4873 MachineInstr *ValDef = MRI.getVRegDef(Reg: Val);
4874 unsigned Opcode = ValDef->getOpcode();
4875 if (auto *Cmp = dyn_cast<GAnyCmp>(Val: ValDef)) {
4876 Register LHS = Cmp->getLHSReg();
4877 Register RHS = Cmp->getRHSReg();
4878 CmpInst::Predicate CC = Cmp->getCond();
4879 if (Negate)
4880 CC = CmpInst::getInversePredicate(pred: CC);
4881 if (isa<GICmp>(Val: Cmp)) {
4882 OutCC = changeICMPPredToAArch64CC(P: CC);
4883 } else {
4884 // Handle special FP cases.
4885 AArch64CC::CondCode ExtraCC;
4886 changeFPCCToANDAArch64CC(CC, CondCode&: OutCC, CondCode2&: ExtraCC);
4887 // Some floating point conditions can't be tested with a single condition
4888 // code. Construct an additional comparison in this case.
4889 if (ExtraCC != AArch64CC::AL) {
4890 MachineInstr *ExtraCmp;
4891 if (!CCOp)
4892 ExtraCmp = emitFPCompare(LHS, RHS, MIRBuilder&: MIB, Pred: CC);
4893 else
4894 ExtraCmp =
4895 emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC: ExtraCC, MIB);
4896 CCOp = ExtraCmp->getOperand(i: 0).getReg();
4897 Predicate = ExtraCC;
4898 }
4899 }
4900
4901 // Produce a normal comparison if we are first in the chain
4902 if (!CCOp) {
4903 auto Dst = MRI.cloneVirtualRegister(VReg: LHS);
4904 if (isa<GICmp>(Val: Cmp))
4905 return emitSUBS(Dst, LHS&: Cmp->getOperand(i: 2), RHS&: Cmp->getOperand(i: 3), MIRBuilder&: MIB);
4906 return emitFPCompare(LHS: Cmp->getOperand(i: 2).getReg(),
4907 RHS: Cmp->getOperand(i: 3).getReg(), MIRBuilder&: MIB);
4908 }
4909 // Otherwise produce a ccmp.
4910 return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
4911 }
4912 assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
4913
4914 bool IsOR = Opcode == TargetOpcode::G_OR;
4915
4916 Register LHS = ValDef->getOperand(i: 1).getReg();
4917 bool CanNegateL;
4918 bool MustBeFirstL;
4919 bool ValidL = canEmitConjunction(Val: LHS, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, MRI);
4920 assert(ValidL && "Valid conjunction/disjunction tree");
4921 (void)ValidL;
4922
4923 Register RHS = ValDef->getOperand(i: 2).getReg();
4924 bool CanNegateR;
4925 bool MustBeFirstR;
4926 bool ValidR = canEmitConjunction(Val: RHS, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, MRI);
4927 assert(ValidR && "Valid conjunction/disjunction tree");
4928 (void)ValidR;
4929
4930 // Swap sub-tree that must come first to the right side.
4931 if (MustBeFirstL) {
4932 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4933 std::swap(a&: LHS, b&: RHS);
4934 std::swap(a&: CanNegateL, b&: CanNegateR);
4935 std::swap(a&: MustBeFirstL, b&: MustBeFirstR);
4936 }
4937
4938 bool NegateR;
4939 bool NegateAfterR;
4940 bool NegateL;
4941 bool NegateAfterAll;
4942 if (Opcode == TargetOpcode::G_OR) {
4943 // Swap the sub-tree that we can negate naturally to the left.
4944 if (!CanNegateL) {
4945 assert(CanNegateR && "at least one side must be negatable");
4946 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4947 assert(!Negate);
4948 std::swap(a&: LHS, b&: RHS);
4949 NegateR = false;
4950 NegateAfterR = true;
4951 } else {
4952 // Negate the left sub-tree if possible, otherwise negate the result.
4953 NegateR = CanNegateR;
4954 NegateAfterR = !CanNegateR;
4955 }
4956 NegateL = true;
4957 NegateAfterAll = !Negate;
4958 } else {
4959 assert(Opcode == TargetOpcode::G_AND &&
4960 "Valid conjunction/disjunction tree");
4961 assert(!Negate && "Valid conjunction/disjunction tree");
4962
4963 NegateL = false;
4964 NegateR = false;
4965 NegateAfterR = false;
4966 NegateAfterAll = false;
4967 }
4968
4969 // Emit sub-trees.
4970 AArch64CC::CondCode RHSCC;
4971 MachineInstr *CmpR =
4972 emitConjunctionRec(Val: RHS, OutCC&: RHSCC, Negate: NegateR, CCOp, Predicate, MIB);
4973 if (NegateAfterR)
4974 RHSCC = AArch64CC::getInvertedCondCode(Code: RHSCC);
4975 MachineInstr *CmpL = emitConjunctionRec(
4976 Val: LHS, OutCC, Negate: NegateL, CCOp: CmpR->getOperand(i: 0).getReg(), Predicate: RHSCC, MIB);
4977 if (NegateAfterAll)
4978 OutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
4979 return CmpL;
4980}
4981
4982MachineInstr *AArch64InstructionSelector::emitConjunction(
4983 Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
4984 bool DummyCanNegate;
4985 bool DummyMustBeFirst;
4986 if (!canEmitConjunction(Val, CanNegate&: DummyCanNegate, MustBeFirst&: DummyMustBeFirst, WillNegate: false,
4987 MRI&: *MIB.getMRI()))
4988 return nullptr;
4989 return emitConjunctionRec(Val, OutCC, Negate: false, CCOp: Register(), Predicate: AArch64CC::AL, MIB);
4990}
4991
4992bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
4993 MachineInstr &CondMI) {
4994 AArch64CC::CondCode AArch64CC;
4995 MachineInstr *ConjMI = emitConjunction(Val: SelI.getCondReg(), OutCC&: AArch64CC, MIB);
4996 if (!ConjMI)
4997 return false;
4998
4999 emitSelect(Dst: SelI.getReg(Idx: 0), True: SelI.getTrueReg(), False: SelI.getFalseReg(), CC: AArch64CC, MIB);
5000 SelI.eraseFromParent();
5001 return true;
5002}
5003
5004bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
5005 MachineRegisterInfo &MRI = *MIB.getMRI();
5006 // We want to recognize this pattern:
5007 //
5008 // $z = G_FCMP pred, $x, $y
5009 // ...
5010 // $w = G_SELECT $z, $a, $b
5011 //
5012 // Where the value of $z is *only* ever used by the G_SELECT (possibly with
5013 // some copies/truncs in between).
5014 //
5015 // If we see this, then we can emit something like this:
5016 //
5017 // fcmp $x, $y
5018 // fcsel $w, $a, $b, pred
5019 //
5020 // Rather than emitting both of the rather long sequences in the standard
5021 // G_FCMP/G_SELECT select methods.
5022
5023 // First, check if the condition is defined by a compare.
5024 MachineInstr *CondDef = MRI.getVRegDef(Reg: I.getOperand(i: 1).getReg());
5025
5026 // We can only fold if all of the defs have one use.
5027 Register CondDefReg = CondDef->getOperand(i: 0).getReg();
5028 if (!MRI.hasOneNonDBGUse(RegNo: CondDefReg)) {
5029 // Unless it's another select.
5030 for (const MachineInstr &UI : MRI.use_nodbg_instructions(Reg: CondDefReg)) {
5031 if (CondDef == &UI)
5032 continue;
5033 if (UI.getOpcode() != TargetOpcode::G_SELECT)
5034 return false;
5035 }
5036 }
5037
5038 // Is the condition defined by a compare?
5039 unsigned CondOpc = CondDef->getOpcode();
5040 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
5041 if (tryOptSelectConjunction(SelI&: I, CondMI&: *CondDef))
5042 return true;
5043 return false;
5044 }
5045
5046 AArch64CC::CondCode CondCode;
5047 if (CondOpc == TargetOpcode::G_ICMP) {
5048 auto Pred =
5049 static_cast<CmpInst::Predicate>(CondDef->getOperand(i: 1).getPredicate());
5050 CondCode = changeICMPPredToAArch64CC(P: Pred);
5051 emitIntegerCompare(LHS&: CondDef->getOperand(i: 2), RHS&: CondDef->getOperand(i: 3),
5052 Predicate&: CondDef->getOperand(i: 1), MIRBuilder&: MIB);
5053 } else {
5054 // Get the condition code for the select.
5055 auto Pred =
5056 static_cast<CmpInst::Predicate>(CondDef->getOperand(i: 1).getPredicate());
5057 AArch64CC::CondCode CondCode2;
5058 changeFCMPPredToAArch64CC(P: Pred, CondCode, CondCode2);
5059
5060 // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
5061 // instructions to emit the comparison.
5062 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
5063 // unnecessary.
5064 if (CondCode2 != AArch64CC::AL)
5065 return false;
5066
5067 if (!emitFPCompare(LHS: CondDef->getOperand(i: 2).getReg(),
5068 RHS: CondDef->getOperand(i: 3).getReg(), MIRBuilder&: MIB)) {
5069 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
5070 return false;
5071 }
5072 }
5073
5074 // Emit the select.
5075 emitSelect(Dst: I.getOperand(i: 0).getReg(), True: I.getOperand(i: 2).getReg(),
5076 False: I.getOperand(i: 3).getReg(), CC: CondCode, MIB);
5077 I.eraseFromParent();
5078 return true;
5079}
5080
5081MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
5082 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
5083 MachineIRBuilder &MIRBuilder) const {
5084 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5085 "Unexpected MachineOperand");
5086 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5087 // We want to find this sort of thing:
5088 // x = G_SUB 0, y
5089 // G_ICMP z, x
5090 //
5091 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5092 // e.g:
5093 //
5094 // cmn z, y
5095
5096 // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5097 MachineInstr *LHSDef = getDefIgnoringCopies(Reg: LHS.getReg(), MRI);
5098 MachineInstr *RHSDef = getDefIgnoringCopies(Reg: RHS.getReg(), MRI);
5099 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5100 // Given this:
5101 //
5102 // x = G_SUB 0, y
5103 // G_ICMP x, z
5104 //
5105 // Produce this:
5106 //
5107 // cmn y, z
5108 if (isCMN(MaybeSub: LHSDef, Pred: P, MRI))
5109 return emitCMN(LHS&: LHSDef->getOperand(i: 2), RHS, MIRBuilder);
5110
5111 // Same idea here, but with the RHS of the compare instead:
5112 //
5113 // Given this:
5114 //
5115 // x = G_SUB 0, y
5116 // G_ICMP z, x
5117 //
5118 // Produce this:
5119 //
5120 // cmn z, y
5121 if (isCMN(MaybeSub: RHSDef, Pred: P, MRI))
5122 return emitCMN(LHS, RHS&: RHSDef->getOperand(i: 2), MIRBuilder);
5123
5124 // Given this:
5125 //
5126 // z = G_AND x, y
5127 // G_ICMP z, 0
5128 //
5129 // Produce this if the compare is signed:
5130 //
5131 // tst x, y
5132 if (!CmpInst::isUnsigned(predicate: P) && LHSDef &&
5133 LHSDef->getOpcode() == TargetOpcode::G_AND) {
5134 // Make sure that the RHS is 0.
5135 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI);
5136 if (!ValAndVReg || ValAndVReg->Value != 0)
5137 return nullptr;
5138
5139 return emitTST(LHS&: LHSDef->getOperand(i: 1),
5140 RHS&: LHSDef->getOperand(i: 2), MIRBuilder);
5141 }
5142
5143 return nullptr;
5144}
5145
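// Illustrative worked example of the TBL index vector built below: for a
// <2 x s32> shuffle with mask [1, 0], BytesPerElt is 4, so the constant-pool
// vector of byte indices is
//   [4, 5, 6, 7, 0, 1, 2, 3]
// i.e. each mask entry Val expands to the bytes Val*4 .. Val*4+3. Undef mask
// entries (negative values) are currently treated as index 0.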
5146bool AArch64InstructionSelector::selectShuffleVector(
5147 MachineInstr &I, MachineRegisterInfo &MRI) {
5148 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5149 Register Src1Reg = I.getOperand(i: 1).getReg();
5150 const LLT Src1Ty = MRI.getType(Reg: Src1Reg);
5151 Register Src2Reg = I.getOperand(i: 2).getReg();
5152 const LLT Src2Ty = MRI.getType(Reg: Src2Reg);
5153 ArrayRef<int> Mask = I.getOperand(i: 3).getShuffleMask();
5154
5155 MachineBasicBlock &MBB = *I.getParent();
5156 MachineFunction &MF = *MBB.getParent();
5157 LLVMContext &Ctx = MF.getFunction().getContext();
5158
5159 // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars if
5160 // it originated from a <1 x T> type. Those should have been lowered into
5161 // G_BUILD_VECTOR earlier.
5162 if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
5163 LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
5164 return false;
5165 }
5166
5167 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5168
5169 SmallVector<Constant *, 64> CstIdxs;
5170 for (int Val : Mask) {
5171 // For now, we just assume any undef index to be 0. This should be
5172 // optimized in the future, e.g. to select DUP etc.
5173 Val = Val < 0 ? 0 : Val;
5174 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5175 unsigned Offset = Byte + Val * BytesPerElt;
5176 CstIdxs.emplace_back(Args: ConstantInt::get(Ty: Type::getInt8Ty(C&: Ctx), V: Offset));
5177 }
5178 }
5179
5180 // Use a constant pool to load the index vector for TBL.
5181 Constant *CPVal = ConstantVector::get(V: CstIdxs);
5182 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder&: MIB);
5183 if (!IndexLoad) {
5184 LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5185 return false;
5186 }
5187
5188 if (DstTy.getSizeInBits() != 128) {
5189 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5190 // This case can be done with TBL1.
5191 MachineInstr *Concat =
5192 emitVectorConcat(Dst: std::nullopt, Op1: Src1Reg, Op2: Src2Reg, MIRBuilder&: MIB);
5193 if (!Concat) {
5194 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5195 return false;
5196 }
5197
5198 // The constant pool load will be 64 bits, so we need to convert to an FPR128 reg.
5199 IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
5200 IndexLoad->getOperand(0).getReg(), MIB);
5201
5202 auto TBL1 = MIB.buildInstr(
5203 AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
5204 {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
5205 constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
5206
5207 auto Copy =
5208 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
5209 .addReg(TBL1.getReg(0), 0, AArch64::dsub);
5210 RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
5211 I.eraseFromParent();
5212 return true;
5213 }
5214
5215 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5216 // Q registers for regalloc.
5217 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5218 auto RegSeq = createQTuple(Regs, MIB);
5219 auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
5220 {RegSeq, IndexLoad->getOperand(0)});
5221 constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
5222 I.eraseFromParent();
5223 return true;
5224}
5225
5226MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5227 std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5228 unsigned LaneIdx, const RegisterBank &RB,
5229 MachineIRBuilder &MIRBuilder) const {
5230 MachineInstr *InsElt = nullptr;
5231 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5232 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5233
5234 // Create a register to define with the insert if one wasn't passed in.
5235 if (!DstReg)
5236 DstReg = MRI.createVirtualRegister(RegClass: DstRC);
5237
5238 unsigned EltSize = MRI.getType(Reg: EltReg).getSizeInBits();
5239 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5240
5241 if (RB.getID() == AArch64::FPRRegBankID) {
5242 auto InsSub = emitScalarToVector(EltSize, DstRC, Scalar: EltReg, MIRBuilder);
5243 InsElt = MIRBuilder.buildInstr(Opc, DstOps: {*DstReg}, SrcOps: {SrcReg})
5244 .addImm(Val: LaneIdx)
5245 .addUse(RegNo: InsSub->getOperand(i: 0).getReg())
5246 .addImm(Val: 0);
5247 } else {
5248 InsElt = MIRBuilder.buildInstr(Opc, DstOps: {*DstReg}, SrcOps: {SrcReg})
5249 .addImm(Val: LaneIdx)
5250 .addUse(RegNo: EltReg);
5251 }
5252
5253 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
5254 return InsElt;
5255}
5256
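// Illustrative sketch, assuming an <8 x s16> source vector:
//   %d:gpr(s32) = G_SEXT (G_EXTRACT_VECTOR_ELT %v:fpr(<8 x s16>), 3)
//     -> %d = SMOVvi16to32 %v, 3
//   %d:gpr(s64) = G_ZEXT (G_EXTRACT_VECTOR_ELT %v:fpr(<8 x s16>), 3)
//     -> %t:gpr32 = UMOVvi16 %v, 3
//        %d:gpr64 = SUBREG_TO_REG 0, %t, sub_32
// UMOV already zeroes the upper bits, so the unsigned 64-bit case only needs
// the SUBREG_TO_REG wrapper rather than a separate extend.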
5257bool AArch64InstructionSelector::selectUSMovFromExtend(
5258 MachineInstr &MI, MachineRegisterInfo &MRI) {
5259 if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5260 MI.getOpcode() != TargetOpcode::G_ZEXT &&
5261 MI.getOpcode() != TargetOpcode::G_ANYEXT)
5262 return false;
5263 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5264 const Register DefReg = MI.getOperand(i: 0).getReg();
5265 const LLT DstTy = MRI.getType(Reg: DefReg);
5266 unsigned DstSize = DstTy.getSizeInBits();
5267
5268 if (DstSize != 32 && DstSize != 64)
5269 return false;
5270
5271 MachineInstr *Extract = getOpcodeDef(Opcode: TargetOpcode::G_EXTRACT_VECTOR_ELT,
5272 Reg: MI.getOperand(i: 1).getReg(), MRI);
5273 int64_t Lane;
5274 if (!Extract || !mi_match(R: Extract->getOperand(i: 2).getReg(), MRI, P: m_ICst(Cst&: Lane)))
5275 return false;
5276 Register Src0 = Extract->getOperand(i: 1).getReg();
5277
5278 const LLT &VecTy = MRI.getType(Reg: Src0);
5279
5280 if (VecTy.getSizeInBits() != 128) {
5281 const MachineInstr *ScalarToVector = emitScalarToVector(
5282 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
5283 assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5284 Src0 = ScalarToVector->getOperand(i: 0).getReg();
5285 }
5286
5287 unsigned Opcode;
5288 if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5289 Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5290 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5291 Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5292 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5293 Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5294 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5295 Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5296 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5297 Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5298 else
5299 llvm_unreachable("Unexpected type combo for S/UMov!");
5300
5301 // We may need to generate one of these, depending on the type and sign of the
5302 // input:
5303 // DstReg = SMOV Src0, Lane;
5304 // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5305 MachineInstr *ExtI = nullptr;
5306 if (DstSize == 64 && !IsSigned) {
5307 Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
5308 MIB.buildInstr(Opc: Opcode, DstOps: {NewReg}, SrcOps: {Src0}).addImm(Val: Lane);
5309 ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
5310 .addImm(0)
5311 .addUse(NewReg)
5312 .addImm(AArch64::sub_32);
5313 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
5314 } else
5315 ExtI = MIB.buildInstr(Opc: Opcode, DstOps: {DefReg}, SrcOps: {Src0}).addImm(Val: Lane);
5316
5317 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
5318 MI.eraseFromParent();
5319 return true;
5320}
5321
5322bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
5323 MachineRegisterInfo &MRI) {
5324 assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
5325
5326 // Get information on the destination.
5327 Register DstReg = I.getOperand(i: 0).getReg();
5328 const LLT DstTy = MRI.getType(Reg: DstReg);
5329 unsigned VecSize = DstTy.getSizeInBits();
5330
5331 // Get information on the element we want to insert into the destination.
5332 Register EltReg = I.getOperand(i: 2).getReg();
5333 const LLT EltTy = MRI.getType(Reg: EltReg);
5334 unsigned EltSize = EltTy.getSizeInBits();
5335 if (EltSize < 8 || EltSize > 64)
5336 return false;
5337
5338 // Find the definition of the index. Bail out if it's not defined by a
5339 // G_CONSTANT.
5340 Register IdxReg = I.getOperand(i: 3).getReg();
5341 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: IdxReg, MRI);
5342 if (!VRegAndVal)
5343 return false;
5344 unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
5345
5346 // Perform the lane insert.
5347 Register SrcReg = I.getOperand(i: 1).getReg();
5348 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5349
5350 if (VecSize < 128) {
5351 // If the vector we're inserting into is smaller than 128 bits, widen it
5352 // to 128 to do the insert.
5353 MachineInstr *ScalarToVec =
5354 emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
5355 if (!ScalarToVec)
5356 return false;
5357 SrcReg = ScalarToVec->getOperand(i: 0).getReg();
5358 }
5359
5360 // Create an insert into a new FPR128 register.
5361 // Note that if our vector is already 128 bits, we end up emitting an extra
5362 // register.
5363 MachineInstr *InsMI =
5364 emitLaneInsert(DstReg: std::nullopt, SrcReg, EltReg, LaneIdx, RB: EltRB, MIRBuilder&: MIB);
5365
5366 if (VecSize < 128) {
5367 // If we had to widen to perform the insert, then we have to demote back to
5368 // the original size to get the result we want.
5369 if (!emitNarrowVector(DstReg, SrcReg: InsMI->getOperand(i: 0).getReg(), MIB, MRI))
5370 return false;
5371 } else {
5372 // No widening needed.
5373 InsMI->getOperand(i: 0).setReg(DstReg);
5374 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
5375 }
5376
5377 I.eraseFromParent();
5378 return true;
5379}
5380
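// The tryAdvSIMDModImm* helpers below try to materialize a splat constant
// with one of the AdvSIMD modified-immediate MOVI/MVNI/FMOV encodings instead
// of a constant-pool load. Rough illustration (assuming the Type9 check
// accepts per-byte splats): a <16 x s8> splat of 0x55 gives the 64-bit
// pattern 0x5555555555555555, which is a byte splat and so selects roughly:
//   %dst:fpr128 = MOVIv16b_ns 0x55
// For 128-bit destinations the high and low 64 bits of the constant must
// match, as checked at the top of each helper.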
5381MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
5382 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5383 unsigned int Op;
5384 if (DstSize == 128) {
5385 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5386 return nullptr;
5387 Op = AArch64::MOVIv16b_ns;
5388 } else {
5389 Op = AArch64::MOVIv8b_ns;
5390 }
5391
5392 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5393
5394 if (AArch64_AM::isAdvSIMDModImmType9(Imm: Val)) {
5395 Val = AArch64_AM::encodeAdvSIMDModImmType9(Imm: Val);
5396 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5397 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5398 return &*Mov;
5399 }
5400 return nullptr;
5401}
5402
5403MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16(
5404 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5405 bool Inv) {
5406
5407 unsigned int Op;
5408 if (DstSize == 128) {
5409 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5410 return nullptr;
5411 Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16;
5412 } else {
5413 Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16;
5414 }
5415
5416 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5417 uint64_t Shift;
5418
5419 if (AArch64_AM::isAdvSIMDModImmType5(Imm: Val)) {
5420 Val = AArch64_AM::encodeAdvSIMDModImmType5(Imm: Val);
5421 Shift = 0;
5422 } else if (AArch64_AM::isAdvSIMDModImmType6(Imm: Val)) {
5423 Val = AArch64_AM::encodeAdvSIMDModImmType6(Imm: Val);
5424 Shift = 8;
5425 } else
5426 return nullptr;
5427
5428 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5429 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5430 return &*Mov;
5431}
5432
5433MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32(
5434 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5435 bool Inv) {
5436
5437 unsigned int Op;
5438 if (DstSize == 128) {
5439 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5440 return nullptr;
5441 Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32;
5442 } else {
5443 Op = Inv ? AArch64::MVNIv2i32 : AArch64::MOVIv2i32;
5444 }
5445
5446 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5447 uint64_t Shift;
5448
5449 if ((AArch64_AM::isAdvSIMDModImmType1(Imm: Val))) {
5450 Val = AArch64_AM::encodeAdvSIMDModImmType1(Imm: Val);
5451 Shift = 0;
5452 } else if ((AArch64_AM::isAdvSIMDModImmType2(Imm: Val))) {
5453 Val = AArch64_AM::encodeAdvSIMDModImmType2(Imm: Val);
5454 Shift = 8;
5455 } else if ((AArch64_AM::isAdvSIMDModImmType3(Imm: Val))) {
5456 Val = AArch64_AM::encodeAdvSIMDModImmType3(Imm: Val);
5457 Shift = 16;
5458 } else if ((AArch64_AM::isAdvSIMDModImmType4(Imm: Val))) {
5459 Val = AArch64_AM::encodeAdvSIMDModImmType4(Imm: Val);
5460 Shift = 24;
5461 } else
5462 return nullptr;
5463
5464 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5465 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5466 return &*Mov;
5467}
5468
5469MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64(
5470 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5471
5472 unsigned int Op;
5473 if (DstSize == 128) {
5474 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5475 return nullptr;
5476 Op = AArch64::MOVIv2d_ns;
5477 } else {
5478 Op = AArch64::MOVID;
5479 }
5480
5481 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5482 if (AArch64_AM::isAdvSIMDModImmType10(Imm: Val)) {
5483 Val = AArch64_AM::encodeAdvSIMDModImmType10(Imm: Val);
5484 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5485 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5486 return &*Mov;
5487 }
5488 return nullptr;
5489}
5490
5491MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s(
5492 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5493 bool Inv) {
5494
5495 unsigned int Op;
5496 if (DstSize == 128) {
5497 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5498 return nullptr;
5499 Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl;
5500 } else {
5501 Op = Inv ? AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl;
5502 }
5503
5504 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5505 uint64_t Shift;
5506
5507 if (AArch64_AM::isAdvSIMDModImmType7(Imm: Val)) {
5508 Val = AArch64_AM::encodeAdvSIMDModImmType7(Imm: Val);
5509 Shift = 264;
5510 } else if (AArch64_AM::isAdvSIMDModImmType8(Imm: Val)) {
5511 Val = AArch64_AM::encodeAdvSIMDModImmType8(Imm: Val);
5512 Shift = 272;
5513 } else
5514 return nullptr;
5515
5516 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5517 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5518 return &*Mov;
5519}
5520
5521MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
5522 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5523
5524 unsigned int Op;
5525 bool IsWide = false;
5526 if (DstSize == 128) {
5527 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5528 return nullptr;
5529 Op = AArch64::FMOVv4f32_ns;
5530 IsWide = true;
5531 } else {
5532 Op = AArch64::FMOVv2f32_ns;
5533 }
5534
5535 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5536
5537 if (AArch64_AM::isAdvSIMDModImmType11(Imm: Val)) {
5538 Val = AArch64_AM::encodeAdvSIMDModImmType11(Imm: Val);
5539 } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Imm: Val)) {
5540 Val = AArch64_AM::encodeAdvSIMDModImmType12(Imm: Val);
5541 Op = AArch64::FMOVv2f64_ns;
5542 } else
5543 return nullptr;
5544
5545 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5546 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5547 return &*Mov;
5548}
5549
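// Illustrative sketch (pre/post flag operand omitted), assuming a pre-indexed
// sign-extending byte load with a constant offset:
//   %val:_(s32), %wb:_(p0) = G_INDEXED_SEXTLOAD %base, %off   ; 8-bit memory
//     -> %wb, %val = LDRSBWpre %base, <imm>
// The any/zero-extending forms load into a 32-bit register (e.g. LDRBBpre);
// when the destination is 64 bits, the result is then widened with
// SUBREG_TO_REG rather than an explicit zero-extend, since the 32-bit load
// already clears the upper bits.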
5550bool AArch64InstructionSelector::selectIndexedExtLoad(
5551 MachineInstr &MI, MachineRegisterInfo &MRI) {
5552 auto &ExtLd = cast<GIndexedAnyExtLoad>(Val&: MI);
5553 Register Dst = ExtLd.getDstReg();
5554 Register WriteBack = ExtLd.getWritebackReg();
5555 Register Base = ExtLd.getBaseReg();
5556 Register Offset = ExtLd.getOffsetReg();
5557 LLT Ty = MRI.getType(Reg: Dst);
5558 assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs.
5559 unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
5560 bool IsPre = ExtLd.isPre();
5561 bool IsSExt = isa<GIndexedSExtLoad>(Val: ExtLd);
5562 bool InsertIntoXReg = false;
5563 bool IsDst64 = Ty.getSizeInBits() == 64;
5564
5565 unsigned Opc = 0;
5566 LLT NewLdDstTy;
5567 LLT s32 = LLT::scalar(SizeInBits: 32);
5568 LLT s64 = LLT::scalar(SizeInBits: 64);
5569
5570 if (MemSizeBits == 8) {
5571 if (IsSExt) {
5572 if (IsDst64)
5573 Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
5574 else
5575 Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
5576 NewLdDstTy = IsDst64 ? s64 : s32;
5577 } else {
5578 Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
5579 InsertIntoXReg = IsDst64;
5580 NewLdDstTy = s32;
5581 }
5582 } else if (MemSizeBits == 16) {
5583 if (IsSExt) {
5584 if (IsDst64)
5585 Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
5586 else
5587 Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
5588 NewLdDstTy = IsDst64 ? s64 : s32;
5589 } else {
5590 Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
5591 InsertIntoXReg = IsDst64;
5592 NewLdDstTy = s32;
5593 }
5594 } else if (MemSizeBits == 32) {
5595 if (IsSExt) {
5596 Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
5597 NewLdDstTy = s64;
5598 } else {
5599 Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
5600 InsertIntoXReg = IsDst64;
5601 NewLdDstTy = s32;
5602 }
5603 } else {
5604 llvm_unreachable("Unexpected size for indexed load");
5605 }
5606
5607 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5608 return false; // We should be on gpr.
5609
5610 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5611 if (!Cst)
5612 return false; // Shouldn't happen, but just in case.
5613
5614 auto LdMI = MIB.buildInstr(Opc, DstOps: {WriteBack, NewLdDstTy}, SrcOps: {Base})
5615 .addImm(Val: Cst->getSExtValue());
5616 LdMI.cloneMemRefs(OtherMI: ExtLd);
5617 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5618 // Make sure to select the load with the MemTy as the dest type, and then
5619 // insert into X reg if needed.
5620 if (InsertIntoXReg) {
5621 // Generate a SUBREG_TO_REG.
5622 auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {})
5623 .addImm(0)
5624 .addUse(LdMI.getReg(1))
5625 .addImm(AArch64::sub_32);
5626 RBI.constrainGenericRegister(SubToReg.getReg(0), AArch64::GPR64RegClass,
5627 MRI);
5628 } else {
5629 auto Copy = MIB.buildCopy(Res: Dst, Op: LdMI.getReg(Idx: 1));
5630 selectCopy(*Copy, TII, MRI, TRI, RBI);
5631 }
5632 MI.eraseFromParent();
5633
5634 return true;
5635}
5636
5637bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
5638 MachineRegisterInfo &MRI) {
5639 auto &Ld = cast<GIndexedLoad>(Val&: MI);
5640 Register Dst = Ld.getDstReg();
5641 Register WriteBack = Ld.getWritebackReg();
5642 Register Base = Ld.getBaseReg();
5643 Register Offset = Ld.getOffsetReg();
5644 assert(MRI.getType(Dst).getSizeInBits() <= 128 &&
5645 "Unexpected type for indexed load");
5646 unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
5647
5648 if (MemSize < MRI.getType(Reg: Dst).getSizeInBytes())
5649 return selectIndexedExtLoad(MI, MRI);
5650
5651 unsigned Opc = 0;
5652 if (Ld.isPre()) {
5653 static constexpr unsigned GPROpcodes[] = {
5654 AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre,
5655 AArch64::LDRXpre};
5656 static constexpr unsigned FPROpcodes[] = {
5657 AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre,
5658 AArch64::LDRQpre};
5659 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5660 Opc = FPROpcodes[Log2_32(Value: MemSize)];
5661 else
5662 Opc = GPROpcodes[Log2_32(Value: MemSize)];
5663 } else {
5664 static constexpr unsigned GPROpcodes[] = {
5665 AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
5666 AArch64::LDRXpost};
5667 static constexpr unsigned FPROpcodes[] = {
5668 AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
5669 AArch64::LDRDpost, AArch64::LDRQpost};
5670 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5671 Opc = FPROpcodes[Log2_32(Value: MemSize)];
5672 else
5673 Opc = GPROpcodes[Log2_32(Value: MemSize)];
5674 }
5675 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5676 if (!Cst)
5677 return false; // Shouldn't happen, but just in case.
5678 auto LdMI =
5679 MIB.buildInstr(Opc, DstOps: {WriteBack, Dst}, SrcOps: {Base}).addImm(Val: Cst->getSExtValue());
5680 LdMI.cloneMemRefs(OtherMI: Ld);
5681 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5682 MI.eraseFromParent();
5683 return true;
5684}
5685
5686bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
5687 MachineRegisterInfo &MRI) {
5688 Register Dst = I.getWritebackReg();
5689 Register Val = I.getValueReg();
5690 Register Base = I.getBaseReg();
5691 Register Offset = I.getOffsetReg();
5692 LLT ValTy = MRI.getType(Reg: Val);
5693 assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store");
5694
5695 unsigned Opc = 0;
5696 if (I.isPre()) {
5697 static constexpr unsigned GPROpcodes[] = {
5698 AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre,
5699 AArch64::STRXpre};
5700 static constexpr unsigned FPROpcodes[] = {
5701 AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre,
5702 AArch64::STRQpre};
5703
5704 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5705 Opc = FPROpcodes[Log2_32(Value: ValTy.getSizeInBytes())];
5706 else
5707 Opc = GPROpcodes[Log2_32(Value: ValTy.getSizeInBytes())];
5708 } else {
5709 static constexpr unsigned GPROpcodes[] = {
5710 AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
5711 AArch64::STRXpost};
5712 static constexpr unsigned FPROpcodes[] = {
5713 AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
5714 AArch64::STRDpost, AArch64::STRQpost};
5715
5716 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5717 Opc = FPROpcodes[Log2_32(Value: ValTy.getSizeInBytes())];
5718 else
5719 Opc = GPROpcodes[Log2_32(Value: ValTy.getSizeInBytes())];
5720 }
5721
5722 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5723 if (!Cst)
5724 return false; // Shouldn't happen, but just in case.
5725 auto Str =
5726 MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {Val, Base}).addImm(Val: Cst->getSExtValue());
5727 Str.cloneMemRefs(OtherMI: I);
5728 constrainSelectedInstRegOperands(*Str, TII, TRI, RBI);
5729 I.eraseFromParent();
5730 return true;
5731}
5732
5733MachineInstr *
5734AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5735 MachineIRBuilder &MIRBuilder,
5736 MachineRegisterInfo &MRI) {
5737 LLT DstTy = MRI.getType(Reg: Dst);
5738 unsigned DstSize = DstTy.getSizeInBits();
5739 if (CV->isNullValue()) {
5740 if (DstSize == 128) {
5741 auto Mov =
5742 MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5743 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5744 return &*Mov;
5745 }
5746
5747 if (DstSize == 64) {
5748 auto Mov =
5749 MIRBuilder
5750 .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5751 .addImm(0);
5752 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5753 .addReg(Mov.getReg(0), 0, AArch64::dsub);
5754 RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5755 return &*Copy;
5756 }
5757 }
5758
5759 if (CV->getSplatValue()) {
5760 APInt DefBits = APInt::getSplat(NewLen: DstSize, V: CV->getUniqueInteger());
5761 MachineInstr *NewOp;
5762 bool Inv = false;
5763 if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)) ||
5764 (NewOp = tryAdvSIMDModImm32(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5765 (NewOp =
5766 tryAdvSIMDModImm321s(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5767 (NewOp = tryAdvSIMDModImm16(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5768 (NewOp = tryAdvSIMDModImm8(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)) ||
5769 (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)))
5770 return NewOp;
5771
5772 DefBits = ~DefBits;
5773 Inv = true;
5774 if ((NewOp = tryAdvSIMDModImm32(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5775 (NewOp =
5776 tryAdvSIMDModImm321s(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5777 (NewOp = tryAdvSIMDModImm16(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)))
5778 return NewOp;
5779 }
5780
5781 auto *CPLoad = emitLoadFromConstantPool(CPVal: CV, MIRBuilder);
5782 if (!CPLoad) {
5783 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
5784 return nullptr;
5785 }
5786
5787 auto Copy = MIRBuilder.buildCopy(Res: Dst, Op: CPLoad->getOperand(i: 0));
5788 RBI.constrainGenericRegister(
5789 Reg: Dst, RC: *MRI.getRegClass(Reg: CPLoad->getOperand(i: 0).getReg()), MRI);
5790 return &*Copy;
5791}
5792
5793bool AArch64InstructionSelector::tryOptConstantBuildVec(
5794 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5795 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5796 unsigned DstSize = DstTy.getSizeInBits();
5797 assert(DstSize <= 128 && "Unexpected build_vec type!");
5798 if (DstSize < 32)
5799 return false;
5800 // Check if we're building a constant vector, in which case we want to
5801 // generate a constant pool load instead of a vector insert sequence.
5802 SmallVector<Constant *, 16> Csts;
5803 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5804 // Try to find G_CONSTANT or G_FCONSTANT
5805 auto *OpMI =
5806 getOpcodeDef(Opcode: TargetOpcode::G_CONSTANT, Reg: I.getOperand(i: Idx).getReg(), MRI);
5807 if (OpMI)
5808 Csts.emplace_back(
5809 Args: const_cast<ConstantInt *>(OpMI->getOperand(i: 1).getCImm()));
5810 else if ((OpMI = getOpcodeDef(Opcode: TargetOpcode::G_FCONSTANT,
5811 Reg: I.getOperand(i: Idx).getReg(), MRI)))
5812 Csts.emplace_back(
5813 Args: const_cast<ConstantFP *>(OpMI->getOperand(i: 1).getFPImm()));
5814 else
5815 return false;
5816 }
5817 Constant *CV = ConstantVector::get(V: Csts);
5818 if (!emitConstantVector(Dst: I.getOperand(i: 0).getReg(), CV, MIRBuilder&: MIB, MRI))
5819 return false;
5820 I.eraseFromParent();
5821 return true;
5822}
5823
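// Illustrative sketch, assuming an FPR-bank build vector whose only defined
// element is the first one:
//   %vec:fpr(<2 x s32>) = G_BUILD_VECTOR %elt:fpr(s32), %undef
//     -> %vec:fpr64 = SUBREG_TO_REG 0, %elt, ssub
// This relies on the remaining lanes being undef, so simply placing %elt in
// the low subregister is enough.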
5824bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5825 MachineInstr &I, MachineRegisterInfo &MRI) {
5826 // Given:
5827 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5828 //
5829 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5830 Register Dst = I.getOperand(i: 0).getReg();
5831 Register EltReg = I.getOperand(i: 1).getReg();
5832 LLT EltTy = MRI.getType(Reg: EltReg);
5833 // If the destination vector isn't on the same register bank as its
5834 // elements, then this can't be a SUBREG_TO_REG.
5835 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5836 const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5837 if (EltRB != DstRB)
5838 return false;
5839 if (any_of(Range: drop_begin(RangeOrContainer: I.operands(), N: 2), P: [&MRI](const MachineOperand &Op) {
5840 return !getOpcodeDef(Opcode: TargetOpcode::G_IMPLICIT_DEF, Reg: Op.getReg(), MRI);
5841 }))
5842 return false;
5843 unsigned SubReg;
5844 const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(Ty: EltTy, RB: EltRB);
5845 if (!EltRC)
5846 return false;
5847 const TargetRegisterClass *DstRC =
5848 getRegClassForTypeOnBank(Ty: MRI.getType(Reg: Dst), RB: DstRB);
5849 if (!DstRC)
5850 return false;
5851 if (!getSubRegForClass(EltRC, TRI, SubReg))
5852 return false;
5853 auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5854 .addImm(0)
5855 .addUse(EltReg)
5856 .addImm(SubReg);
5857 I.eraseFromParent();
5858 constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5859 return RBI.constrainGenericRegister(Reg: Dst, RC: *DstRC, MRI);
5860}
5861
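/// Select a G_BUILD_VECTOR. The constant-vector and SUBREG_TO_REG special
/// cases are tried first; otherwise the vector is built with a scalar-to-vector
/// copy followed by one lane insert per remaining element.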
5862bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5863 MachineRegisterInfo &MRI) {
5864 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5865 // Until we port more of the optimized selections, for now just use a vector
5866 // insert sequence.
5867 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5868 const LLT EltTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
5869 unsigned EltSize = EltTy.getSizeInBits();
5870
5871 if (tryOptConstantBuildVec(I, DstTy, MRI))
5872 return true;
5873 if (tryOptBuildVecToSubregToReg(I, MRI))
5874 return true;
5875
5876 if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
5877 return false; // Don't support all element types yet.
5878 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(i: 1).getReg(), MRI, TRI);
5879
5880 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5881 MachineInstr *ScalarToVec =
5882 emitScalarToVector(EltSize: DstTy.getElementType().getSizeInBits(), DstRC,
5883 Scalar: I.getOperand(i: 1).getReg(), MIRBuilder&: MIB);
5884 if (!ScalarToVec)
5885 return false;
5886
5887 Register DstVec = ScalarToVec->getOperand(i: 0).getReg();
5888 unsigned DstSize = DstTy.getSizeInBits();
5889
5890 // Keep track of the last MI we inserted. Later on, we might be able to save
5891 // a copy using it.
5892 MachineInstr *PrevMI = nullptr;
5893 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5894 // Note that if we don't do a subregister copy, we can end up making an
5895 // extra register.
5896 PrevMI = &*emitLaneInsert(DstReg: std::nullopt, SrcReg: DstVec, EltReg: I.getOperand(i).getReg(),
5897 LaneIdx: i - 1, RB, MIRBuilder&: MIB);
5898 DstVec = PrevMI->getOperand(i: 0).getReg();
5899 }
5900
5901 // If DstTy's size in bits is less than 128, then emit a subregister copy
5902 // from DstVec to the last register we've defined.
5903 if (DstSize < 128) {
5904 // Force this to be FPR using the destination vector.
5905 const TargetRegisterClass *RC =
5906 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5907 if (!RC)
5908 return false;
5909 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5910 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5911 return false;
5912 }
5913
5914 unsigned SubReg = 0;
5915 if (!getSubRegForClass(RC, TRI, SubReg))
5916 return false;
5917 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
                        << ")\n");
5920 return false;
5921 }
5922
5923 Register Reg = MRI.createVirtualRegister(RegClass: RC);
5924 Register DstReg = I.getOperand(i: 0).getReg();
5925
5926 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {}).addReg(RegNo: DstVec, flags: 0, SubReg);
5927 MachineOperand &RegOp = I.getOperand(i: 1);
5928 RegOp.setReg(Reg);
5929 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
5930 } else {
5931 // We don't need a subregister copy. Save a copy by re-using the
5932 // destination register on the final insert.
5933 assert(PrevMI && "PrevMI was null?");
5934 PrevMI->getOperand(i: 0).setReg(I.getOperand(i: 0).getReg());
5935 constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5936 }
5937
5938 I.eraseFromParent();
5939 return true;
5940}
5941
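/// Select a NEON multi-vector load intrinsic, emitting \p Opc and splitting
/// the resulting register tuple back into the intrinsic's \p NumVecs
/// destinations via subregister copies.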
5942bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5943 unsigned NumVecs,
5944 MachineInstr &I) {
5945 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5946 assert(Opc && "Expected an opcode?");
5947 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5948 auto &MRI = *MIB.getMRI();
5949 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5950 unsigned Size = Ty.getSizeInBits();
5951 assert((Size == 64 || Size == 128) &&
5952 "Destination must be 64 bits or 128 bits?");
5953 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
5954 auto Ptr = I.getOperand(i: I.getNumOperands() - 1).getReg();
5955 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5956 auto Load = MIB.buildInstr(Opc, DstOps: {Ty}, SrcOps: {Ptr});
5957 Load.cloneMemRefs(OtherMI: I);
5958 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
5959 Register SelectedLoadDst = Load->getOperand(i: 0).getReg();
5960 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5961 auto Vec = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {I.getOperand(i: Idx)}, SrcOps: {})
5962 .addReg(RegNo: SelectedLoadDst, flags: 0, SubReg: SubReg + Idx);
5963 // Emit the subreg copies and immediately select them.
5964 // FIXME: We should refactor our copy code into an emitCopy helper and
5965 // clean up uses of this pattern elsewhere in the selector.
5966 selectCopy(*Vec, TII, MRI, TRI, RBI);
5967 }
5968 return true;
5969}
5970
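/// As above, but for the lane-wise load intrinsics (ld2lane etc.). The lane
/// index must be constant. 64-bit vectors are widened to 128 bits to form the
/// Q-register tuple and the loaded values are narrowed back afterwards.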
5971bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
5972 unsigned Opc, unsigned NumVecs, MachineInstr &I) {
5973 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5974 assert(Opc && "Expected an opcode?");
5975 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5976 auto &MRI = *MIB.getMRI();
5977 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5978 bool Narrow = Ty.getSizeInBits() == 64;
5979
5980 auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
5981 SmallVector<Register, 4> Regs(NumVecs);
5982 std::transform(first: FirstSrcRegIt, last: FirstSrcRegIt + NumVecs, result: Regs.begin(),
5983 unary_op: [](auto MO) { return MO.getReg(); });
5984
5985 if (Narrow) {
5986 transform(Range&: Regs, d_first: Regs.begin(), F: [this](Register Reg) {
5987 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
5988 ->getOperand(0)
5989 .getReg();
5990 });
5991 Ty = Ty.multiplyElements(Factor: 2);
5992 }
5993
5994 Register Tuple = createQTuple(Regs, MIB);
5995 auto LaneNo = getIConstantVRegVal(VReg: (FirstSrcRegIt + NumVecs)->getReg(), MRI);
5996 if (!LaneNo)
5997 return false;
5998
5999 Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
6000 auto Load = MIB.buildInstr(Opc, DstOps: {Ty}, SrcOps: {})
6001 .addReg(RegNo: Tuple)
6002 .addImm(Val: LaneNo->getZExtValue())
6003 .addReg(RegNo: Ptr);
6004 Load.cloneMemRefs(OtherMI: I);
6005 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
6006 Register SelectedLoadDst = Load->getOperand(i: 0).getReg();
6007 unsigned SubReg = AArch64::qsub0;
6008 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6009 auto Vec = MIB.buildInstr(TargetOpcode::COPY,
6010 {Narrow ? DstOp(&AArch64::FPR128RegClass)
6011 : DstOp(I.getOperand(Idx).getReg())},
6012 {})
6013 .addReg(SelectedLoadDst, 0, SubReg + Idx);
6014 Register WideReg = Vec.getReg(0);
6015 // Emit the subreg copies and immediately select them.
6016 selectCopy(*Vec, TII, MRI, TRI, RBI);
6017 if (Narrow &&
6018 !emitNarrowVector(DstReg: I.getOperand(i: Idx).getReg(), SrcReg: WideReg, MIB, MRI))
6019 return false;
6020 }
6021 return true;
6022}
6023
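/// Select a NEON multi-vector store intrinsic: gather the \p NumVecs source
/// vectors into a D- or Q-register tuple and emit \p Opc with that tuple and
/// the pointer operand.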
6024void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
6025 unsigned NumVecs,
6026 unsigned Opc) {
6027 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6028 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6029 Register Ptr = I.getOperand(i: 1 + NumVecs).getReg();
6030
6031 SmallVector<Register, 2> Regs(NumVecs);
6032 std::transform(first: I.operands_begin() + 1, last: I.operands_begin() + 1 + NumVecs,
6033 result: Regs.begin(), unary_op: [](auto MO) { return MO.getReg(); });
6034
6035 Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
6036 : createDTuple(Regs, MIB);
6037 auto Store = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {Tuple, Ptr});
6038 Store.cloneMemRefs(OtherMI: I);
6039 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6040}
6041
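/// As above, but for the lane-wise store intrinsics (st2lane etc.). The lane
/// index must be constant; 64-bit vectors are widened to 128 bits before the
/// tuple is formed.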
6042bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
6043 MachineInstr &I, unsigned NumVecs, unsigned Opc) {
6044 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6045 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6046 bool Narrow = Ty.getSizeInBits() == 64;
6047
6048 SmallVector<Register, 2> Regs(NumVecs);
6049 std::transform(first: I.operands_begin() + 1, last: I.operands_begin() + 1 + NumVecs,
6050 result: Regs.begin(), unary_op: [](auto MO) { return MO.getReg(); });
6051
6052 if (Narrow)
6053 transform(Range&: Regs, d_first: Regs.begin(), F: [this](Register Reg) {
6054 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6055 ->getOperand(0)
6056 .getReg();
6057 });
6058
6059 Register Tuple = createQTuple(Regs, MIB);
6060
6061 auto LaneNo = getIConstantVRegVal(VReg: I.getOperand(i: 1 + NumVecs).getReg(), MRI);
6062 if (!LaneNo)
6063 return false;
6064 Register Ptr = I.getOperand(i: 1 + NumVecs + 1).getReg();
6065 auto Store = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {})
6066 .addReg(RegNo: Tuple)
6067 .addImm(Val: LaneNo->getZExtValue())
6068 .addReg(RegNo: Ptr);
6069 Store.cloneMemRefs(OtherMI: I);
6070 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6071 return true;
6072}
6073
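/// Select G_INTRINSIC_W_SIDE_EFFECTS instructions that need manual handling:
/// exclusive load pairs, traps, the NEON structured load/store family, and the
/// MOPS memset-with-tag pseudo.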
6074bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
6075 MachineInstr &I, MachineRegisterInfo &MRI) {
6076 // Find the intrinsic ID.
6077 unsigned IntrinID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
6078
6079 const LLT S8 = LLT::scalar(SizeInBits: 8);
6080 const LLT S16 = LLT::scalar(SizeInBits: 16);
6081 const LLT S32 = LLT::scalar(SizeInBits: 32);
6082 const LLT S64 = LLT::scalar(SizeInBits: 64);
6083 const LLT P0 = LLT::pointer(AddressSpace: 0, SizeInBits: 64);
6084 // Select the instruction.
6085 switch (IntrinID) {
6086 default:
6087 return false;
6088 case Intrinsic::aarch64_ldxp:
6089 case Intrinsic::aarch64_ldaxp: {
6090 auto NewI = MIB.buildInstr(
6091 IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
6092 {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
6093 {I.getOperand(3)});
6094 NewI.cloneMemRefs(I);
6095 constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
6096 break;
6097 }
6098 case Intrinsic::trap:
6099 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
6100 break;
6101 case Intrinsic::debugtrap:
6102 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
6103 break;
6104 case Intrinsic::ubsantrap:
6105 MIB.buildInstr(AArch64::BRK, {}, {})
6106 .addImm(I.getOperand(1).getImm() | ('U' << 8));
6107 break;
6108 case Intrinsic::aarch64_neon_ld1x2: {
6109 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6110 unsigned Opc = 0;
6111 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6112 Opc = AArch64::LD1Twov8b;
6113 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6114 Opc = AArch64::LD1Twov16b;
6115 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6116 Opc = AArch64::LD1Twov4h;
6117 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6118 Opc = AArch64::LD1Twov8h;
6119 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6120 Opc = AArch64::LD1Twov2s;
6121 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6122 Opc = AArch64::LD1Twov4s;
6123 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6124 Opc = AArch64::LD1Twov2d;
6125 else if (Ty == S64 || Ty == P0)
6126 Opc = AArch64::LD1Twov1d;
6127 else
6128 llvm_unreachable("Unexpected type for ld1x2!");
6129 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6130 break;
6131 }
6132 case Intrinsic::aarch64_neon_ld1x3: {
6133 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6134 unsigned Opc = 0;
6135 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6136 Opc = AArch64::LD1Threev8b;
6137 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6138 Opc = AArch64::LD1Threev16b;
6139 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6140 Opc = AArch64::LD1Threev4h;
6141 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6142 Opc = AArch64::LD1Threev8h;
6143 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6144 Opc = AArch64::LD1Threev2s;
6145 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6146 Opc = AArch64::LD1Threev4s;
6147 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6148 Opc = AArch64::LD1Threev2d;
6149 else if (Ty == S64 || Ty == P0)
6150 Opc = AArch64::LD1Threev1d;
6151 else
6152 llvm_unreachable("Unexpected type for ld1x3!");
6153 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6154 break;
6155 }
6156 case Intrinsic::aarch64_neon_ld1x4: {
6157 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6158 unsigned Opc = 0;
6159 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6160 Opc = AArch64::LD1Fourv8b;
6161 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6162 Opc = AArch64::LD1Fourv16b;
6163 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6164 Opc = AArch64::LD1Fourv4h;
6165 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6166 Opc = AArch64::LD1Fourv8h;
6167 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6168 Opc = AArch64::LD1Fourv2s;
6169 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6170 Opc = AArch64::LD1Fourv4s;
6171 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6172 Opc = AArch64::LD1Fourv2d;
6173 else if (Ty == S64 || Ty == P0)
6174 Opc = AArch64::LD1Fourv1d;
6175 else
6176 llvm_unreachable("Unexpected type for ld1x4!");
6177 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6178 break;
6179 }
6180 case Intrinsic::aarch64_neon_ld2: {
6181 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6182 unsigned Opc = 0;
6183 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6184 Opc = AArch64::LD2Twov8b;
6185 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6186 Opc = AArch64::LD2Twov16b;
6187 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6188 Opc = AArch64::LD2Twov4h;
6189 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6190 Opc = AArch64::LD2Twov8h;
6191 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6192 Opc = AArch64::LD2Twov2s;
6193 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6194 Opc = AArch64::LD2Twov4s;
6195 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6196 Opc = AArch64::LD2Twov2d;
6197 else if (Ty == S64 || Ty == P0)
6198 Opc = AArch64::LD1Twov1d;
6199 else
6200 llvm_unreachable("Unexpected type for ld2!");
6201 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6202 break;
6203 }
6204 case Intrinsic::aarch64_neon_ld2lane: {
6205 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6206 unsigned Opc;
6207 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6208 Opc = AArch64::LD2i8;
6209 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6210 Opc = AArch64::LD2i16;
6211 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6212 Opc = AArch64::LD2i32;
6213 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6214 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6215 Opc = AArch64::LD2i64;
6216 else
      llvm_unreachable("Unexpected type for ld2lane!");
6218 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 2, I))
6219 return false;
6220 break;
6221 }
6222 case Intrinsic::aarch64_neon_ld2r: {
6223 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6224 unsigned Opc = 0;
6225 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6226 Opc = AArch64::LD2Rv8b;
6227 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6228 Opc = AArch64::LD2Rv16b;
6229 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6230 Opc = AArch64::LD2Rv4h;
6231 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6232 Opc = AArch64::LD2Rv8h;
6233 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6234 Opc = AArch64::LD2Rv2s;
6235 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6236 Opc = AArch64::LD2Rv4s;
6237 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6238 Opc = AArch64::LD2Rv2d;
6239 else if (Ty == S64 || Ty == P0)
6240 Opc = AArch64::LD2Rv1d;
6241 else
6242 llvm_unreachable("Unexpected type for ld2r!");
6243 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6244 break;
6245 }
6246 case Intrinsic::aarch64_neon_ld3: {
6247 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6248 unsigned Opc = 0;
6249 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6250 Opc = AArch64::LD3Threev8b;
6251 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6252 Opc = AArch64::LD3Threev16b;
6253 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6254 Opc = AArch64::LD3Threev4h;
6255 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6256 Opc = AArch64::LD3Threev8h;
6257 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6258 Opc = AArch64::LD3Threev2s;
6259 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6260 Opc = AArch64::LD3Threev4s;
6261 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6262 Opc = AArch64::LD3Threev2d;
6263 else if (Ty == S64 || Ty == P0)
6264 Opc = AArch64::LD1Threev1d;
6265 else
6266 llvm_unreachable("Unexpected type for ld3!");
6267 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6268 break;
6269 }
6270 case Intrinsic::aarch64_neon_ld3lane: {
6271 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6272 unsigned Opc;
6273 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6274 Opc = AArch64::LD3i8;
6275 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6276 Opc = AArch64::LD3i16;
6277 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6278 Opc = AArch64::LD3i32;
6279 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6280 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6281 Opc = AArch64::LD3i64;
6282 else
      llvm_unreachable("Unexpected type for ld3lane!");
6284 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 3, I))
6285 return false;
6286 break;
6287 }
6288 case Intrinsic::aarch64_neon_ld3r: {
6289 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6290 unsigned Opc = 0;
6291 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6292 Opc = AArch64::LD3Rv8b;
6293 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6294 Opc = AArch64::LD3Rv16b;
6295 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6296 Opc = AArch64::LD3Rv4h;
6297 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6298 Opc = AArch64::LD3Rv8h;
6299 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6300 Opc = AArch64::LD3Rv2s;
6301 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6302 Opc = AArch64::LD3Rv4s;
6303 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6304 Opc = AArch64::LD3Rv2d;
6305 else if (Ty == S64 || Ty == P0)
6306 Opc = AArch64::LD3Rv1d;
6307 else
6308 llvm_unreachable("Unexpected type for ld3r!");
6309 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6310 break;
6311 }
6312 case Intrinsic::aarch64_neon_ld4: {
6313 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6314 unsigned Opc = 0;
6315 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6316 Opc = AArch64::LD4Fourv8b;
6317 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6318 Opc = AArch64::LD4Fourv16b;
6319 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6320 Opc = AArch64::LD4Fourv4h;
6321 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6322 Opc = AArch64::LD4Fourv8h;
6323 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6324 Opc = AArch64::LD4Fourv2s;
6325 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6326 Opc = AArch64::LD4Fourv4s;
6327 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6328 Opc = AArch64::LD4Fourv2d;
6329 else if (Ty == S64 || Ty == P0)
6330 Opc = AArch64::LD1Fourv1d;
6331 else
6332 llvm_unreachable("Unexpected type for ld4!");
6333 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6334 break;
6335 }
6336 case Intrinsic::aarch64_neon_ld4lane: {
6337 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6338 unsigned Opc;
6339 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6340 Opc = AArch64::LD4i8;
6341 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6342 Opc = AArch64::LD4i16;
6343 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6344 Opc = AArch64::LD4i32;
6345 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6346 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6347 Opc = AArch64::LD4i64;
6348 else
      llvm_unreachable("Unexpected type for ld4lane!");
6350 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 4, I))
6351 return false;
6352 break;
6353 }
6354 case Intrinsic::aarch64_neon_ld4r: {
6355 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6356 unsigned Opc = 0;
6357 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6358 Opc = AArch64::LD4Rv8b;
6359 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6360 Opc = AArch64::LD4Rv16b;
6361 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6362 Opc = AArch64::LD4Rv4h;
6363 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6364 Opc = AArch64::LD4Rv8h;
6365 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6366 Opc = AArch64::LD4Rv2s;
6367 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6368 Opc = AArch64::LD4Rv4s;
6369 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6370 Opc = AArch64::LD4Rv2d;
6371 else if (Ty == S64 || Ty == P0)
6372 Opc = AArch64::LD4Rv1d;
6373 else
6374 llvm_unreachable("Unexpected type for ld4r!");
6375 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6376 break;
6377 }
6378 case Intrinsic::aarch64_neon_st1x2: {
6379 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6380 unsigned Opc;
6381 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6382 Opc = AArch64::ST1Twov8b;
6383 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6384 Opc = AArch64::ST1Twov16b;
6385 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6386 Opc = AArch64::ST1Twov4h;
6387 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6388 Opc = AArch64::ST1Twov8h;
6389 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6390 Opc = AArch64::ST1Twov2s;
6391 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6392 Opc = AArch64::ST1Twov4s;
6393 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6394 Opc = AArch64::ST1Twov2d;
6395 else if (Ty == S64 || Ty == P0)
6396 Opc = AArch64::ST1Twov1d;
6397 else
6398 llvm_unreachable("Unexpected type for st1x2!");
6399 selectVectorStoreIntrinsic(I, NumVecs: 2, Opc);
6400 break;
6401 }
6402 case Intrinsic::aarch64_neon_st1x3: {
6403 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6404 unsigned Opc;
6405 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6406 Opc = AArch64::ST1Threev8b;
6407 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6408 Opc = AArch64::ST1Threev16b;
6409 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6410 Opc = AArch64::ST1Threev4h;
6411 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6412 Opc = AArch64::ST1Threev8h;
6413 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6414 Opc = AArch64::ST1Threev2s;
6415 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6416 Opc = AArch64::ST1Threev4s;
6417 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6418 Opc = AArch64::ST1Threev2d;
6419 else if (Ty == S64 || Ty == P0)
6420 Opc = AArch64::ST1Threev1d;
6421 else
6422 llvm_unreachable("Unexpected type for st1x3!");
6423 selectVectorStoreIntrinsic(I, NumVecs: 3, Opc);
6424 break;
6425 }
6426 case Intrinsic::aarch64_neon_st1x4: {
6427 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6428 unsigned Opc;
6429 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6430 Opc = AArch64::ST1Fourv8b;
6431 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6432 Opc = AArch64::ST1Fourv16b;
6433 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6434 Opc = AArch64::ST1Fourv4h;
6435 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6436 Opc = AArch64::ST1Fourv8h;
6437 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6438 Opc = AArch64::ST1Fourv2s;
6439 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6440 Opc = AArch64::ST1Fourv4s;
6441 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6442 Opc = AArch64::ST1Fourv2d;
6443 else if (Ty == S64 || Ty == P0)
6444 Opc = AArch64::ST1Fourv1d;
6445 else
6446 llvm_unreachable("Unexpected type for st1x4!");
6447 selectVectorStoreIntrinsic(I, NumVecs: 4, Opc);
6448 break;
6449 }
6450 case Intrinsic::aarch64_neon_st2: {
6451 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6452 unsigned Opc;
6453 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6454 Opc = AArch64::ST2Twov8b;
6455 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6456 Opc = AArch64::ST2Twov16b;
6457 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6458 Opc = AArch64::ST2Twov4h;
6459 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6460 Opc = AArch64::ST2Twov8h;
6461 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6462 Opc = AArch64::ST2Twov2s;
6463 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6464 Opc = AArch64::ST2Twov4s;
6465 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6466 Opc = AArch64::ST2Twov2d;
6467 else if (Ty == S64 || Ty == P0)
6468 Opc = AArch64::ST1Twov1d;
6469 else
6470 llvm_unreachable("Unexpected type for st2!");
6471 selectVectorStoreIntrinsic(I, NumVecs: 2, Opc);
6472 break;
6473 }
6474 case Intrinsic::aarch64_neon_st3: {
6475 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6476 unsigned Opc;
6477 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6478 Opc = AArch64::ST3Threev8b;
6479 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6480 Opc = AArch64::ST3Threev16b;
6481 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6482 Opc = AArch64::ST3Threev4h;
6483 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6484 Opc = AArch64::ST3Threev8h;
6485 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6486 Opc = AArch64::ST3Threev2s;
6487 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6488 Opc = AArch64::ST3Threev4s;
6489 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6490 Opc = AArch64::ST3Threev2d;
6491 else if (Ty == S64 || Ty == P0)
6492 Opc = AArch64::ST1Threev1d;
6493 else
6494 llvm_unreachable("Unexpected type for st3!");
6495 selectVectorStoreIntrinsic(I, NumVecs: 3, Opc);
6496 break;
6497 }
6498 case Intrinsic::aarch64_neon_st4: {
6499 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6500 unsigned Opc;
6501 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6502 Opc = AArch64::ST4Fourv8b;
6503 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6504 Opc = AArch64::ST4Fourv16b;
6505 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6506 Opc = AArch64::ST4Fourv4h;
6507 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6508 Opc = AArch64::ST4Fourv8h;
6509 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6510 Opc = AArch64::ST4Fourv2s;
6511 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6512 Opc = AArch64::ST4Fourv4s;
6513 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6514 Opc = AArch64::ST4Fourv2d;
6515 else if (Ty == S64 || Ty == P0)
6516 Opc = AArch64::ST1Fourv1d;
6517 else
6518 llvm_unreachable("Unexpected type for st4!");
6519 selectVectorStoreIntrinsic(I, NumVecs: 4, Opc);
6520 break;
6521 }
6522 case Intrinsic::aarch64_neon_st2lane: {
6523 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6524 unsigned Opc;
6525 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6526 Opc = AArch64::ST2i8;
6527 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6528 Opc = AArch64::ST2i16;
6529 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6530 Opc = AArch64::ST2i32;
6531 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6532 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6533 Opc = AArch64::ST2i64;
6534 else
6535 llvm_unreachable("Unexpected type for st2lane!");
6536 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 2, Opc))
6537 return false;
6538 break;
6539 }
6540 case Intrinsic::aarch64_neon_st3lane: {
6541 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6542 unsigned Opc;
6543 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6544 Opc = AArch64::ST3i8;
6545 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6546 Opc = AArch64::ST3i16;
6547 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6548 Opc = AArch64::ST3i32;
6549 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6550 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6551 Opc = AArch64::ST3i64;
6552 else
6553 llvm_unreachable("Unexpected type for st3lane!");
6554 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 3, Opc))
6555 return false;
6556 break;
6557 }
6558 case Intrinsic::aarch64_neon_st4lane: {
6559 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6560 unsigned Opc;
6561 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6562 Opc = AArch64::ST4i8;
6563 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6564 Opc = AArch64::ST4i16;
6565 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6566 Opc = AArch64::ST4i32;
6567 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6568 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6569 Opc = AArch64::ST4i64;
6570 else
6571 llvm_unreachable("Unexpected type for st4lane!");
6572 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 4, Opc))
6573 return false;
6574 break;
6575 }
6576 case Intrinsic::aarch64_mops_memset_tag: {
    // Transform
    //   %dst:gpr(p0) = G_INTRINSIC_W_SIDE_EFFECTS
    //       intrinsic(@llvm.aarch64.mops.memset.tag),
    //       %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
    // where %dst is updated, into
    //   %Rd:GPR64common, %Rn:GPR64 = MOPSMemorySetTaggingPseudo
    //       %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
    // where Rd and Rn are tied.
    // It is expected that %val has been extended to s64 in legalization.
    // Note that the order of the size/value operands is swapped.
6588
6589 Register DstDef = I.getOperand(i: 0).getReg();
6590 // I.getOperand(1) is the intrinsic function
6591 Register DstUse = I.getOperand(i: 2).getReg();
6592 Register ValUse = I.getOperand(i: 3).getReg();
6593 Register SizeUse = I.getOperand(i: 4).getReg();
6594
6595 // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
    // Therefore an additional virtual register is required for the updated size
    // operand. This value is not accessible via the semantics of the intrinsic.
6598 Register SizeDef = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
6599
6600 auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
6601 {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
6602 Memset.cloneMemRefs(I);
6603 constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
6604 break;
6605 }
6606 }
6607
6608 I.eraseFromParent();
6609 return true;
6610}
6611
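/// Select plain G_INTRINSIC instructions that need manual handling, currently
/// aarch64_crypto_sha1h, frameaddress/returnaddress, and
/// swift_async_context_addr.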
6612bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
6613 MachineRegisterInfo &MRI) {
6614 unsigned IntrinID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
6615
6616 switch (IntrinID) {
6617 default:
6618 break;
6619 case Intrinsic::aarch64_crypto_sha1h: {
6620 Register DstReg = I.getOperand(i: 0).getReg();
6621 Register SrcReg = I.getOperand(i: 2).getReg();
6622
6623 // FIXME: Should this be an assert?
6624 if (MRI.getType(Reg: DstReg).getSizeInBits() != 32 ||
6625 MRI.getType(Reg: SrcReg).getSizeInBits() != 32)
6626 return false;
6627
6628 // The operation has to happen on FPRs. Set up some new FPR registers for
6629 // the source and destination if they are on GPRs.
6630 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
6631 SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6632 MIB.buildCopy(Res: {SrcReg}, Op: {I.getOperand(i: 2)});
6633
6634 // Make sure the copy ends up getting constrained properly.
6635 RBI.constrainGenericRegister(I.getOperand(2).getReg(),
6636 AArch64::GPR32RegClass, MRI);
6637 }
6638
6639 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
6640 DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6641
6642 // Actually insert the instruction.
6643 auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
6644 constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
6645
6646 // Did we create a new register for the destination?
6647 if (DstReg != I.getOperand(i: 0).getReg()) {
6648 // Yep. Copy the result of the instruction back into the original
6649 // destination.
6650 MIB.buildCopy(Res: {I.getOperand(i: 0)}, Op: {DstReg});
6651 RBI.constrainGenericRegister(I.getOperand(0).getReg(),
6652 AArch64::GPR32RegClass, MRI);
6653 }
6654
6655 I.eraseFromParent();
6656 return true;
6657 }
6658 case Intrinsic::frameaddress:
6659 case Intrinsic::returnaddress: {
6660 MachineFunction &MF = *I.getParent()->getParent();
6661 MachineFrameInfo &MFI = MF.getFrameInfo();
6662
6663 unsigned Depth = I.getOperand(i: 2).getImm();
6664 Register DstReg = I.getOperand(i: 0).getReg();
6665 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6666
6667 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
6668 if (!MFReturnAddr) {
6669 // Insert the copy from LR/X30 into the entry block, before it can be
6670 // clobbered by anything.
6671 MFI.setReturnAddressIsTaken(true);
6672 MFReturnAddr = getFunctionLiveInPhysReg(
6673 MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
6674 }
6675
6676 if (STI.hasPAuth()) {
6677 MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
6678 } else {
6679 MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
6680 MIB.buildInstr(AArch64::XPACLRI);
6681 MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6682 }
6683
6684 I.eraseFromParent();
6685 return true;
6686 }
6687
6688 MFI.setFrameAddressIsTaken(true);
6689 Register FrameAddr(AArch64::FP);
6690 while (Depth--) {
6691 Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
6692 auto Ldr =
6693 MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
6694 constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
6695 FrameAddr = NextFrame;
6696 }
6697
6698 if (IntrinID == Intrinsic::frameaddress)
6699 MIB.buildCopy(Res: {DstReg}, Op: {FrameAddr});
6700 else {
6701 MFI.setReturnAddressIsTaken(true);
6702
6703 if (STI.hasPAuth()) {
6704 Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
6705 MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
6706 MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
6707 } else {
6708 MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
6709 .addImm(1);
6710 MIB.buildInstr(AArch64::XPACLRI);
6711 MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6712 }
6713 }
6714
6715 I.eraseFromParent();
6716 return true;
6717 }
6718 case Intrinsic::swift_async_context_addr:
6719 auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
6720 {Register(AArch64::FP)})
6721 .addImm(8)
6722 .addImm(0);
6723 constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
6724
6725 MF->getFrameInfo().setFrameAddressIsTaken(true);
6726 MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6727 I.eraseFromParent();
6728 return true;
6729 }
6730 return false;
6731}
6732
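/// Renderers that encode a shift amount Imm as (BitWidth - Imm) modulo
/// BitWidth (the ShiftA variants) or as BitWidth - 1 - Imm (the ShiftB
/// variants), for 32-bit and 64-bit shifts respectively.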
6733InstructionSelector::ComplexRendererFns
6734AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
6735 auto MaybeImmed = getImmedFromMO(Root);
6736 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6737 return std::nullopt;
6738 uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
6739 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
6740}
6741
6742InstructionSelector::ComplexRendererFns
6743AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
6744 auto MaybeImmed = getImmedFromMO(Root);
6745 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6746 return std::nullopt;
6747 uint64_t Enc = 31 - *MaybeImmed;
6748 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
6749}
6750
6751InstructionSelector::ComplexRendererFns
6752AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
6753 auto MaybeImmed = getImmedFromMO(Root);
6754 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6755 return std::nullopt;
6756 uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
6757 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
6758}
6759
6760InstructionSelector::ComplexRendererFns
6761AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
6762 auto MaybeImmed = getImmedFromMO(Root);
6763 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6764 return std::nullopt;
6765 uint64_t Enc = 63 - *MaybeImmed;
6766 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
6767}
6768
6769/// Helper to select an immediate value that can be represented as a 12-bit
6770/// value shifted left by either 0 or 12. If it is possible to do so, return
6771/// the immediate and shift value. If not, return std::nullopt.
6772///
6773/// Used by selectArithImmed and selectNegArithImmed.
6774InstructionSelector::ComplexRendererFns
6775AArch64InstructionSelector::select12BitValueWithLeftShift(
6776 uint64_t Immed) const {
6777 unsigned ShiftAmt;
6778 if (Immed >> 12 == 0) {
6779 ShiftAmt = 0;
6780 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
6781 ShiftAmt = 12;
6782 Immed = Immed >> 12;
6783 } else
6784 return std::nullopt;
6785
6786 unsigned ShVal = AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: ShiftAmt);
6787 return {{
6788 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Immed); },
6789 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ShVal); },
6790 }};
6791}
6792
6793/// SelectArithImmed - Select an immediate value that can be represented as
6794/// a 12-bit value shifted left by either 0 or 12. If so, return true with
6795/// Val set to the 12-bit value and Shift set to the shifter operand.
6796InstructionSelector::ComplexRendererFns
6797AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
  // This function is called from the addsub_shifted_imm ComplexPattern, which
  // lists [imm] as the list of opcodes it's interested in. However, we still
  // need to check whether the operand is actually an immediate here, because
  // the ComplexPattern opcode list is only used in root-level opcode matching.
6803 auto MaybeImmed = getImmedFromMO(Root);
6804 if (MaybeImmed == std::nullopt)
6805 return std::nullopt;
6806 return select12BitValueWithLeftShift(Immed: *MaybeImmed);
6807}
6808
6809/// SelectNegArithImmed - As above, but negates the value before trying to
6810/// select it.
6811InstructionSelector::ComplexRendererFns
6812AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
6813 // We need a register here, because we need to know if we have a 64 or 32
6814 // bit immediate.
6815 if (!Root.isReg())
6816 return std::nullopt;
6817 auto MaybeImmed = getImmedFromMO(Root);
6818 if (MaybeImmed == std::nullopt)
6819 return std::nullopt;
6820 uint64_t Immed = *MaybeImmed;
6821
6822 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
6823 // have the opposite effect on the C flag, so this pattern mustn't match under
6824 // those circumstances.
6825 if (Immed == 0)
6826 return std::nullopt;
6827
6828 // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
6829 // the root.
6830 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6831 if (MRI.getType(Reg: Root.getReg()).getSizeInBits() == 32)
6832 Immed = ~((uint32_t)Immed) + 1;
6833 else
6834 Immed = ~Immed + 1ULL;
6835
6836 if (Immed & 0xFFFFFFFFFF000000ULL)
6837 return std::nullopt;
6838
6839 Immed &= 0xFFFFFFULL;
6840 return select12BitValueWithLeftShift(Immed);
6841}
6842
6843/// Return true if it is worth folding MI into an extended register. That is,
6844/// if it's safe to pull it into the addressing mode of a load or store as a
6845/// shift.
6846bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
6847 MachineInstr &MI, const MachineRegisterInfo &MRI) const {
6848 // Always fold if there is one use, or if we're optimizing for size.
6849 Register DefReg = MI.getOperand(i: 0).getReg();
6850 if (MRI.hasOneNonDBGUse(RegNo: DefReg) ||
6851 MI.getParent()->getParent()->getFunction().hasOptSize())
6852 return true;
6853
6854 // It's better to avoid folding and recomputing shifts when we don't have a
6855 // fastpath.
6856 if (!STI.hasAddrLSLFast())
6857 return false;
6858
6859 // We have a fastpath, so folding a shift in and potentially computing it
6860 // many times may be beneficial. Check if this is only used in memory ops.
6861 // If it is, then we should fold.
6862 return all_of(Range: MRI.use_nodbg_instructions(Reg: DefReg),
6863 P: [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
6864}
6865
6866static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
6867 switch (Type) {
6868 case AArch64_AM::SXTB:
6869 case AArch64_AM::SXTH:
6870 case AArch64_AM::SXTW:
6871 return true;
6872 default:
6873 return false;
6874 }
6875}
6876
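/// Attempt to fold an offset of the form (G_SHL x, LegalShift) or
/// (G_MUL x, 1 << LegalShift), optionally behind a G_ZEXT when \p WantsExt is
/// set, into an extended-register addressing mode. \p SizeInBytes is the width
/// of the memory access and determines the only legal shift amount.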
6877InstructionSelector::ComplexRendererFns
6878AArch64InstructionSelector::selectExtendedSHL(
6879 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
6880 unsigned SizeInBytes, bool WantsExt) const {
6881 assert(Base.isReg() && "Expected base to be a register operand");
6882 assert(Offset.isReg() && "Expected offset to be a register operand");
6883
6884 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6885 MachineInstr *OffsetInst = MRI.getVRegDef(Reg: Offset.getReg());
6886
6887 unsigned OffsetOpc = OffsetInst->getOpcode();
6888 bool LookedThroughZExt = false;
6889 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
6890 // Try to look through a ZEXT.
6891 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
6892 return std::nullopt;
6893
6894 OffsetInst = MRI.getVRegDef(Reg: OffsetInst->getOperand(i: 1).getReg());
6895 OffsetOpc = OffsetInst->getOpcode();
6896 LookedThroughZExt = true;
6897
6898 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
6899 return std::nullopt;
6900 }
6901 // Make sure that the memory op is a valid size.
6902 int64_t LegalShiftVal = Log2_32(Value: SizeInBytes);
6903 if (LegalShiftVal == 0)
6904 return std::nullopt;
6905 if (!isWorthFoldingIntoExtendedReg(MI&: *OffsetInst, MRI))
6906 return std::nullopt;
6907
6908 // Now, try to find the specific G_CONSTANT. Start by assuming that the
6909 // register we will offset is the LHS, and the register containing the
6910 // constant is the RHS.
6911 Register OffsetReg = OffsetInst->getOperand(i: 1).getReg();
6912 Register ConstantReg = OffsetInst->getOperand(i: 2).getReg();
6913 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
6914 if (!ValAndVReg) {
6915 // We didn't get a constant on the RHS. If the opcode is a shift, then
6916 // we're done.
6917 if (OffsetOpc == TargetOpcode::G_SHL)
6918 return std::nullopt;
6919
6920 // If we have a G_MUL, we can use either register. Try looking at the RHS.
6921 std::swap(a&: OffsetReg, b&: ConstantReg);
6922 ValAndVReg = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
6923 if (!ValAndVReg)
6924 return std::nullopt;
6925 }
6926
6927 // The value must fit into 3 bits, and must be positive. Make sure that is
6928 // true.
6929 int64_t ImmVal = ValAndVReg->Value.getSExtValue();
6930
6931 // Since we're going to pull this into a shift, the constant value must be
6932 // a power of 2. If we got a multiply, then we need to check this.
6933 if (OffsetOpc == TargetOpcode::G_MUL) {
6934 if (!llvm::has_single_bit<uint32_t>(Value: ImmVal))
6935 return std::nullopt;
6936
6937 // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
6938 ImmVal = Log2_32(Value: ImmVal);
6939 }
6940
6941 if ((ImmVal & 0x7) != ImmVal)
6942 return std::nullopt;
6943
6944 // We are only allowed to shift by LegalShiftVal. This shift value is built
6945 // into the instruction, so we can't just use whatever we want.
6946 if (ImmVal != LegalShiftVal)
6947 return std::nullopt;
6948
6949 unsigned SignExtend = 0;
6950 if (WantsExt) {
6951 // Check if the offset is defined by an extend, unless we looked through a
6952 // G_ZEXT earlier.
6953 if (!LookedThroughZExt) {
6954 MachineInstr *ExtInst = getDefIgnoringCopies(Reg: OffsetReg, MRI);
6955 auto Ext = getExtendTypeForInst(MI&: *ExtInst, MRI, IsLoadStore: true);
6956 if (Ext == AArch64_AM::InvalidShiftExtend)
6957 return std::nullopt;
6958
6959 SignExtend = isSignExtendShiftType(Type: Ext) ? 1 : 0;
6960 // We only support SXTW for signed extension here.
6961 if (SignExtend && Ext != AArch64_AM::SXTW)
6962 return std::nullopt;
6963 OffsetReg = ExtInst->getOperand(i: 1).getReg();
6964 }
6965
6966 // Need a 32-bit wide register here.
6967 MachineIRBuilder MIB(*MRI.getVRegDef(Reg: Root.getReg()));
6968 OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
6969 }
6970
6971 // We can use the LHS of the GEP as the base, and the LHS of the shift as an
6972 // offset. Signify that we are shifting by setting the shift flag to 1.
6973 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: Base.getReg()); },
6974 [=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: OffsetReg); },
6975 [=](MachineInstrBuilder &MIB) {
6976 // Need to add both immediates here to make sure that they are both
6977 // added to the instruction.
6978 MIB.addImm(Val: SignExtend);
6979 MIB.addImm(Val: 1);
6980 }}};
6981}
6982
6983/// This is used for computing addresses like this:
6984///
6985/// ldr x1, [x2, x3, lsl #3]
6986///
6987/// Where x2 is the base register, and x3 is an offset register. The shift-left
6988/// is a constant value specific to this load instruction. That is, we'll never
/// see anything other than a 3 here (which corresponds to the size of the
/// element being loaded).
6991InstructionSelector::ComplexRendererFns
6992AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
6993 MachineOperand &Root, unsigned SizeInBytes) const {
6994 if (!Root.isReg())
6995 return std::nullopt;
6996 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6997
6998 // We want to find something like this:
6999 //
7000 // val = G_CONSTANT LegalShiftVal
7001 // shift = G_SHL off_reg val
7002 // ptr = G_PTR_ADD base_reg shift
7003 // x = G_LOAD ptr
7004 //
7005 // And fold it into this addressing mode:
7006 //
7007 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
7008
7009 // Check if we can find the G_PTR_ADD.
7010 MachineInstr *PtrAdd =
7011 getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
7012 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(MI&: *PtrAdd, MRI))
7013 return std::nullopt;
7014
7015 // Now, try to match an opcode which will match our specific offset.
7016 // We want a G_SHL or a G_MUL.
7017 MachineInstr *OffsetInst =
7018 getDefIgnoringCopies(Reg: PtrAdd->getOperand(i: 2).getReg(), MRI);
7019 return selectExtendedSHL(Root, Base&: PtrAdd->getOperand(i: 1),
7020 Offset&: OffsetInst->getOperand(i: 0), SizeInBytes,
7021 /*WantsExt=*/false);
7022}
7023
7024/// This is used for computing addresses like this:
7025///
7026/// ldr x1, [x2, x3]
7027///
7028/// Where x2 is the base register, and x3 is an offset register.
7029///
7030/// When possible (or profitable) to fold a G_PTR_ADD into the address
7031/// calculation, this will do so. Otherwise, it will return std::nullopt.
7032InstructionSelector::ComplexRendererFns
7033AArch64InstructionSelector::selectAddrModeRegisterOffset(
7034 MachineOperand &Root) const {
7035 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7036
7037 // We need a GEP.
7038 MachineInstr *Gep = MRI.getVRegDef(Reg: Root.getReg());
7039 if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
7040 return std::nullopt;
7041
7042 // If this is used more than once, let's not bother folding.
7043 // TODO: Check if they are memory ops. If they are, then we can still fold
7044 // without having to recompute anything.
7045 if (!MRI.hasOneNonDBGUse(RegNo: Gep->getOperand(i: 0).getReg()))
7046 return std::nullopt;
7047
7048 // Base is the GEP's LHS, offset is its RHS.
7049 return {{[=](MachineInstrBuilder &MIB) {
7050 MIB.addUse(RegNo: Gep->getOperand(i: 1).getReg());
7051 },
7052 [=](MachineInstrBuilder &MIB) {
7053 MIB.addUse(RegNo: Gep->getOperand(i: 2).getReg());
7054 },
7055 [=](MachineInstrBuilder &MIB) {
7056 // Need to add both immediates here to make sure that they are both
7057 // added to the instruction.
7058 MIB.addImm(Val: 0);
7059 MIB.addImm(Val: 0);
7060 }}};
7061}
7062
7063/// This is intended to be equivalent to selectAddrModeXRO in
7064/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
7065InstructionSelector::ComplexRendererFns
7066AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
7067 unsigned SizeInBytes) const {
7068 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7069 if (!Root.isReg())
7070 return std::nullopt;
7071 MachineInstr *PtrAdd =
7072 getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
7073 if (!PtrAdd)
7074 return std::nullopt;
7075
  // Check for an immediate which cannot be encoded in the [base + imm]
  // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
7078 // end up with code like:
7079 //
7080 // mov x0, wide
7081 // add x1 base, x0
7082 // ldr x2, [x1, x0]
7083 //
7084 // In this situation, we can use the [base, xreg] addressing mode to save an
7085 // add/sub:
7086 //
7087 // mov x0, wide
7088 // ldr x2, [base, x0]
7089 auto ValAndVReg =
7090 getIConstantVRegValWithLookThrough(VReg: PtrAdd->getOperand(i: 2).getReg(), MRI);
7091 if (ValAndVReg) {
7092 unsigned Scale = Log2_32(Value: SizeInBytes);
7093 int64_t ImmOff = ValAndVReg->Value.getSExtValue();
7094
    // Skip immediates that can be selected in the load/store addressing
    // mode.
7097 if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
7098 ImmOff < (0x1000 << Scale))
7099 return std::nullopt;
7100
7101 // Helper lambda to decide whether or not it is preferable to emit an add.
7102 auto isPreferredADD = [](int64_t ImmOff) {
7103 // Constants in [0x0, 0xfff] can be encoded in an add.
7104 if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
7105 return true;
7106
7107 // Can it be encoded in an add lsl #12?
7108 if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
7109 return false;
7110
7111 // It can be encoded in an add lsl #12, but we may not want to. If it is
7112 // possible to select this as a single movz, then prefer that. A single
7113 // movz is faster than an add with a shift.
7114 return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
7115 (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
7116 };

    // If the immediate can be encoded in a single add/sub, then bail out.
    if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
      return std::nullopt;
  }

  // Try to fold shifts into the addressing mode.
  auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
  if (AddrModeFns)
    return AddrModeFns;

  // If that doesn't work, see if it's possible to fold in registers from
  // a GEP.
  return selectAddrModeRegisterOffset(Root);
}

/// This is used for computing addresses like this:
///
/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
///
/// Where we have a 64-bit base register, a 32-bit offset register, and an
/// extend (which may or may not be signed).
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
                                              unsigned SizeInBytes) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  MachineInstr *PtrAdd =
      getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
  if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
    return std::nullopt;

  MachineOperand &LHS = PtrAdd->getOperand(1);
  MachineOperand &RHS = PtrAdd->getOperand(2);
  MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);

  // The first case is the same as selectAddrModeXRO, except we need an extend.
  // In this case, we try to find a shift and extend, and fold them into the
  // addressing mode.
  //
  // E.g.
  //
  // off_reg = G_Z/S/ANYEXT ext_reg
  // val = G_CONSTANT LegalShiftVal
  // shift = G_SHL off_reg val
  // ptr = G_PTR_ADD base_reg shift
  // x = G_LOAD ptr
  //
  // In this case we can get a load like this:
  //
  // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
  auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
                                       SizeInBytes, /*WantsExt=*/true);
  if (ExtendedShl)
    return ExtendedShl;

  // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
  //
  // e.g.
  // ldr something, [base_reg, ext_reg, sxtw]
  if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
    return std::nullopt;

  // Check if this is an extend. We'll get an extend type if it is.
  AArch64_AM::ShiftExtendType Ext =
      getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
  if (Ext == AArch64_AM::InvalidShiftExtend)
    return std::nullopt;

  // Need a 32-bit wide register.
  MachineIRBuilder MIB(*PtrAdd);
  Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
                                       AArch64::GPR32RegClass, MIB);
  unsigned SignExtend = Ext == AArch64_AM::SXTW;

  // Base is LHS, offset is ExtReg.
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
           [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addImm(SignExtend);
             MIB.addImm(0);
           }}};
}

/// Select a "register plus unscaled signed 9-bit immediate" address. This
/// should only match when there is an offset that is not valid for a scaled
/// immediate addressing mode. The "Size" argument is the size in bytes of the
/// memory reference, which is needed here to know what is valid for a scaled
/// immediate.
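/// For example, with a 4-byte access an offset of -17 cannot use the scaled
/// form, but it does fit the LDUR/STUR signed 9-bit range of [-256, 255].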
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
                                                   unsigned Size) const {
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  if (!Root.isReg())
    return std::nullopt;

  if (!isBaseWithConstantOffset(Root, MRI))
    return std::nullopt;

  MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());

  MachineOperand &OffImm = RootDef->getOperand(2);
  if (!OffImm.isReg())
    return std::nullopt;
  MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
  if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
    return std::nullopt;
  int64_t RHSC;
  MachineOperand &RHSOp1 = RHS->getOperand(1);
  if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
    return std::nullopt;
  RHSC = RHSOp1.getCImm()->getSExtValue();

  if (RHSC >= -256 && RHSC < 256) {
    MachineOperand &Base = RootDef->getOperand(1);
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
    }};
  }
  return std::nullopt;
}

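/// Try to fold the low 12 bits of a small-code-model ADRP + G_ADD_LOW address
/// into a load/store immediate. For example:
///
///   adrp x8, var ; add x8, x8, :lo12:var ; ldr w0, [x8]
///     =>
///   adrp x8, var ; ldr w0, [x8, :lo12:var]
///
/// The global's alignment must guarantee that the :lo12: offset is a multiple
/// of the access size.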
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
                                                 unsigned Size,
                                                 MachineRegisterInfo &MRI) const {
  if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
    return std::nullopt;
  MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
  if (Adrp.getOpcode() != AArch64::ADRP)
    return std::nullopt;

  // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
  auto Offset = Adrp.getOperand(1).getOffset();
  if (Offset % Size != 0)
    return std::nullopt;

  auto GV = Adrp.getOperand(1).getGlobal();
  if (GV->isThreadLocal())
    return std::nullopt;

  auto &MF = *RootDef.getParent()->getParent();
  if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
    return std::nullopt;

  unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
  MachineIRBuilder MIRBuilder(RootDef);
  Register AdrpReg = Adrp.getOperand(0).getReg();
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addGlobalAddress(GV, Offset,
                                  OpFlags | AArch64II::MO_PAGEOFF |
                                      AArch64II::MO_NC);
           }}};
}

/// Select a "register plus scaled unsigned 12-bit immediate" address. The
/// "Size" argument is the size in bytes of the memory reference, which
/// determines the scale.
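/// For example, an 8-byte access uses a scale of 8, so byte offsets
/// 0, 8, ..., 32760 (imm12 * 8) can be selected by this addressing mode.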
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
                                                  unsigned Size) const {
  MachineFunction &MF = *Root.getParent()->getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (!Root.isReg())
    return std::nullopt;

  MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
  if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
    }};
  }

  CodeModel::Model CM = MF.getTarget().getCodeModel();
  // Check if we can fold in the ADD of small code model ADRP + ADD address.
  if (CM == CodeModel::Small) {
    auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
    if (OpFns)
      return OpFns;
  }

  if (isBaseWithConstantOffset(Root, MRI)) {
    MachineOperand &LHS = RootDef->getOperand(1);
    MachineOperand &RHS = RootDef->getOperand(2);
    MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
    MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());

    int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
    unsigned Scale = Log2_32(Size);
    if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
      if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
        return {{
            [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
        }};

      return {{
          [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
          [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
      }};
    }
  }

  // Before falling back to our general case, check if the unscaled
  // instructions can handle this. If so, that's preferable.
  if (selectAddrModeUnscaled(Root, Size))
    return std::nullopt;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
  }};
}

/// Given a shift instruction, return the correct shift type for that
/// instruction.
static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return AArch64_AM::InvalidShiftExtend;
  case TargetOpcode::G_SHL:
    return AArch64_AM::LSL;
  case TargetOpcode::G_LSHR:
    return AArch64_AM::LSR;
  case TargetOpcode::G_ASHR:
    return AArch64_AM::ASR;
  case TargetOpcode::G_ROTR:
    return AArch64_AM::ROR;
  }
}

/// Select a "shifted register" operand. If the value is not shifted, set the
/// shift operand to a default value of "lsl 0".
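/// For example, folding a G_SHL by 3 into an ADD's second operand produces
/// "add x0, x1, x2, lsl #3".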
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
                                                  bool AllowROR) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  // Check if the operand is defined by an instruction which corresponds to
  // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
  MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
  AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
  if (ShType == AArch64_AM::InvalidShiftExtend)
    return std::nullopt;
  if (ShType == AArch64_AM::ROR && !AllowROR)
    return std::nullopt;
  if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
    return std::nullopt;

  // Need an immediate on the RHS.
  MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
  auto Immed = getImmedFromMO(ShiftRHS);
  if (!Immed)
    return std::nullopt;

  // We have something that we can fold. Fold in the shift's LHS and RHS into
  // the instruction.
  MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
  Register ShiftReg = ShiftLHS.getReg();

  unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
  unsigned Val = *Immed & (NumBits - 1);
  unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
}

AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
    MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
  unsigned Opc = MI.getOpcode();

  // Handle explicit extend instructions first.
  if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
    unsigned Size;
    if (Opc == TargetOpcode::G_SEXT)
      Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    else
      Size = MI.getOperand(2).getImm();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
    case 32:
      return AArch64_AM::SXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }

  if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
    case 32:
      return AArch64_AM::UXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }

  // Don't have an explicit extend. Try to handle a G_AND with a constant mask
  // on the RHS.
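  // For example, %v = G_AND %x, 0xFF keeps only the low byte, which behaves
  // like an 8-bit zero-extend (UXTB) for the arithmetic extended-register
  // forms.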
  if (Opc != TargetOpcode::G_AND)
    return AArch64_AM::InvalidShiftExtend;

  std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
  if (!MaybeAndMask)
    return AArch64_AM::InvalidShiftExtend;
  uint64_t AndMask = *MaybeAndMask;
  switch (AndMask) {
  default:
    return AArch64_AM::InvalidShiftExtend;
  case 0xFF:
    return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
  case 0xFFFF:
    return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
  case 0xFFFFFFFF:
    return AArch64_AM::UXTW;
  }
}

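/// Put the scalar value in \p Reg into the register class \p RC. If the
/// value's size already matches the class's register size, \p Reg is returned
/// unchanged; otherwise a COPY into the class is built and selected
/// immediately.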
Register AArch64InstructionSelector::moveScalarRegClass(
    Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  auto Ty = MRI.getType(Reg);
  assert(!Ty.isVector() && "Expected scalars only!");
  if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
    return Reg;

  // Create a copy and immediately select it.
  // FIXME: We should have an emitCopy function?
  auto Copy = MIB.buildCopy({&RC}, {Reg});
  selectCopy(*Copy, TII, MRI, TRI, RBI);
  return Copy.getReg(0);
}

/// Select an "extended register" operand. This operand folds in an extend
/// followed by an optional left shift.
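/// For example, a G_SEXT from s16 followed by a G_SHL by 2 can be folded into
/// "add x0, x1, w2, sxth #2".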
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectArithExtendedRegister(
    MachineOperand &Root) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  uint64_t ShiftVal = 0;
  Register ExtReg;
  AArch64_AM::ShiftExtendType Ext;
  MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
  if (!RootDef)
    return std::nullopt;

  if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
    return std::nullopt;

  // Check if we can fold a shift and an extend.
  if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
    // Look for a constant on the RHS of the shift.
    MachineOperand &RHS = RootDef->getOperand(2);
    std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
    if (!MaybeShiftVal)
      return std::nullopt;
    ShiftVal = *MaybeShiftVal;
    if (ShiftVal > 4)
      return std::nullopt;
    // Look for a valid extend instruction on the LHS of the shift.
    MachineOperand &LHS = RootDef->getOperand(1);
    MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
    if (!ExtDef)
      return std::nullopt;
    Ext = getExtendTypeForInst(*ExtDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return std::nullopt;
    ExtReg = ExtDef->getOperand(1).getReg();
  } else {
    // Didn't get a shift. Try just folding an extend.
    Ext = getExtendTypeForInst(*RootDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return std::nullopt;
    ExtReg = RootDef->getOperand(1).getReg();

    // If we have a 32 bit instruction which zeroes out the high half of a
    // register, we get an implicit zero extend for free. Check if we have one.
    // FIXME: We actually emit the extend right now even though we don't have
    // to.
    if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
      MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
      if (isDef32(*ExtInst))
        return std::nullopt;
    }
  }

  // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
  // copy.
  MachineIRBuilder MIB(*RootDef);
  ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addImm(getArithExtendImm(Ext, ShiftVal));
           }}};
}

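/// Select an operand which is the high half of a 128-bit vector: the root must
/// be either the second result of a G_UNMERGE_VALUES or a G_EXTRACT_VECTOR_ELT
/// of lane 1 from a <2 x s64> vector (looking through bitcasts on
/// little-endian targets). The full source vector register is rendered as the
/// operand.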
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
  if (!Root.isReg())
    return std::nullopt;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  auto Extract = getDefSrcRegIgnoringCopies(Root.getReg(), MRI);
  while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
         STI.isLittleEndian())
    Extract =
        getDefSrcRegIgnoringCopies(Extract->MI->getOperand(1).getReg(), MRI);
  if (!Extract)
    return std::nullopt;

  if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
    if (Extract->Reg == Extract->MI->getOperand(1).getReg()) {
      Register ExtReg = Extract->MI->getOperand(2).getReg();
      return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
    }
  }
  if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
    LLT SrcTy = MRI.getType(Extract->MI->getOperand(1).getReg());
    auto LaneIdx = getIConstantVRegValWithLookThrough(
        Extract->MI->getOperand(2).getReg(), MRI);
    if (LaneIdx && SrcTy == LLT::fixed_vector(2, 64) &&
        LaneIdx->Value.getSExtValue() == 1) {
      Register ExtReg = Extract->MI->getOperand(1).getReg();
      return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
    }
  }

  return std::nullopt;
}

void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  std::optional<int64_t> CstVal =
      getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(*CstVal);
}

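// AArch64 logical-immediate operands (for AND/ORR/EOR and friends) are bitmask
// immediates: a run of set bits, rotated and replicated across the register.
// encodeLogicalImmediate() produces the N:immr:imms encoding those
// instructions expect.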
void AArch64InstructionSelector::renderLogicalImm32(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderLogicalImm64(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
                                                      .getFPImm()
                                                      ->getValueAPF()
                                                      .bitcastToAPInt()
                                                      .getZExtValue()));
}

bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
    const MachineInstr &MI, unsigned NumBytes) const {
  if (!MI.mayLoadOrStore())
    return false;
  assert(MI.hasOneMemOperand() &&
         "Expected load/store to have only one mem op!");
  return (*MI.memoperands_begin())->getSize() == NumBytes;
}

bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
    return false;

  // Only return true if we know the operation will zero-out the high half of
  // the 64-bit register. Truncates can be subregister copies, which don't
  // zero out the high bits. Copies and other copy-like instructions can be
  // fed by truncates, or could be lowered as subregister copies.
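  // For example, %t:gpr(s32) = G_TRUNC %x:gpr(s64) may be selected as a plain
  // subregister copy, which leaves bits [63:32] of the underlying X register
  // unchanged, so it cannot be treated as an implicit zero-extend.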
  switch (MI.getOpcode()) {
  default:
    return true;
  case TargetOpcode::COPY:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_PHI:
    return false;
  }
}

// Perform fixups on the given PHI instruction's operands to force them all
// to be the same as the destination regbank.
static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64RegisterBankInfo &RBI) {
  assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
  Register DstReg = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
  assert(DstRB && "Expected PHI dst to have regbank assigned");
  MachineIRBuilder MIB(MI);

  // Go through each operand and ensure it has the same regbank.
  for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
    if (!MO.isReg())
      continue;
    Register OpReg = MO.getReg();
    const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
    if (RB != DstRB) {
      // Insert a cross-bank copy.
      auto *OpDef = MRI.getVRegDef(OpReg);
      const LLT &Ty = MRI.getType(OpReg);
      MachineBasicBlock &OpDefBB = *OpDef->getParent();

      // Any instruction we insert must appear after all PHIs in the block
      // for the block to be valid MIR.
      MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
      if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
        InsertPt = OpDefBB.getFirstNonPHI();
      MIB.setInsertPt(*OpDef->getParent(), InsertPt);
      auto Copy = MIB.buildCopy(Ty, OpReg);
      MRI.setRegBank(Copy.getReg(0), *DstRB);
      MO.setReg(Copy.getReg(0));
    }
  }
}

void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
  // We're looking for PHIs, build a list so we don't invalidate iterators.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<MachineInstr *, 32> Phis;
  for (auto &BB : MF) {
    for (auto &MI : BB) {
      if (MI.getOpcode() == TargetOpcode::G_PHI)
        Phis.emplace_back(&MI);
    }
  }

  for (auto *MI : Phis) {
    // We need to do some work here if the operand types are < 16 bit and they
    // are split across fpr/gpr banks. Since all types <32b on gpr
    // end up being assigned gpr32 regclasses, we can end up with PHIs here
    // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
    // be selecting heterogeneous regbanks for operands if possible, but we
    // still need to be able to deal with it here.
    //
    // To fix this, if we have a gpr-bank operand < 32b in size and at least
    // one other operand is on the fpr bank, then we add cross-bank copies
    // to homogenize the operand banks. For simplicity the bank that we choose
    // to settle on is whatever bank the def operand has. For example:
    //
    // %endbb:
    // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
    // =>
    // %bb2:
    // ...
    // %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
    // ...
    // %endbb:
    // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
    bool HasGPROp = false, HasFPROp = false;
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
      if (!MO.isReg())
        continue;
      const LLT &Ty = MRI.getType(MO.getReg());
      if (!Ty.isValid() || !Ty.isScalar())
        break;
      if (Ty.getSizeInBits() >= 32)
        break;
      const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
      // If for some reason we don't have a regbank yet, don't try anything.
      if (!RB)
        break;

      if (RB->getID() == AArch64::GPRRegBankID)
        HasGPROp = true;
      else
        HasFPROp = true;
    }
    // We have heterogeneous regbanks, so we need to fix up the operands.
    if (HasGPROp && HasFPROp)
      fixupPHIOpBanks(*MI, MRI, RBI);
  }
}


namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
                                 AArch64Subtarget &Subtarget,
                                 AArch64RegisterBankInfo &RBI) {
  return new AArch64InstructionSelector(TM, Subtarget, RBI);
}
} // namespace llvm
