1 | //===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass performs below peephole optimizations on MIR level. |
10 | // |
11 | // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri |
12 | // MOVi64imm + ANDXrr ==> ANDXri + ANDXri |
13 | // |
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
16 | // |
17 | // 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi |
18 | // MOVi64imm + SUBXrr ==> SUBXri + SUBXri |
19 | // |
20 | // The mov pseudo instruction could be expanded to multiple mov instructions |
21 | // later. In this case, we could try to split the constant operand of mov |
22 | // instruction into two immediates which can be directly encoded into |
23 | // *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of |
24 | // multiple `mov` + `and/add/sub` instructions. |
25 | // |
26 | // 4. Remove redundant ORRWrs which is generated by zero-extend. |
27 | // |
28 | // %3:gpr32 = ORRWrs $wzr, %2, 0 |
29 | // %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32 |
30 | // |
31 | // If AArch64's 32-bit form of instruction defines the source operand of |
32 | // ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source |
33 | // operand are set to zero. |
34 | // |
35 | // 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx |
36 | // ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx |
37 | // |
38 | // 6. %intermediate:gpr32 = COPY %src:fpr128 |
39 | // %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32 |
40 | // ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0 |
41 | // |
42 | // In cases where a source FPR is copied to a GPR in order to be copied |
43 | // to a destination FPR, we can directly copy the values between the FPRs, |
44 | // eliminating the use of the Integer unit. When we match a pattern of |
45 | // INSvi[X]gpr that is preceded by a chain of COPY instructions from a FPR |
46 | // source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr |
47 | // instructions. |
48 | // |
49 | // 7. If MI sets zero for high 64-bits implicitly, remove `mov 0` for high |
50 | // 64-bits. For example, |
51 | // |
52 | // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr |
53 | // %2:fpr64 = MOVID 0 |
54 | // %4:fpr128 = IMPLICIT_DEF |
55 | // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub |
56 | // %6:fpr128 = IMPLICIT_DEF |
57 | // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub |
58 | // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0 |
59 | // ==> |
60 | // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr |
61 | // %6:fpr128 = IMPLICIT_DEF |
62 | // %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub |
63 | // |
64 | //===----------------------------------------------------------------------===// |
65 | |
66 | #include "AArch64ExpandImm.h" |
67 | #include "AArch64InstrInfo.h" |
68 | #include "MCTargetDesc/AArch64AddressingModes.h" |
69 | #include "llvm/CodeGen/MachineDominators.h" |
70 | #include "llvm/CodeGen/MachineLoopInfo.h" |
71 | |
72 | using namespace llvm; |
73 | |
74 | #define DEBUG_TYPE "aarch64-mi-peephole-opt" |
75 | |
namespace {

/// Post-ISel, pre-RA peephole pass that rewrites a handful of AArch64 MIR
/// patterns (see the file header for the full list): splitting large MOV
/// immediates feeding AND/ADD/SUB, removing redundant zero-extends, and
/// simplifying vector insert sequences. Runs on SSA-form MIR.
struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
  }

  // Cached per-function state, set up in runOnMachineFunction.
  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  /// Pair of opcodes for the first and second replacement instruction.
  using OpcodePair = std::pair<unsigned, unsigned>;
  /// Decides whether an immediate can be split and, if so, returns the
  /// opcodes to use; also fills the two output immediates.
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  /// Builds the two replacement immediate-form instructions.
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use the splitTwoPartImm two handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  // Checks whether MI's immediate operand comes from a (single-use) MOV
  // immediate that this peephole is allowed to split.
  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  // Per-pattern visitors; each returns true if it changed the function.
  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Only instructions are rewritten; the block structure is untouched.
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace
147 | |
// Register the pass under the command-line name -aarch64-mi-peephole-opt.
INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)
150 | |
151 | template <typename T> |
152 | static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { |
153 | T UImm = static_cast<T>(Imm); |
154 | if (AArch64_AM::isLogicalImmediate(imm: UImm, regSize: RegSize)) |
155 | return false; |
156 | |
157 | // If this immediate can be handled by one instruction, do not split it. |
158 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
159 | AArch64_IMM::expandMOVImm(Imm: UImm, BitSize: RegSize, Insn); |
160 | if (Insn.size() == 1) |
161 | return false; |
162 | |
163 | // The bitmask immediate consists of consecutive ones. Let's say there is |
164 | // constant 0b00000000001000000000010000000000 which does not consist of |
165 | // consecutive ones. We can split it in to two bitmask immediate like |
166 | // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111. |
167 | // If we do AND with these two bitmask immediate, we can see original one. |
168 | unsigned LowestBitSet = llvm::countr_zero(UImm); |
169 | unsigned HighestBitSet = Log2_64(UImm); |
170 | |
171 | // Create a mask which is filled with one from the position of lowest bit set |
172 | // to the position of highest bit set. |
173 | T NewImm1 = (static_cast<T>(2) << HighestBitSet) - |
174 | (static_cast<T>(1) << LowestBitSet); |
175 | // Create a mask which is filled with one outside the position of lowest bit |
176 | // set and the position of highest bit set. |
177 | T NewImm2 = UImm | ~NewImm1; |
178 | |
179 | // If the split value is not valid bitmask immediate, do not split this |
180 | // constant. |
181 | if (!AArch64_AM::isLogicalImmediate(imm: NewImm2, regSize: RegSize)) |
182 | return false; |
183 | |
184 | Imm1Enc = AArch64_AM::encodeLogicalImmediate(imm: NewImm1, regSize: RegSize); |
185 | Imm2Enc = AArch64_AM::encodeLogicalImmediate(imm: NewImm2, regSize: RegSize); |
186 | return true; |
187 | } |
188 | |
189 | template <typename T> |
190 | bool AArch64MIPeepholeOpt::visitAND( |
191 | unsigned Opc, MachineInstr &MI) { |
192 | // Try below transformation. |
193 | // |
194 | // MOVi32imm + ANDWrr ==> ANDWri + ANDWri |
195 | // MOVi64imm + ANDXrr ==> ANDXri + ANDXri |
196 | // |
197 | // The mov pseudo instruction could be expanded to multiple mov instructions |
198 | // later. Let's try to split the constant operand of mov instruction into two |
199 | // bitmask immediates. It makes only two AND instructions intead of multiple |
200 | // mov + and instructions. |
201 | |
202 | return splitTwoPartImm<T>( |
203 | MI, |
204 | [Opc](T Imm, unsigned RegSize, T &Imm0, |
205 | T &Imm1) -> std::optional<OpcodePair> { |
206 | if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) |
207 | return std::make_pair(x: Opc, y: Opc); |
208 | return std::nullopt; |
209 | }, |
210 | [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, |
211 | unsigned Imm1, Register SrcReg, Register NewTmpReg, |
212 | Register NewDstReg) { |
213 | DebugLoc DL = MI.getDebugLoc(); |
214 | MachineBasicBlock *MBB = MI.getParent(); |
215 | BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) |
216 | .addReg(SrcReg) |
217 | .addImm(Imm0); |
218 | BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) |
219 | .addReg(NewTmpReg) |
220 | .addImm(Imm1); |
221 | }); |
222 | } |
223 | |
224 | bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) { |
225 | // Check this ORR comes from below zero-extend pattern. |
226 | // |
227 | // def : Pat<(i64 (zext GPR32:$src)), |
228 | // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>; |
229 | if (MI.getOperand(i: 3).getImm() != 0) |
230 | return false; |
231 | |
232 | if (MI.getOperand(i: 1).getReg() != AArch64::WZR) |
233 | return false; |
234 | |
235 | MachineInstr *SrcMI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 2).getReg()); |
236 | if (!SrcMI) |
237 | return false; |
238 | |
239 | // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC |
240 | // |
241 | // When you use the 32-bit form of an instruction, the upper 32 bits of the |
242 | // source registers are ignored and the upper 32 bits of the destination |
243 | // register are set to zero. |
244 | // |
245 | // If AArch64's 32-bit form of instruction defines the source operand of |
246 | // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is |
247 | // real AArch64 instruction and if it is not, do not process the opcode |
248 | // conservatively. |
249 | if (SrcMI->getOpcode() == TargetOpcode::COPY && |
250 | SrcMI->getOperand(i: 1).getReg().isVirtual()) { |
251 | const TargetRegisterClass *RC = |
252 | MRI->getRegClass(Reg: SrcMI->getOperand(i: 1).getReg()); |
253 | |
254 | // A COPY from an FPR will become a FMOVSWr, so do so now so that we know |
255 | // that the upper bits are zero. |
256 | if (RC != &AArch64::FPR32RegClass && |
257 | ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) || |
258 | SrcMI->getOperand(i: 1).getSubReg() != AArch64::ssub)) |
259 | return false; |
260 | Register CpySrc = SrcMI->getOperand(i: 1).getReg(); |
261 | if (SrcMI->getOperand(i: 1).getSubReg() == AArch64::ssub) { |
262 | CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass); |
263 | BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(), |
264 | TII->get(TargetOpcode::COPY), CpySrc) |
265 | .add(SrcMI->getOperand(i: 1)); |
266 | } |
267 | BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(), |
268 | TII->get(AArch64::FMOVSWr), SrcMI->getOperand(i: 0).getReg()) |
269 | .addReg(CpySrc); |
270 | SrcMI->eraseFromParent(); |
271 | } |
272 | else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) |
273 | return false; |
274 | |
275 | Register DefReg = MI.getOperand(i: 0).getReg(); |
276 | Register SrcReg = MI.getOperand(i: 2).getReg(); |
277 | MRI->replaceRegWith(FromReg: DefReg, ToReg: SrcReg); |
278 | MRI->clearKillFlags(Reg: SrcReg); |
279 | LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n" ); |
280 | MI.eraseFromParent(); |
281 | |
282 | return true; |
283 | } |
284 | |
285 | bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) { |
286 | // Check this INSERT_SUBREG comes from below zero-extend pattern. |
287 | // |
288 | // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx |
289 | // To %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx |
290 | // |
291 | // We're assuming the first operand to INSERT_SUBREG is irrelevant because a |
292 | // COPY would destroy the upper part of the register anyway |
293 | if (!MI.isRegTiedToDefOperand(UseOpIdx: 1)) |
294 | return false; |
295 | |
296 | Register DstReg = MI.getOperand(i: 0).getReg(); |
297 | const TargetRegisterClass *RC = MRI->getRegClass(Reg: DstReg); |
298 | MachineInstr *SrcMI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 2).getReg()); |
299 | if (!SrcMI) |
300 | return false; |
301 | |
302 | // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC |
303 | // |
304 | // When you use the 32-bit form of an instruction, the upper 32 bits of the |
305 | // source registers are ignored and the upper 32 bits of the destination |
306 | // register are set to zero. |
307 | // |
308 | // If AArch64's 32-bit form of instruction defines the source operand of |
309 | // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is |
310 | // real AArch64 instruction and if it is not, do not process the opcode |
311 | // conservatively. |
312 | if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) || |
313 | !AArch64::GPR64allRegClass.hasSubClassEq(RC)) |
314 | return false; |
315 | |
316 | // Build a SUBREG_TO_REG instruction |
317 | MachineInstr *SubregMI = |
318 | BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), |
319 | TII->get(TargetOpcode::SUBREG_TO_REG), DstReg) |
320 | .addImm(0) |
321 | .add(MI.getOperand(i: 2)) |
322 | .add(MI.getOperand(i: 3)); |
323 | LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n" ); |
324 | (void)SubregMI; |
325 | MI.eraseFromParent(); |
326 | |
327 | return true; |
328 | } |
329 | |
330 | template <typename T> |
331 | static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) { |
332 | // The immediate must be in the form of ((imm0 << 12) + imm1), in which both |
333 | // imm0 and imm1 are non-zero 12-bit unsigned int. |
334 | if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 || |
335 | (Imm & ~static_cast<T>(0xffffff)) != 0) |
336 | return false; |
337 | |
338 | // The immediate can not be composed via a single instruction. |
339 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
340 | AArch64_IMM::expandMOVImm(Imm, BitSize: RegSize, Insn); |
341 | if (Insn.size() == 1) |
342 | return false; |
343 | |
344 | // Split Imm into (Imm0 << 12) + Imm1; |
345 | Imm0 = (Imm >> 12) & 0xfff; |
346 | Imm1 = Imm & 0xfff; |
347 | return true; |
348 | } |
349 | |
350 | template <typename T> |
351 | bool AArch64MIPeepholeOpt::visitADDSUB( |
352 | unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) { |
353 | // Try below transformation. |
354 | // |
355 | // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri |
356 | // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri |
357 | // |
358 | // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri |
359 | // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri |
360 | // |
361 | // The mov pseudo instruction could be expanded to multiple mov instructions |
362 | // later. Let's try to split the constant operand of mov instruction into two |
363 | // legal add/sub immediates. It makes only two ADD/SUB instructions intead of |
364 | // multiple `mov` + `and/sub` instructions. |
365 | |
366 | // We can sometimes have ADDWrr WZR, MULi32imm that have not been constant |
367 | // folded. Make sure that we don't generate invalid instructions that use XZR |
368 | // in those cases. |
369 | if (MI.getOperand(i: 1).getReg() == AArch64::XZR || |
370 | MI.getOperand(i: 1).getReg() == AArch64::WZR) |
371 | return false; |
372 | |
373 | return splitTwoPartImm<T>( |
374 | MI, |
375 | [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0, |
376 | T &Imm1) -> std::optional<OpcodePair> { |
377 | if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) |
378 | return std::make_pair(x: PosOpc, y: PosOpc); |
379 | if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) |
380 | return std::make_pair(x: NegOpc, y: NegOpc); |
381 | return std::nullopt; |
382 | }, |
383 | [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, |
384 | unsigned Imm1, Register SrcReg, Register NewTmpReg, |
385 | Register NewDstReg) { |
386 | DebugLoc DL = MI.getDebugLoc(); |
387 | MachineBasicBlock *MBB = MI.getParent(); |
388 | BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) |
389 | .addReg(SrcReg) |
390 | .addImm(Imm0) |
391 | .addImm(12); |
392 | BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) |
393 | .addReg(NewTmpReg) |
394 | .addImm(Imm1) |
395 | .addImm(0); |
396 | }); |
397 | } |
398 | |
399 | template <typename T> |
400 | bool AArch64MIPeepholeOpt::visitADDSSUBS( |
401 | OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) { |
402 | // Try the same transformation as ADDSUB but with additional requirement |
403 | // that the condition code usages are only for Equal and Not Equal |
404 | |
405 | if (MI.getOperand(i: 1).getReg() == AArch64::XZR || |
406 | MI.getOperand(i: 1).getReg() == AArch64::WZR) |
407 | return false; |
408 | |
409 | return splitTwoPartImm<T>( |
410 | MI, |
411 | [PosOpcs, NegOpcs, &MI, &TRI = TRI, |
412 | &MRI = MRI](T Imm, unsigned RegSize, T &Imm0, |
413 | T &Imm1) -> std::optional<OpcodePair> { |
414 | OpcodePair OP; |
415 | if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) |
416 | OP = PosOpcs; |
417 | else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) |
418 | OP = NegOpcs; |
419 | else |
420 | return std::nullopt; |
421 | // Check conditional uses last since it is expensive for scanning |
422 | // proceeding instructions |
423 | MachineInstr &SrcMI = *MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 1).getReg()); |
424 | std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI); |
425 | if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V) |
426 | return std::nullopt; |
427 | return OP; |
428 | }, |
429 | [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, |
430 | unsigned Imm1, Register SrcReg, Register NewTmpReg, |
431 | Register NewDstReg) { |
432 | DebugLoc DL = MI.getDebugLoc(); |
433 | MachineBasicBlock *MBB = MI.getParent(); |
434 | BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) |
435 | .addReg(SrcReg) |
436 | .addImm(Imm0) |
437 | .addImm(12); |
438 | BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) |
439 | .addReg(NewTmpReg) |
440 | .addImm(Imm1) |
441 | .addImm(0); |
442 | }); |
443 | } |
444 | |
445 | // Checks if the corresponding MOV immediate instruction is applicable for |
446 | // this peephole optimization. |
447 | bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI, |
448 | MachineInstr *&MovMI, |
449 | MachineInstr *&SubregToRegMI) { |
450 | // Check whether current MBB is in loop and the AND is loop invariant. |
451 | MachineBasicBlock *MBB = MI.getParent(); |
452 | MachineLoop *L = MLI->getLoopFor(BB: MBB); |
453 | if (L && !L->isLoopInvariant(I&: MI)) |
454 | return false; |
455 | |
456 | // Check whether current MI's operand is MOV with immediate. |
457 | MovMI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 2).getReg()); |
458 | if (!MovMI) |
459 | return false; |
460 | |
461 | // If it is SUBREG_TO_REG, check its operand. |
462 | SubregToRegMI = nullptr; |
463 | if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) { |
464 | SubregToRegMI = MovMI; |
465 | MovMI = MRI->getUniqueVRegDef(Reg: MovMI->getOperand(i: 2).getReg()); |
466 | if (!MovMI) |
467 | return false; |
468 | } |
469 | |
470 | if (MovMI->getOpcode() != AArch64::MOVi32imm && |
471 | MovMI->getOpcode() != AArch64::MOVi64imm) |
472 | return false; |
473 | |
474 | // If the MOV has multiple uses, do not split the immediate because it causes |
475 | // more instructions. |
476 | if (!MRI->hasOneUse(RegNo: MovMI->getOperand(i: 0).getReg())) |
477 | return false; |
478 | if (SubregToRegMI && !MRI->hasOneUse(RegNo: SubregToRegMI->getOperand(i: 0).getReg())) |
479 | return false; |
480 | |
481 | // It is OK to perform this peephole optimization. |
482 | return true; |
483 | } |
484 | |
485 | template <typename T> |
486 | bool AArch64MIPeepholeOpt::splitTwoPartImm( |
487 | MachineInstr &MI, |
488 | SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) { |
489 | unsigned RegSize = sizeof(T) * 8; |
490 | assert((RegSize == 32 || RegSize == 64) && |
491 | "Invalid RegSize for legal immediate peephole optimization" ); |
492 | |
493 | // Perform several essential checks against current MI. |
494 | MachineInstr *MovMI, *SubregToRegMI; |
495 | if (!checkMovImmInstr(MI, MovMI, SubregToRegMI)) |
496 | return false; |
497 | |
498 | // Split the immediate to Imm0 and Imm1, and calculate the Opcode. |
499 | T Imm = static_cast<T>(MovMI->getOperand(i: 1).getImm()), Imm0, Imm1; |
500 | // For the 32 bit form of instruction, the upper 32 bits of the destination |
501 | // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits |
502 | // of Imm to zero. This is essential if the Immediate value was a negative |
503 | // number since it was sign extended when we assign to the 64-bit Imm. |
504 | if (SubregToRegMI) |
505 | Imm &= 0xFFFFFFFF; |
506 | OpcodePair Opcode; |
507 | if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1)) |
508 | Opcode = *R; |
509 | else |
510 | return false; |
511 | |
512 | // Create new MIs using the first and second opcodes. Opcodes might differ for |
513 | // flag setting operations that should only set flags on second instruction. |
514 | // NewTmpReg = Opcode.first SrcReg Imm0 |
515 | // NewDstReg = Opcode.second NewTmpReg Imm1 |
516 | |
517 | // Determine register classes for destinations and register operands |
518 | MachineFunction *MF = MI.getMF(); |
519 | const TargetRegisterClass *FirstInstrDstRC = |
520 | TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF); |
521 | const TargetRegisterClass *FirstInstrOperandRC = |
522 | TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF); |
523 | const TargetRegisterClass *SecondInstrDstRC = |
524 | (Opcode.first == Opcode.second) |
525 | ? FirstInstrDstRC |
526 | : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF); |
527 | const TargetRegisterClass *SecondInstrOperandRC = |
528 | (Opcode.first == Opcode.second) |
529 | ? FirstInstrOperandRC |
530 | : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF); |
531 | |
532 | // Get old registers destinations and new register destinations |
533 | Register DstReg = MI.getOperand(i: 0).getReg(); |
534 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
535 | Register NewTmpReg = MRI->createVirtualRegister(RegClass: FirstInstrDstRC); |
536 | // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to |
537 | // reuse that same destination register. |
538 | Register NewDstReg = DstReg.isVirtual() |
539 | ? MRI->createVirtualRegister(RegClass: SecondInstrDstRC) |
540 | : DstReg; |
541 | |
542 | // Constrain registers based on their new uses |
543 | MRI->constrainRegClass(Reg: SrcReg, RC: FirstInstrOperandRC); |
544 | MRI->constrainRegClass(Reg: NewTmpReg, RC: SecondInstrOperandRC); |
545 | if (DstReg != NewDstReg) |
546 | MRI->constrainRegClass(Reg: NewDstReg, RC: MRI->getRegClass(Reg: DstReg)); |
547 | |
548 | // Call the delegating operation to build the instruction |
549 | BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg); |
550 | |
551 | // replaceRegWith changes MI's definition register. Keep it for SSA form until |
552 | // deleting MI. Only if we made a new destination register. |
553 | if (DstReg != NewDstReg) { |
554 | MRI->replaceRegWith(FromReg: DstReg, ToReg: NewDstReg); |
555 | MI.getOperand(i: 0).setReg(DstReg); |
556 | } |
557 | |
558 | // Record the MIs need to be removed. |
559 | MI.eraseFromParent(); |
560 | if (SubregToRegMI) |
561 | SubregToRegMI->eraseFromParent(); |
562 | MovMI->eraseFromParent(); |
563 | |
564 | return true; |
565 | } |
566 | |
567 | bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) { |
568 | // Check if this INSvi[X]gpr comes from COPY of a source FPR128 |
569 | // |
570 | // From |
571 | // %intermediate1:gpr64 = COPY %src:fpr128 |
572 | // %intermediate2:gpr32 = COPY %intermediate1:gpr64 |
573 | // %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32 |
574 | // To |
575 | // %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128, |
576 | // src_index |
577 | // where src_index = 0, X = [8|16|32|64] |
578 | |
579 | MachineInstr *SrcMI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 3).getReg()); |
580 | |
581 | // For a chain of COPY instructions, find the initial source register |
582 | // and check if it's an FPR128 |
583 | while (true) { |
584 | if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY) |
585 | return false; |
586 | |
587 | if (!SrcMI->getOperand(i: 1).getReg().isVirtual()) |
588 | return false; |
589 | |
590 | if (MRI->getRegClass(Reg: SrcMI->getOperand(i: 1).getReg()) == |
591 | &AArch64::FPR128RegClass) { |
592 | break; |
593 | } |
594 | SrcMI = MRI->getUniqueVRegDef(Reg: SrcMI->getOperand(i: 1).getReg()); |
595 | } |
596 | |
597 | Register DstReg = MI.getOperand(i: 0).getReg(); |
598 | Register SrcReg = SrcMI->getOperand(i: 1).getReg(); |
599 | MachineInstr *INSvilaneMI = |
600 | BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg) |
601 | .add(MI.getOperand(i: 1)) |
602 | .add(MI.getOperand(i: 2)) |
603 | .addUse(SrcReg, getRegState(RegOp: SrcMI->getOperand(i: 1))) |
604 | .addImm(0); |
605 | |
606 | LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n" ); |
607 | (void)INSvilaneMI; |
608 | MI.eraseFromParent(); |
609 | return true; |
610 | } |
611 | |
612 | // All instructions that set a FPR64 will implicitly zero the top bits of the |
613 | // register. |
614 | static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI, |
615 | MachineRegisterInfo *MRI) { |
616 | if (!MI->getOperand(i: 0).isReg() || !MI->getOperand(i: 0).isDef()) |
617 | return false; |
618 | const TargetRegisterClass *RC = MRI->getRegClass(Reg: MI->getOperand(i: 0).getReg()); |
619 | if (RC != &AArch64::FPR64RegClass) |
620 | return false; |
621 | return MI->getOpcode() > TargetOpcode::GENERIC_OP_END; |
622 | } |
623 | |
624 | bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) { |
625 | // Check the MI for low 64-bits sets zero for high 64-bits implicitly. |
626 | // We are expecting below case. |
627 | // |
628 | // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr |
629 | // %6:fpr128 = IMPLICIT_DEF |
630 | // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub |
631 | // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0 |
632 | MachineInstr *Low64MI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 1).getReg()); |
633 | if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG) |
634 | return false; |
635 | Low64MI = MRI->getUniqueVRegDef(Reg: Low64MI->getOperand(i: 2).getReg()); |
636 | if (!Low64MI || !is64bitDefwithZeroHigh64bit(MI: Low64MI, MRI)) |
637 | return false; |
638 | |
639 | // Check there is `mov 0` MI for high 64-bits. |
640 | // We are expecting below cases. |
641 | // |
642 | // %2:fpr64 = MOVID 0 |
643 | // %4:fpr128 = IMPLICIT_DEF |
644 | // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub |
645 | // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0 |
646 | // or |
647 | // %5:fpr128 = MOVIv2d_ns 0 |
648 | // %6:fpr64 = COPY %5.dsub:fpr128 |
649 | // %8:fpr128 = IMPLICIT_DEF |
650 | // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub |
651 | // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0 |
652 | MachineInstr *High64MI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 3).getReg()); |
653 | if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG) |
654 | return false; |
655 | High64MI = MRI->getUniqueVRegDef(Reg: High64MI->getOperand(i: 2).getReg()); |
656 | if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY) |
657 | High64MI = MRI->getUniqueVRegDef(Reg: High64MI->getOperand(i: 1).getReg()); |
658 | if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID && |
659 | High64MI->getOpcode() != AArch64::MOVIv2d_ns)) |
660 | return false; |
661 | if (High64MI->getOperand(i: 1).getImm() != 0) |
662 | return false; |
663 | |
664 | // Let's remove MIs for high 64-bits. |
665 | Register OldDef = MI.getOperand(i: 0).getReg(); |
666 | Register NewDef = MI.getOperand(i: 1).getReg(); |
667 | MRI->constrainRegClass(Reg: NewDef, RC: MRI->getRegClass(Reg: OldDef)); |
668 | MRI->replaceRegWith(FromReg: OldDef, ToReg: NewDef); |
669 | MI.eraseFromParent(); |
670 | |
671 | return true; |
672 | } |
673 | |
674 | bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) { |
675 | // An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR. |
676 | MachineInstr *Low64MI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 1).getReg()); |
677 | if (!Low64MI || !is64bitDefwithZeroHigh64bit(MI: Low64MI, MRI)) |
678 | return false; |
679 | |
680 | // Let's remove MIs for high 64-bits. |
681 | Register OldDef = MI.getOperand(i: 0).getReg(); |
682 | Register NewDef = MI.getOperand(i: 1).getReg(); |
683 | LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n" ); |
684 | MRI->clearKillFlags(Reg: OldDef); |
685 | MRI->clearKillFlags(Reg: NewDef); |
686 | MRI->constrainRegClass(Reg: NewDef, RC: MRI->getRegClass(Reg: OldDef)); |
687 | MRI->replaceRegWith(FromReg: OldDef, ToReg: NewDef); |
688 | MI.eraseFromParent(); |
689 | |
690 | return true; |
691 | } |
692 | |
693 | bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { |
694 | if (skipFunction(F: MF.getFunction())) |
695 | return false; |
696 | |
697 | TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); |
698 | TRI = static_cast<const AArch64RegisterInfo *>( |
699 | MF.getSubtarget().getRegisterInfo()); |
700 | MLI = &getAnalysis<MachineLoopInfo>(); |
701 | MRI = &MF.getRegInfo(); |
702 | |
703 | assert(MRI->isSSA() && "Expected to be run on SSA form!" ); |
704 | |
705 | bool Changed = false; |
706 | |
707 | for (MachineBasicBlock &MBB : MF) { |
708 | for (MachineInstr &MI : make_early_inc_range(Range&: MBB)) { |
709 | switch (MI.getOpcode()) { |
710 | default: |
711 | break; |
712 | case AArch64::INSERT_SUBREG: |
713 | Changed |= visitINSERT(MI); |
714 | break; |
715 | case AArch64::ANDWrr: |
716 | Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI); |
717 | break; |
718 | case AArch64::ANDXrr: |
719 | Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI); |
720 | break; |
721 | case AArch64::ORRWrs: |
722 | Changed |= visitORR(MI); |
723 | break; |
724 | case AArch64::ADDWrr: |
725 | Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI); |
726 | break; |
727 | case AArch64::SUBWrr: |
728 | Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI); |
729 | break; |
730 | case AArch64::ADDXrr: |
731 | Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI); |
732 | break; |
733 | case AArch64::SUBXrr: |
734 | Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI); |
735 | break; |
736 | case AArch64::ADDSWrr: |
737 | Changed |= |
738 | visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri}, |
739 | {AArch64::SUBWri, AArch64::SUBSWri}, MI); |
740 | break; |
741 | case AArch64::SUBSWrr: |
742 | Changed |= |
743 | visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri}, |
744 | {AArch64::ADDWri, AArch64::ADDSWri}, MI); |
745 | break; |
746 | case AArch64::ADDSXrr: |
747 | Changed |= |
748 | visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri}, |
749 | {AArch64::SUBXri, AArch64::SUBSXri}, MI); |
750 | break; |
751 | case AArch64::SUBSXrr: |
752 | Changed |= |
753 | visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri}, |
754 | {AArch64::ADDXri, AArch64::ADDSXri}, MI); |
755 | break; |
756 | case AArch64::INSvi64gpr: |
757 | Changed |= visitINSviGPR(MI, AArch64::INSvi64lane); |
758 | break; |
759 | case AArch64::INSvi32gpr: |
760 | Changed |= visitINSviGPR(MI, AArch64::INSvi32lane); |
761 | break; |
762 | case AArch64::INSvi16gpr: |
763 | Changed |= visitINSviGPR(MI, AArch64::INSvi16lane); |
764 | break; |
765 | case AArch64::INSvi8gpr: |
766 | Changed |= visitINSviGPR(MI, AArch64::INSvi8lane); |
767 | break; |
768 | case AArch64::INSvi64lane: |
769 | Changed |= visitINSvi64lane(MI); |
770 | break; |
771 | case AArch64::FMOVDr: |
772 | Changed |= visitFMOVDr(MI); |
773 | break; |
774 | } |
775 | } |
776 | } |
777 | |
778 | return Changed; |
779 | } |
780 | |
/// Factory entry point used by the AArch64 target to schedule this pass.
FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}
784 | |