1 | //===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass performs below peephole optimizations on MIR level. |
10 | // |
11 | // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri |
12 | // MOVi64imm + ANDXrr ==> ANDXri + ANDXri |
13 | // |
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
16 | // |
17 | // 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi |
18 | // MOVi64imm + SUBXrr ==> SUBXri + SUBXri |
19 | // |
20 | // The mov pseudo instruction could be expanded to multiple mov instructions |
21 | // later. In this case, we could try to split the constant operand of mov |
22 | // instruction into two immediates which can be directly encoded into |
23 | // *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of |
24 | // multiple `mov` + `and/add/sub` instructions. |
25 | // |
26 | // 4. Remove redundant ORRWrs which is generated by zero-extend. |
27 | // |
28 | // %3:gpr32 = ORRWrs $wzr, %2, 0 |
29 | // %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32 |
30 | // |
31 | // If AArch64's 32-bit form of instruction defines the source operand of |
32 | // ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source |
33 | // operand are set to zero. |
34 | // |
35 | // 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx |
36 | // ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx |
37 | // |
38 | // 6. %intermediate:gpr32 = COPY %src:fpr128 |
39 | // %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32 |
40 | // ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0 |
41 | // |
42 | // In cases where a source FPR is copied to a GPR in order to be copied |
43 | // to a destination FPR, we can directly copy the values between the FPRs, |
44 | // eliminating the use of the Integer unit. When we match a pattern of |
45 | // INSvi[X]gpr that is preceded by a chain of COPY instructions from a FPR |
46 | // source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr |
47 | // instructions. |
48 | // |
49 | // 7. If MI sets zero for high 64-bits implicitly, remove `mov 0` for high |
50 | // 64-bits. For example, |
51 | // |
52 | // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr |
53 | // %2:fpr64 = MOVID 0 |
54 | // %4:fpr128 = IMPLICIT_DEF |
55 | // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub |
56 | // %6:fpr128 = IMPLICIT_DEF |
57 | // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub |
58 | // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0 |
59 | // ==> |
60 | // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr |
61 | // %6:fpr128 = IMPLICIT_DEF |
62 | // %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub |
63 | // |
64 | //===----------------------------------------------------------------------===// |
65 | |
66 | #include "AArch64ExpandImm.h" |
67 | #include "AArch64InstrInfo.h" |
68 | #include "MCTargetDesc/AArch64AddressingModes.h" |
69 | #include "llvm/CodeGen/MachineDominators.h" |
70 | #include "llvm/CodeGen/MachineLoopInfo.h" |
71 | |
72 | using namespace llvm; |
73 | |
74 | #define DEBUG_TYPE "aarch64-mi-peephole-opt" |
75 | |
namespace {

/// Post-ISel, pre-RA peephole pass that rewrites a handful of AArch64 MIR
/// patterns (see the file header for the full list): splitting large MOV
/// immediates feeding AND/ADD/SUB, removing redundant zero-extends, and
/// simplifying vector insert sequences. Runs on SSA-form MIR.
struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
  }

  // Cached per-function state, set up in runOnMachineFunction.
  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  /// Pair of opcodes for the first and second replacement instruction.
  using OpcodePair = std::pair<unsigned, unsigned>;
  /// Decides whether an immediate can be split and, if so, returns the
  /// opcodes to use; also fills the two output immediates.
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  /// Builds the two replacement immediate-form instructions.
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use the splitTwoPartImm two handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  // Checks whether MI's immediate operand comes from a (single-use) MOV
  // immediate that this peephole is allowed to split.
  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  // Per-pattern visitors; each returns true if it changed the function.
  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Only instructions are rewritten; the block structure is untouched.
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace
147 | |
// Register the pass under the command-line name -aarch64-mi-peephole-opt.
INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)
150 | |
151 | template <typename T> |
152 | static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { |
153 | T UImm = static_cast<T>(Imm); |
154 | if (AArch64_AM::isLogicalImmediate(imm: UImm, regSize: RegSize)) |
155 | return false; |
156 | |
157 | // If this immediate can be handled by one instruction, do not split it. |
158 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
159 | AArch64_IMM::expandMOVImm(Imm: UImm, BitSize: RegSize, Insn); |
160 | if (Insn.size() == 1) |
161 | return false; |
162 | |
163 | // The bitmask immediate consists of consecutive ones. Let's say there is |
164 | // constant 0b00000000001000000000010000000000 which does not consist of |
165 | // consecutive ones. We can split it in to two bitmask immediate like |
166 | // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111. |
167 | // If we do AND with these two bitmask immediate, we can see original one. |
168 | unsigned LowestBitSet = llvm::countr_zero(UImm); |
169 | unsigned HighestBitSet = Log2_64(UImm); |
170 | |
171 | // Create a mask which is filled with one from the position of lowest bit set |
172 | // to the position of highest bit set. |
173 | T NewImm1 = (static_cast<T>(2) << HighestBitSet) - |
174 | (static_cast<T>(1) << LowestBitSet); |
175 | // Create a mask which is filled with one outside the position of lowest bit |
176 | // set and the position of highest bit set. |
177 | T NewImm2 = UImm | ~NewImm1; |
178 | |
179 | // If the split value is not valid bitmask immediate, do not split this |
180 | // constant. |
181 | if (!AArch64_AM::isLogicalImmediate(imm: NewImm2, regSize: RegSize)) |
182 | return false; |
183 | |
184 | Imm1Enc = AArch64_AM::encodeLogicalImmediate(imm: NewImm1, regSize: RegSize); |
185 | Imm2Enc = AArch64_AM::encodeLogicalImmediate(imm: NewImm2, regSize: RegSize); |
186 | return true; |
187 | } |
188 | |
189 | template <typename T> |
190 | bool AArch64MIPeepholeOpt::visitAND( |
191 | unsigned Opc, MachineInstr &MI) { |
192 | // Try below transformation. |
193 | // |
194 | // MOVi32imm + ANDWrr ==> ANDWri + ANDWri |
195 | // MOVi64imm + ANDXrr ==> ANDXri + ANDXri |
196 | // |
197 | // The mov pseudo instruction could be expanded to multiple mov instructions |
198 | // later. Let's try to split the constant operand of mov instruction into two |
199 | // bitmask immediates. It makes only two AND instructions intead of multiple |
200 | // mov + and instructions. |
201 | |
202 | return splitTwoPartImm<T>( |
203 | MI, |
204 | [Opc](T Imm, unsigned RegSize, T &Imm0, |
205 | T &Imm1) -> std::optional<OpcodePair> { |
206 | if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) |
207 | return std::make_pair(x: Opc, y: Opc); |
208 | return std::nullopt; |
209 | }, |
210 | [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, |
211 | unsigned Imm1, Register SrcReg, Register NewTmpReg, |
212 | Register NewDstReg) { |
213 | DebugLoc DL = MI.getDebugLoc(); |
214 | MachineBasicBlock *MBB = MI.getParent(); |
215 | BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) |
216 | .addReg(SrcReg) |
217 | .addImm(Imm0); |
218 | BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) |
219 | .addReg(NewTmpReg) |
220 | .addImm(Imm1); |
221 | }); |
222 | } |
223 | |
224 | bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) { |
225 | // Check this ORR comes from below zero-extend pattern. |
226 | // |
227 | // def : Pat<(i64 (zext GPR32:$src)), |
228 | // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>; |
229 | if (MI.getOperand(i: 3).getImm() != 0) |
230 | return false; |
231 | |
232 | if (MI.getOperand(i: 1).getReg() != AArch64::WZR) |
233 | return false; |
234 | |
235 | MachineInstr *SrcMI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 2).getReg()); |
236 | if (!SrcMI) |
237 | return false; |
238 | |
239 | // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC |
240 | // |
241 | // When you use the 32-bit form of an instruction, the upper 32 bits of the |
242 | // source registers are ignored and the upper 32 bits of the destination |
243 | // register are set to zero. |
244 | // |
245 | // If AArch64's 32-bit form of instruction defines the source operand of |
246 | // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is |
247 | // real AArch64 instruction and if it is not, do not process the opcode |
248 | // conservatively. |
249 | if (SrcMI->getOpcode() == TargetOpcode::COPY && |
250 | SrcMI->getOperand(i: 1).getReg().isVirtual()) { |
251 | const TargetRegisterClass *RC = |
252 | MRI->getRegClass(Reg: SrcMI->getOperand(i: 1).getReg()); |
253 | |
254 | // A COPY from an FPR will become a FMOVSWr, so do so now so that we know |
255 | // that the upper bits are zero. |
256 | if (RC != &AArch64::FPR32RegClass && |
257 | ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) || |
258 | SrcMI->getOperand(i: 1).getSubReg() != AArch64::ssub)) |
259 | return false; |
260 | Register CpySrc = SrcMI->getOperand(i: 1).getReg(); |
261 | if (SrcMI->getOperand(i: 1).getSubReg() == AArch64::ssub) { |
262 | CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass); |
263 | BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(), |
264 | TII->get(TargetOpcode::COPY), CpySrc) |
265 | .add(SrcMI->getOperand(i: 1)); |
266 | } |
267 | BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(), |
268 | TII->get(AArch64::FMOVSWr), SrcMI->getOperand(i: 0).getReg()) |
269 | .addReg(CpySrc); |
270 | SrcMI->eraseFromParent(); |
271 | } |
272 | else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) |
273 | return false; |
274 | |
275 | Register DefReg = MI.getOperand(i: 0).getReg(); |
276 | Register SrcReg = MI.getOperand(i: 2).getReg(); |
277 | MRI->replaceRegWith(FromReg: DefReg, ToReg: SrcReg); |
278 | MRI->clearKillFlags(Reg: SrcReg); |
279 | LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n" ); |
280 | MI.eraseFromParent(); |
281 | |
282 | return true; |
283 | } |
284 | |
285 | bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) { |
286 | // Check this INSERT_SUBREG comes from below zero-extend pattern. |
287 | // |
288 | // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx |
289 | // To %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx |
290 | // |
291 | // We're assuming the first operand to INSERT_SUBREG is irrelevant because a |
292 | // COPY would destroy the upper part of the register anyway |
293 | if (!MI.isRegTiedToDefOperand(UseOpIdx: 1)) |
294 | return false; |
295 | |
296 | Register DstReg = MI.getOperand(i: 0).getReg(); |
297 | const TargetRegisterClass *RC = MRI->getRegClass(Reg: DstReg); |
298 | MachineInstr *SrcMI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 2).getReg()); |
299 | if (!SrcMI) |
300 | return false; |
301 | |
302 | // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC |
303 | // |
304 | // When you use the 32-bit form of an instruction, the upper 32 bits of the |
305 | // source registers are ignored and the upper 32 bits of the destination |
306 | // register are set to zero. |
307 | // |
308 | // If AArch64's 32-bit form of instruction defines the source operand of |
309 | // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is |
310 | // real AArch64 instruction and if it is not, do not process the opcode |
311 | // conservatively. |
312 | if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) || |
313 | !AArch64::GPR64allRegClass.hasSubClassEq(RC)) |
314 | return false; |
315 | |
316 | // Build a SUBREG_TO_REG instruction |
317 | MachineInstr *SubregMI = |
318 | BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), |
319 | TII->get(TargetOpcode::SUBREG_TO_REG), DstReg) |
320 | .addImm(0) |
321 | .add(MI.getOperand(i: 2)) |
322 | .add(MI.getOperand(i: 3)); |
323 | LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n" ); |
324 | (void)SubregMI; |
325 | MI.eraseFromParent(); |
326 | |
327 | return true; |
328 | } |
329 | |
330 | template <typename T> |
331 | static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) { |
332 | // The immediate must be in the form of ((imm0 << 12) + imm1), in which both |
333 | // imm0 and imm1 are non-zero 12-bit unsigned int. |
334 | if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 || |
335 | (Imm & ~static_cast<T>(0xffffff)) != 0) |
336 | return false; |
337 | |
338 | // The immediate can not be composed via a single instruction. |
339 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
340 | AArch64_IMM::expandMOVImm(Imm, BitSize: RegSize, Insn); |
341 | if (Insn.size() == 1) |
342 | return false; |
343 | |
344 | // Split Imm into (Imm0 << 12) + Imm1; |
345 | Imm0 = (Imm >> 12) & 0xfff; |
346 | Imm1 = Imm & 0xfff; |
347 | return true; |
348 | } |
349 | |
350 | template <typename T> |
351 | bool AArch64MIPeepholeOpt::visitADDSUB( |
352 | unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) { |
353 | // Try below transformation. |
354 | // |
355 | // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri |
356 | // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri |
357 | // |
358 | // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri |
359 | // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri |
360 | // |
361 | // The mov pseudo instruction could be expanded to multiple mov instructions |
362 | // later. Let's try to split the constant operand of mov instruction into two |
363 | // legal add/sub immediates. It makes only two ADD/SUB instructions intead of |
364 | // multiple `mov` + `and/sub` instructions. |
365 | |
366 | // We can sometimes have ADDWrr WZR, MULi32imm that have not been constant |
367 | // folded. Make sure that we don't generate invalid instructions that use XZR |
368 | // in those cases. |
369 | if (MI.getOperand(i: 1).getReg() == AArch64::XZR || |
370 | MI.getOperand(i: 1).getReg() == AArch64::WZR) |
371 | return false; |
372 | |
373 | return splitTwoPartImm<T>( |
374 | MI, |
375 | [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0, |
376 | T &Imm1) -> std::optional<OpcodePair> { |
377 | if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) |
378 | return std::make_pair(x: PosOpc, y: PosOpc); |
379 | if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) |
380 | return std::make_pair(x: NegOpc, y: NegOpc); |
381 | return std::nullopt; |
382 | }, |
383 | [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, |
384 | unsigned Imm1, Register SrcReg, Register NewTmpReg, |
385 | Register NewDstReg) { |
386 | DebugLoc DL = MI.getDebugLoc(); |
387 | MachineBasicBlock *MBB = MI.getParent(); |
388 | BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) |
389 | .addReg(SrcReg) |
390 | .addImm(Imm0) |
391 | .addImm(12); |
392 | BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) |
393 | .addReg(NewTmpReg) |
394 | .addImm(Imm1) |
395 | .addImm(0); |
396 | }); |
397 | } |
398 | |
399 | template <typename T> |
400 | bool AArch64MIPeepholeOpt::visitADDSSUBS( |
401 | OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) { |
402 | // Try the same transformation as ADDSUB but with additional requirement |
403 | // that the condition code usages are only for Equal and Not Equal |
404 | |
405 | if (MI.getOperand(i: 1).getReg() == AArch64::XZR || |
406 | MI.getOperand(i: 1).getReg() == AArch64::WZR) |
407 | return false; |
408 | |
409 | return splitTwoPartImm<T>( |
410 | MI, |
411 | [PosOpcs, NegOpcs, &MI, &TRI = TRI, |
412 | &MRI = MRI](T Imm, unsigned RegSize, T &Imm0, |
413 | T &Imm1) -> std::optional<OpcodePair> { |
414 | OpcodePair OP; |
415 | if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) |
416 | OP = PosOpcs; |
417 | else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) |
418 | OP = NegOpcs; |
419 | else |
420 | return std::nullopt; |
421 | // Check conditional uses last since it is expensive for scanning |
422 | // proceeding instructions |
423 | MachineInstr &SrcMI = *MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 1).getReg()); |
424 | std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI); |
425 | if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V) |
426 | return std::nullopt; |
427 | return OP; |
428 | }, |
429 | [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, |
430 | unsigned Imm1, Register SrcReg, Register NewTmpReg, |
431 | Register NewDstReg) { |
432 | DebugLoc DL = MI.getDebugLoc(); |
433 | MachineBasicBlock *MBB = MI.getParent(); |
434 | BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) |
435 | .addReg(SrcReg) |
436 | .addImm(Imm0) |
437 | .addImm(12); |
438 | BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) |
439 | .addReg(NewTmpReg) |
440 | .addImm(Imm1) |
441 | .addImm(0); |
442 | }); |
443 | } |
444 | |
445 | // Checks if the corresponding MOV immediate instruction is applicable for |
446 | // this peephole optimization. |
447 | bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI, |
448 | MachineInstr *&MovMI, |
449 | MachineInstr *&SubregToRegMI) { |
450 | // Check whether current MBB is in loop and the AND is loop invariant. |
451 | MachineBasicBlock *MBB = MI.getParent(); |
452 | MachineLoop *L = MLI->getLoopFor(BB: MBB); |
453 | if (L && !L->isLoopInvariant(I&: MI)) |
454 | return false; |
455 | |
456 | // Check whether current MI's operand is MOV with immediate. |
457 | MovMI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 2).getReg()); |
458 | if (!MovMI) |
459 | return false; |
460 | |
461 | // If it is SUBREG_TO_REG, check its operand. |
462 | SubregToRegMI = nullptr; |
463 | if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) { |
464 | SubregToRegMI = MovMI; |
465 | MovMI = MRI->getUniqueVRegDef(Reg: MovMI->getOperand(i: 2).getReg()); |
466 | if (!MovMI) |
467 | return false; |
468 | } |
469 | |
470 | if (MovMI->getOpcode() != AArch64::MOVi32imm && |
471 | MovMI->getOpcode() != AArch64::MOVi64imm) |
472 | return false; |
473 | |
474 | // If the MOV has multiple uses, do not split the immediate because it causes |
475 | // more instructions. |
476 | if (!MRI->hasOneUse(RegNo: MovMI->getOperand(i: 0).getReg())) |
477 | return false; |
478 | if (SubregToRegMI && !MRI->hasOneUse(RegNo: SubregToRegMI->getOperand(i: 0).getReg())) |
479 | return false; |
480 | |
481 | // It is OK to perform this peephole optimization. |
482 | return true; |
483 | } |
484 | |
485 | template <typename T> |
486 | bool AArch64MIPeepholeOpt::splitTwoPartImm( |
487 | MachineInstr &MI, |
488 | SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) { |
489 | unsigned RegSize = sizeof(T) * 8; |
490 | assert((RegSize == 32 || RegSize == 64) && |
491 | "Invalid RegSize for legal immediate peephole optimization" ); |
492 | |
493 | // Perform several essential checks against current MI. |
494 | MachineInstr *MovMI, *SubregToRegMI; |
495 | if (!checkMovImmInstr(MI, MovMI, SubregToRegMI)) |
496 | return false; |
497 | |
498 | // Split the immediate to Imm0 and Imm1, and calculate the Opcode. |
499 | T Imm = static_cast<T>(MovMI->getOperand(i: 1).getImm()), Imm0, Imm1; |
500 | // For the 32 bit form of instruction, the upper 32 bits of the destination |
501 | // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits |
502 | // of Imm to zero. This is essential if the Immediate value was a negative |
503 | // number since it was sign extended when we assign to the 64-bit Imm. |
504 | if (SubregToRegMI) |
505 | Imm &= 0xFFFFFFFF; |
506 | OpcodePair Opcode; |
507 | if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1)) |
508 | Opcode = *R; |
509 | else |
510 | return false; |
511 | |
512 | // Create new MIs using the first and second opcodes. Opcodes might differ for |
513 | // flag setting operations that should only set flags on second instruction. |
514 | // NewTmpReg = Opcode.first SrcReg Imm0 |
515 | // NewDstReg = Opcode.second NewTmpReg Imm1 |
516 | |
517 | // Determine register classes for destinations and register operands |
518 | MachineFunction *MF = MI.getMF(); |
519 | const TargetRegisterClass *FirstInstrDstRC = |
520 | TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF); |
521 | const TargetRegisterClass *FirstInstrOperandRC = |
522 | TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF); |
523 | const TargetRegisterClass *SecondInstrDstRC = |
524 | (Opcode.first == Opcode.second) |
525 | ? FirstInstrDstRC |
526 | : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF); |
527 | const TargetRegisterClass *SecondInstrOperandRC = |
528 | (Opcode.first == Opcode.second) |
529 | ? FirstInstrOperandRC |
530 | : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF); |
531 | |
532 | // Get old registers destinations and new register destinations |
533 | Register DstReg = MI.getOperand(i: 0).getReg(); |
534 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
535 | Register NewTmpReg = MRI->createVirtualRegister(RegClass: FirstInstrDstRC); |
536 | // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to |
537 | // reuse that same destination register. |
538 | Register NewDstReg = DstReg.isVirtual() |
539 | ? MRI->createVirtualRegister(RegClass: SecondInstrDstRC) |
540 | : DstReg; |
541 | |
542 | // Constrain registers based on their new uses |
543 | MRI->constrainRegClass(Reg: SrcReg, RC: FirstInstrOperandRC); |
544 | MRI->constrainRegClass(Reg: NewTmpReg, RC: SecondInstrOperandRC); |
545 | if (DstReg != NewDstReg) |
546 | MRI->constrainRegClass(Reg: NewDstReg, RC: MRI->getRegClass(Reg: DstReg)); |
547 | |
548 | // Call the delegating operation to build the instruction |
549 | BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg); |
550 | |
551 | // replaceRegWith changes MI's definition register. Keep it for SSA form until |
552 | // deleting MI. Only if we made a new destination register. |
553 | if (DstReg != NewDstReg) { |
554 | MRI->replaceRegWith(FromReg: DstReg, ToReg: NewDstReg); |
555 | MI.getOperand(i: 0).setReg(DstReg); |
556 | } |
557 | |
558 | // Record the MIs need to be removed. |
559 | MI.eraseFromParent(); |
560 | if (SubregToRegMI) |
561 | SubregToRegMI->eraseFromParent(); |
562 | MovMI->eraseFromParent(); |
563 | |
564 | return true; |
565 | } |
566 | |
567 | bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) { |
568 | // Check if this INSvi[X]gpr comes from COPY of a source FPR128 |
569 | // |
570 | // From |
571 | // %intermediate1:gpr64 = COPY %src:fpr128 |
572 | // %intermediate2:gpr32 = COPY %intermediate1:gpr64 |
573 | // %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32 |
574 | // To |
575 | // %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128, |
576 | // src_index |
577 | // where src_index = 0, X = [8|16|32|64] |
578 | |
579 | MachineInstr *SrcMI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 3).getReg()); |
580 | |
581 | // For a chain of COPY instructions, find the initial source register |
582 | // and check if it's an FPR128 |
583 | while (true) { |
584 | if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY) |
585 | return false; |
586 | |
587 | if (!SrcMI->getOperand(i: 1).getReg().isVirtual()) |
588 | return false; |
589 | |
590 | if (MRI->getRegClass(Reg: SrcMI->getOperand(i: 1).getReg()) == |
591 | &AArch64::FPR128RegClass) { |
592 | break; |
593 | } |
594 | SrcMI = MRI->getUniqueVRegDef(Reg: SrcMI->getOperand(i: 1).getReg()); |
595 | } |
596 | |
597 | Register DstReg = MI.getOperand(i: 0).getReg(); |
598 | Register SrcReg = SrcMI->getOperand(i: 1).getReg(); |
599 | MachineInstr *INSvilaneMI = |
600 | BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg) |
601 | .add(MI.getOperand(i: 1)) |
602 | .add(MI.getOperand(i: 2)) |
603 | .addUse(SrcReg, getRegState(RegOp: SrcMI->getOperand(i: 1))) |
604 | .addImm(0); |
605 | |
606 | LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n" ); |
607 | (void)INSvilaneMI; |
608 | MI.eraseFromParent(); |
609 | return true; |
610 | } |
611 | |
612 | // All instructions that set a FPR64 will implicitly zero the top bits of the |
613 | // register. |
614 | static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI, |
615 | MachineRegisterInfo *MRI) { |
616 | if (!MI->getOperand(i: 0).isReg() || !MI->getOperand(i: 0).isDef()) |
617 | return false; |
618 | const TargetRegisterClass *RC = MRI->getRegClass(Reg: MI->getOperand(i: 0).getReg()); |
619 | if (RC != &AArch64::FPR64RegClass) |
620 | return false; |
621 | return MI->getOpcode() > TargetOpcode::GENERIC_OP_END; |
622 | } |
623 | |
624 | bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) { |
625 | // Check the MI for low 64-bits sets zero for high 64-bits implicitly. |
626 | // We are expecting below case. |
627 | // |
628 | // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr |
629 | // %6:fpr128 = IMPLICIT_DEF |
630 | // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub |
631 | // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0 |
632 | MachineInstr *Low64MI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 1).getReg()); |
633 | if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG) |
634 | return false; |
635 | Low64MI = MRI->getUniqueVRegDef(Reg: Low64MI->getOperand(i: 2).getReg()); |
636 | if (!Low64MI || !is64bitDefwithZeroHigh64bit(MI: Low64MI, MRI)) |
637 | return false; |
638 | |
639 | // Check there is `mov 0` MI for high 64-bits. |
640 | // We are expecting below cases. |
641 | // |
642 | // %2:fpr64 = MOVID 0 |
643 | // %4:fpr128 = IMPLICIT_DEF |
644 | // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub |
645 | // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0 |
646 | // or |
647 | // %5:fpr128 = MOVIv2d_ns 0 |
648 | // %6:fpr64 = COPY %5.dsub:fpr128 |
649 | // %8:fpr128 = IMPLICIT_DEF |
650 | // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub |
651 | // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0 |
652 | MachineInstr *High64MI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 3).getReg()); |
653 | if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG) |
654 | return false; |
655 | High64MI = MRI->getUniqueVRegDef(Reg: High64MI->getOperand(i: 2).getReg()); |
656 | if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY) |
657 | High64MI = MRI->getUniqueVRegDef(Reg: High64MI->getOperand(i: 1).getReg()); |
658 | if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID && |
659 | High64MI->getOpcode() != AArch64::MOVIv2d_ns)) |
660 | return false; |
661 | if (High64MI->getOperand(i: 1).getImm() != 0) |
662 | return false; |
663 | |
664 | // Let's remove MIs for high 64-bits. |
665 | Register OldDef = MI.getOperand(i: 0).getReg(); |
666 | Register NewDef = MI.getOperand(i: 1).getReg(); |
667 | MRI->constrainRegClass(Reg: NewDef, RC: MRI->getRegClass(Reg: OldDef)); |
668 | MRI->replaceRegWith(FromReg: OldDef, ToReg: NewDef); |
669 | MI.eraseFromParent(); |
670 | |
671 | return true; |
672 | } |
673 | |
674 | bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) { |
675 | // An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR. |
676 | MachineInstr *Low64MI = MRI->getUniqueVRegDef(Reg: MI.getOperand(i: 1).getReg()); |
677 | if (!Low64MI || !is64bitDefwithZeroHigh64bit(MI: Low64MI, MRI)) |
678 | return false; |
679 | |
680 | // Let's remove MIs for high 64-bits. |
681 | Register OldDef = MI.getOperand(i: 0).getReg(); |
682 | Register NewDef = MI.getOperand(i: 1).getReg(); |
683 | LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n" ); |
684 | MRI->clearKillFlags(Reg: OldDef); |
685 | MRI->clearKillFlags(Reg: NewDef); |
686 | MRI->constrainRegClass(Reg: NewDef, RC: MRI->getRegClass(Reg: OldDef)); |
687 | MRI->replaceRegWith(FromReg: OldDef, ToReg: NewDef); |
688 | MI.eraseFromParent(); |
689 | |
690 | return true; |
691 | } |
692 | |
693 | bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { |
694 | if (skipFunction(F: MF.getFunction())) |
695 | return false; |
696 | |
697 | TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); |
698 | TRI = static_cast<const AArch64RegisterInfo *>( |
699 | MF.getSubtarget().getRegisterInfo()); |
700 | MLI = &getAnalysis<MachineLoopInfo>(); |
701 | MRI = &MF.getRegInfo(); |
702 | |
703 | assert(MRI->isSSA() && "Expected to be run on SSA form!" ); |
704 | |
705 | bool Changed = false; |
706 | |
707 | for (MachineBasicBlock &MBB : MF) { |
708 | for (MachineInstr &MI : make_early_inc_range(Range&: MBB)) { |
709 | switch (MI.getOpcode()) { |
710 | default: |
711 | break; |
712 | case AArch64::INSERT_SUBREG: |
713 | Changed |= visitINSERT(MI); |
714 | break; |
715 | case AArch64::ANDWrr: |
716 | Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI); |
717 | break; |
718 | case AArch64::ANDXrr: |
719 | Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI); |
720 | break; |
721 | case AArch64::ORRWrs: |
722 | Changed |= visitORR(MI); |
723 | break; |
724 | case AArch64::ADDWrr: |
725 | Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI); |
726 | break; |
727 | case AArch64::SUBWrr: |
728 | Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI); |
729 | break; |
730 | case AArch64::ADDXrr: |
731 | Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI); |
732 | break; |
733 | case AArch64::SUBXrr: |
734 | Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI); |
735 | break; |
736 | case AArch64::ADDSWrr: |
737 | Changed |= |
738 | visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri}, |
739 | {AArch64::SUBWri, AArch64::SUBSWri}, MI); |
740 | break; |
741 | case AArch64::SUBSWrr: |
742 | Changed |= |
743 | visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri}, |
744 | {AArch64::ADDWri, AArch64::ADDSWri}, MI); |
745 | break; |
746 | case AArch64::ADDSXrr: |
747 | Changed |= |
748 | visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri}, |
749 | {AArch64::SUBXri, AArch64::SUBSXri}, MI); |
750 | break; |
751 | case AArch64::SUBSXrr: |
752 | Changed |= |
753 | visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri}, |
754 | {AArch64::ADDXri, AArch64::ADDSXri}, MI); |
755 | break; |
756 | case AArch64::INSvi64gpr: |
757 | Changed |= visitINSviGPR(MI, AArch64::INSvi64lane); |
758 | break; |
759 | case AArch64::INSvi32gpr: |
760 | Changed |= visitINSviGPR(MI, AArch64::INSvi32lane); |
761 | break; |
762 | case AArch64::INSvi16gpr: |
763 | Changed |= visitINSviGPR(MI, AArch64::INSvi16lane); |
764 | break; |
765 | case AArch64::INSvi8gpr: |
766 | Changed |= visitINSviGPR(MI, AArch64::INSvi8lane); |
767 | break; |
768 | case AArch64::INSvi64lane: |
769 | Changed |= visitINSvi64lane(MI); |
770 | break; |
771 | case AArch64::FMOVDr: |
772 | Changed |= visitFMOVDr(MI); |
773 | break; |
774 | } |
775 | } |
776 | } |
777 | |
778 | return Changed; |
779 | } |
780 | |
/// Factory entry point used by the AArch64 target to schedule this pass.
FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}
784 | |