//=== lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

#define GET_GICOMBINER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {
#define GET_GICOMBINER_TYPES
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

class AMDGPUPostLegalizerCombinerImpl : public Combiner {
protected:
  const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
  const GCNSubtarget &STI;
  const SIInstrInfo &TII;
  // TODO: Make CombinerHelper methods const.
  mutable AMDGPUCombinerHelper Helper;

public:
  AMDGPUPostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
      const GCNSubtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }

  bool tryCombineAllImpl(MachineInstr &I) const;
  bool tryCombineAll(MachineInstr &I) const override;

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info) const;
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info) const;

  bool matchUCharToFloat(MachineInstr &MI) const;
  void applyUCharToFloat(MachineInstr &MI) const;

  bool
  matchRcpSqrtToRsq(MachineInstr &MI,
                    std::function<void(MachineIRBuilder &)> &MatchInfo) const;

  bool matchFDivSqrtToRsqF16(MachineInstr &MI) const;
  void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const;

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI,
                         CvtF32UByteMatchInfo &MatchInfo) const;
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo) const;
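  // A sketch of the fold implemented by this match/apply pair (illustrative
  // generic MIR; the constant shift amount must be byte aligned):
  //   %k:_(s32) = G_CONSTANT i32 8
  //   %s:_(s32) = G_LSHR %x, %k
  //   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %s
  // -->
  //   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE1 %x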

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;

  // Combine unsigned buffer load and sign extension instructions to generate
  // signed buffer load instructions.
  bool matchCombineSignExtendInReg(
      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
  void applyCombineSignExtendInReg(
      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;

  // Find the s_mul_u64 instructions where the higher bits are either
  // zero-extended or sign-extended.
  bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
  // Replace the s_mul_u64 instruction with G_AMDGPU_S_MUL_I64_I32 if the
  // higher 33 bits of both operands are sign bits, or with
  // G_AMDGPU_S_MUL_U64_U32 if the higher 32 bits of both operands are zero.
  void applyCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
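  // E.g. (a rough sketch in illustrative generic MIR; the root opcode is
  // shown here as a plain 64-bit multiply): when the high 32 bits of both
  // operands are known zero,
  //   %p:_(s64) = G_MUL %a, %b
  // can be rewritten to
  //   %p:_(s64) = G_AMDGPU_S_MUL_U64_U32 %a, %b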

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};

#define GET_GICOMBINER_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUSubtarget
#undef GET_GICOMBINER_IMPL

AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
    const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
      TII(*STI.getInstrInfo()),
      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
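    // E.g. (a rough sketch; CombinerHelper handles the general case):
    //   %r:_(s64) = G_LSHR %x:_(s64), 40
    // can become a G_UNMERGE_VALUES of %x plus a 32-bit shift of the high
    // half, with the result's high half a constant zero.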
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) const {
  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  // TODO: Handle case where the selected value is an fneg and the compared
  // constant is the negation of the selected value.
  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
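    // E.g. (illustrative): for
    //   %c:_(s1) = G_FCMP floatpred(olt), %a, %b
    //   %r:_(s32) = G_SELECT %c, %a, %b
    // this emits %r = G_AMDGPU_FMIN_LEGACY %a, %b.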
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
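  // The case handled below (illustrative sketch): an integer-to-float
  // conversion whose source has everything above the low byte known zero,
  // e.g.
  //   %x:_(s32) = ... ; bits 8..31 known zero
  //   %f:_(s32) = G_UITOFP %x
  // -->
  //   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %x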
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
    MachineInstr &MI) const {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
    MachineInstr &MI,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;

    if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
      if (GI->is(Intrinsic::amdgcn_rcp))
        return MRI.getVRegDef(MI.getOperand(2).getReg());
    }
    return nullptr;
  };

  auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;
    MachineInstr *SqrtSrcMI = nullptr;
    auto Match =
        mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
    (void)Match;
    return SqrtSrcMI;
  };

  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
  // rcp(sqrt(x))
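  // A sketch (illustrative; both instructions must carry the contract flag):
  //   %s = G_FSQRT contract %x
  //   %r = G_INTRINSIC contract intrinsic(@llvm.amdgcn.rcp), %s
  // -->
  //   %r = G_INTRINSIC contract intrinsic(@llvm.amdgcn.rsq), %x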
  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(SqrtSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  // sqrt(rcp(x))
  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(RcpSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }
  return false;
}

bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
    MachineInstr &MI) const {
  Register Sqrt = MI.getOperand(2).getReg();
  return MRI.hasOneNonDBGUse(Sqrt);
}

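// A sketch of the fold applied below (illustrative generic MIR; X is the
// sqrt operand bound by the matched pattern):
//   %s:_(s16) = G_FSQRT %x
//   %d:_(s16) = G_FDIV %y, %s
// -->
//   %q:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x
//   %d:_(s16) = G_FMUL %q, %y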
void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
    MachineInstr &MI, const Register &X) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Y = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  uint32_t Flags = MI.getFlags();
  Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
                     .addUse(X)
                     .setMIFlags(Flags)
                     .getReg(0);
  B.buildFMul(Dst, RSQ, Y, Flags);
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  bool IsShr = mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

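// Removing a G_FCANONICALIZE is a no-op when its source is already canonical;
// the destination is simply replaced by the matched source register. Roughly
// (illustrative):
//   %r:_(s32) = G_FCANONICALIZE %x  ==>  uses of %r become uses of %x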
bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) const {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

// The buffer_load_{i8, i16} intrinsics are initially lowered as
// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
// instructions are combined with sign extension instructions in order to
// generate buffer_load_{i8, i16} instructions.
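//
// E.g. (illustrative generic MIR):
//   %v:_(s32) = G_AMDGPU_BUFFER_LOAD_UBYTE ...
//   %r:_(s32) = G_SEXT_INREG %v, 8
// -->
//   %r:_(s32) = G_AMDGPU_BUFFER_LOAD_SBYTE ...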

// Identify buffer_load_{u8, u16}.
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
  Register LoadReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(LoadReg))
    return false;

  // Check if the first operand of the sign extension is a subword buffer load
  // instruction.
  MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
  int64_t Width = MI.getOperand(2).getImm();
  switch (LoadMI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
    return Width == 8;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
    return Width == 16;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
    return Width == 8;
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
    return Width == 16;
  }
  return false;
}

// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
// buffer_load_{i8, i16}.
void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
  auto [LoadMI, NewOpcode] = MatchData;
  LoadMI->setDesc(TII.get(NewOpcode));
  // Update the destination register of the load with the destination register
  // of the sign extension.
  Register SignExtendInsnDst = MI.getOperand(0).getReg();
  LoadMI->getOperand(0).setReg(SignExtendInsnDst);
  // Remove the sign extension.
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
    MachineInstr &MI, unsigned &NewOpcode) const {
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI.getType(Src0) != LLT::scalar(64))
    return false;

  if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
      KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
    return true;
  }

  if (KB->computeNumSignBits(Src1) >= 33 &&
      KB->computeNumSignBits(Src0) >= 33) {
    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
    return true;
  }
  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyCombine_s_mul_u64(
    MachineInstr &MI, unsigned &NewOpcode) const {
  Helper.replaceOpcodeWith(MI, NewOpcode);
}

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();

  CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     LI, EnableOpt, F.hasOptSize(), F.hasMinSize());

  AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm