//=== llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

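// Returns true if a G_FNEG of MI's result can instead be folded into MI
// itself, either as a source modifier on its operands or by inverting the
// operation (e.g. min <-> max).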
LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
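/// For example, a 3-operand G_FMA or a 64-bit G_FADD can only select to a
/// VOP3 instruction (V_FMA_F32, V_ADD_F64), whose 64-bit encoding already
/// carries the neg/abs source-modifier bits, so using them is free.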
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
  return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    unsigned IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding anyway,
  // and for those a source modifier is truly free. But if there are multiple
  // users that would otherwise fit a smaller encoding, forcing each of them
  // into VOP3 increases code size. Try to avoid increasing code size unless we
  // know it will save on the instruction count.
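  // For example, each G_FADD user that could otherwise select the 32-bit VOP2
  // form of V_ADD_F32 would have to switch to the 64-bit VOP3 form to carry a
  // neg modifier, doubling the encoded size of that use.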
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

static bool isInv2Pi(const APFloat &APF) {
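  // Bit patterns of 1.0 / (2.0 * pi) in half, single and double precision.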
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// The negated forms of 0 and 1.0 / (2.0 * pi) do not have inline immediates,
// so there is an additional cost to negate these constants.
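// For example, a G_FMAXNUM with a +0.0 operand can encode the zero as an
// inline immediate, but after the fold that operand becomes -0.0 and has to be
// materialized as a literal constant.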
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}

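// Return the opposite min/max opcode; when an fneg is pushed through, e.g.
// -max(a, b) becomes min(-a, -b).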
static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_FMAXIMUM:
    return AMDGPU::G_FMINIMUM;
  case AMDGPU::G_FMINIMUM:
    return AMDGPU::G_FMAXIMUM;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

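// Match %dst = G_FNEG %src where the instruction defining %src can absorb the
// negation (see applyFoldableFneg) and doing so is expected to be profitable.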
bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down into
  // the fneg's users, or the input's other uses cannot take a source modifier
  // anyway, give up. This both prevents unprofitable transformations and
  // infinite loops: we won't repeatedly try to fold around a negate that has
  // no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 and 1/(2*pi) do not have negated inline immediates.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    unsigned IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B
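  //
  // For example (multi-use case, sketched in MIR-like notation):
  //   %A = G_FMINNUM %x, %y
  //   %B = G_FNEG %A
  // becomes:
  //   %B = G_FMAXNUM (fneg %x), (fneg %y)
  //   %A = G_FNEG %B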

  // Replace register in operand with a register holding negated value.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Replace one of the two registers with a register holding the negated
  // value.
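  // Since -(X * Y) == (-X) * Y == X * (-Y), negating a single operand is
  // enough: prefer stripping an existing fneg from either side (free),
  // otherwise insert a new fneg on Y.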
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate the appropriate operands so that the resulting value of MatchInfo
  // is negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    unsigned IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }

  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now produces the negated value, so use it instead of the old
    // Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice
    // versa, but replaceRegWith would replace defs as well. It is easier to
    // replace one def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now produces the negated value, so use it instead of the old
    // Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for the other uses of the old
    // MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}

// TODO: Should return converted value / extension source and avoid introducing
// intermediate fptruncs in the apply function.
static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
                                  Register Reg) {
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
    Register SrcReg = Def->getOperand(1).getReg();
    return MRI.getType(SrcReg) == LLT::scalar(16);
  }

  if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
    APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }

  return false;
}

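// Match a G_FPTRUNC back to f16 of an f32 fmed3 whose sources (Src0..Src2,
// supplied by the combiner rule) are all either f16 values extended to f32 or
// constants exactly representable in f16, so the computation can be redone
// directly in f16.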
bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
  Register SrcReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
    return false;

  return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
         isFPExtFromF16OrConst(MRI, Src2);
}

void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  Builder.setInstrAndDebugLoc(MI);

  // We expect the fptrunc (fpext x) pairs to fold away and any constant
  // sources to be constant-folded.
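  //
  // The expansion below uses the median-of-three identity
  //   med3(a, b, c) == min(max(a, b), max(min(a, b), c))
  // applied to the f16 values.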
  Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
  Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
  Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);

  LLT Ty = MRI.getType(Src0);
  auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
  auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
  auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
  Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
  MI.eraseFromParent();
}