//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUTargetTransformInfo.h"
19#include "GCNSubtarget.h"
20#include "llvm/ADT/FloatingPointMode.h"
21#include "llvm/IR/IntrinsicsAMDGPU.h"
22#include "llvm/Transforms/InstCombine/InstCombiner.h"
23#include <optional>
24
25using namespace llvm;
26using namespace llvm::PatternMatch;
27
28#define DEBUG_TYPE "AMDGPUtti"
29
30namespace {
31
32struct AMDGPUImageDMaskIntrinsic {
33 unsigned Intr;
34};
35
36#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
37#include "InstCombineTables.inc"
38
39} // end anonymous namespace
40
41// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
42//
43// A single NaN input is folded to minnum, so we rely on that folding for
44// handling NaNs.
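// For example (illustrative): with inputs 1.0, 3.0, 2.0 the triple maximum is
// 3.0, which compares equal to Src1, so the function returns
// maxnum(Src0, Src2) = 2.0, the median of the three values.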
45static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
46 const APFloat &Src2) {
47 APFloat Max3 = maxnum(A: maxnum(A: Src0, B: Src1), B: Src2);
48
49 APFloat::cmpResult Cmp0 = Max3.compare(RHS: Src0);
50 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
51 if (Cmp0 == APFloat::cmpEqual)
52 return maxnum(A: Src1, B: Src2);
53
54 APFloat::cmpResult Cmp1 = Max3.compare(RHS: Src1);
55 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
56 if (Cmp1 == APFloat::cmpEqual)
57 return maxnum(A: Src0, B: Src2);
58
59 return maxnum(A: Src0, B: Src1);
60}
61
62// Check if a value can be converted to a 16-bit value without losing
63// precision.
64// The value is expected to be either a float (IsFloat = true) or an unsigned
65// integer (IsFloat = false).
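// For example, a value defined by "fpext half %x to float" (IsFloat = true) or
// "zext i16 %x to i32" (IsFloat = false) is safe, as is a constant that fits in
// the narrow type (e.g. float 0.5 or i32 1234); an i32 constant such as 70000
// needs more than 16 bits and is rejected.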
66static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
67 Type *VTy = V.getType();
68 if (VTy->isHalfTy() || VTy->isIntegerTy(Bitwidth: 16)) {
69 // The value is already 16-bit, so we don't want to convert to 16-bit again!
70 return false;
71 }
72 if (IsFloat) {
73 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(Val: &V)) {
      // We need to check that casting the value down to half does not lose
      // precision.
76 APFloat FloatValue(ConstFloat->getValueAPF());
77 bool LosesInfo = true;
78 FloatValue.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero,
79 losesInfo: &LosesInfo);
80 return !LosesInfo;
81 }
82 } else {
83 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(Val: &V)) {
      // We need to check that casting the value down to i16 does not lose
      // precision.
86 APInt IntValue(ConstInt->getValue());
87 return IntValue.getActiveBits() <= 16;
88 }
89 }
90
91 Value *CastSrc;
92 bool IsExt = IsFloat ? match(V: &V, P: m_FPExt(Op: PatternMatch::m_Value(V&: CastSrc)))
93 : match(V: &V, P: m_ZExt(Op: PatternMatch::m_Value(V&: CastSrc)));
94 if (IsExt) {
95 Type *CastSrcTy = CastSrc->getType();
96 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(Bitwidth: 16))
97 return true;
98 }
99
100 return false;
101}
102
103// Convert a value to 16-bit.
104static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
105 Type *VTy = V.getType();
106 if (isa<FPExtInst>(Val: &V) || isa<SExtInst>(Val: &V) || isa<ZExtInst>(Val: &V))
107 return cast<Instruction>(Val: &V)->getOperand(i: 0);
108 if (VTy->isIntegerTy())
109 return Builder.CreateIntCast(V: &V, DestTy: Type::getInt16Ty(C&: V.getContext()), isSigned: false);
110 if (VTy->isFloatingPointTy())
111 return Builder.CreateFPCast(V: &V, DestTy: Type::getHalfTy(C&: V.getContext()));
112
113 llvm_unreachable("Should never be called!");
114}
115
116/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
117/// modified arguments (based on OldIntr) and replaces InstToReplace with
118/// this newly created intrinsic call.
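/// For example, the _L to _LZ rewrite below calls this with a lambda that just
/// erases the LOD operand from Args, producing an otherwise identical call to
/// the _LZ intrinsic that takes the place of the original instruction.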
119static std::optional<Instruction *> modifyIntrinsicCall(
120 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
121 InstCombiner &IC,
122 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
123 Func) {
124 SmallVector<Type *, 4> ArgTys;
125 if (!Intrinsic::getIntrinsicSignature(F: OldIntr.getCalledFunction(), ArgTys))
126 return std::nullopt;
127
128 SmallVector<Value *, 8> Args(OldIntr.args());
129
130 // Modify arguments and types
131 Func(Args, ArgTys);
132
133 Function *I = Intrinsic::getDeclaration(M: OldIntr.getModule(), id: NewIntr, Tys: ArgTys);
134
135 CallInst *NewCall = IC.Builder.CreateCall(Callee: I, Args);
136 NewCall->takeName(V: &OldIntr);
137 NewCall->copyMetadata(SrcInst: OldIntr);
138 if (isa<FPMathOperator>(Val: NewCall))
139 NewCall->copyFastMathFlags(I: &OldIntr);
140
141 // Erase and replace uses
142 if (!InstToReplace.getType()->isVoidTy())
143 IC.replaceInstUsesWith(I&: InstToReplace, V: NewCall);
144
145 bool RemoveOldIntr = &OldIntr != &InstToReplace;
146
147 auto RetValue = IC.eraseInstFromFunction(I&: InstToReplace);
148 if (RemoveOldIntr)
149 IC.eraseInstFromFunction(I&: OldIntr);
150
151 return RetValue;
152}
153
154static std::optional<Instruction *>
155simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
156 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
157 IntrinsicInst &II, InstCombiner &IC) {
158 // Optimize _L to _LZ when _L is zero
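  // For example (illustrative, most operands elided):
  //   %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d...(..., float 0.0, ...)
  // becomes
  //   %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d...(...)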
159 if (const auto *LZMappingInfo =
160 AMDGPU::getMIMGLZMappingInfo(L: ImageDimIntr->BaseOpcode)) {
161 if (auto *ConstantLod =
162 dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->LodIndex))) {
163 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
164 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
165 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: LZMappingInfo->LZ,
166 Dim: ImageDimIntr->Dim);
167 return modifyIntrinsicCall(
168 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
169 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
170 });
171 }
172 }
173 }
174
175 // Optimize _mip away, when 'lod' is zero
176 if (const auto *MIPMappingInfo =
177 AMDGPU::getMIMGMIPMappingInfo(MIP: ImageDimIntr->BaseOpcode)) {
178 if (auto *ConstantMip =
179 dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->MipIndex))) {
180 if (ConstantMip->isZero()) {
181 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
182 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: MIPMappingInfo->NONMIP,
183 Dim: ImageDimIntr->Dim);
184 return modifyIntrinsicCall(
185 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
186 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
187 });
188 }
189 }
190 }
191
192 // Optimize _bias away when 'bias' is zero
193 if (const auto *BiasMappingInfo =
194 AMDGPU::getMIMGBiasMappingInfo(Bias: ImageDimIntr->BaseOpcode)) {
195 if (auto *ConstantBias =
196 dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->BiasIndex))) {
197 if (ConstantBias->isZero()) {
198 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
199 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: BiasMappingInfo->NoBias,
200 Dim: ImageDimIntr->Dim);
201 return modifyIntrinsicCall(
202 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
203 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
204 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
205 });
206 }
207 }
208 }
209
210 // Optimize _offset away when 'offset' is zero
211 if (const auto *OffsetMappingInfo =
212 AMDGPU::getMIMGOffsetMappingInfo(Offset: ImageDimIntr->BaseOpcode)) {
213 if (auto *ConstantOffset =
214 dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->OffsetIndex))) {
215 if (ConstantOffset->isZero()) {
216 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
217 AMDGPU::getImageDimIntrinsicByBaseOpcode(
218 BaseOpcode: OffsetMappingInfo->NoOffset, Dim: ImageDimIntr->Dim);
219 return modifyIntrinsicCall(
220 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
221 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
222 });
223 }
224 }
225 }
226
227 // Try to use D16
228 if (ST->hasD16Images()) {
229
230 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
231 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode);
232
233 if (BaseOpcode->HasD16) {
234
      // If the only use of the image intrinsic is an fptrunc to half, replace
      // both the fptrunc and the image intrinsic with an equivalent image
      // intrinsic that has the D16 flag set.
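      // For example (illustrative, most operands elided):
      //   %v = call <4 x float> @llvm.amdgcn.image.sample.2d...(...)
      //   %h = fptrunc <4 x float> %v to <4 x half>
      // becomes the single D16 call
      //   %h = call <4 x half> @llvm.amdgcn.image.sample.2d...(...)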
238 if (II.hasOneUse()) {
239 Instruction *User = II.user_back();
240
241 if (User->getOpcode() == Instruction::FPTrunc &&
242 User->getType()->getScalarType()->isHalfTy()) {
243
244 return modifyIntrinsicCall(OldIntr&: II, InstToReplace&: *User, NewIntr: ImageDimIntr->Intr, IC,
245 Func: [&](auto &Args, auto &ArgTys) {
246 // Change return type of image intrinsic.
247 // Set it to return type of fptrunc.
248 ArgTys[0] = User->getType();
249 });
250 }
251 }
252 }
253 }
254
255 // Try to use A16 or G16
256 if (!ST->hasA16() && !ST->hasG16())
257 return std::nullopt;
258
259 // Address is interpreted as float if the instruction has a sampler or as
260 // unsigned int if there is no sampler.
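  // For example, 32-bit coordinates produced by "fpext half" (sampler case) or
  // "zext i16" (no-sampler case) can be narrowed back to 16 bits; if only the
  // gradient operands qualify, just those are converted (G16).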
261 bool HasSampler =
262 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode)->Sampler;
263 bool FloatCoord = false;
264 // true means derivatives can be converted to 16 bit, coordinates not
265 bool OnlyDerivatives = false;
266
267 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
268 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
269 Value *Coord = II.getOperand(i_nocapture: OperandIndex);
270 // If the values are not derived from 16-bit values, we cannot optimize.
271 if (!canSafelyConvertTo16Bit(V&: *Coord, IsFloat: HasSampler)) {
272 if (OperandIndex < ImageDimIntr->CoordStart ||
273 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
274 return std::nullopt;
275 }
276 // All gradients can be converted, so convert only them
277 OnlyDerivatives = true;
278 break;
279 }
280
281 assert(OperandIndex == ImageDimIntr->GradientStart ||
282 FloatCoord == Coord->getType()->isFloatingPointTy());
283 FloatCoord = Coord->getType()->isFloatingPointTy();
284 }
285
286 if (!OnlyDerivatives && !ST->hasA16())
287 OnlyDerivatives = true; // Only supports G16
288
289 // Check if there is a bias parameter and if it can be converted to f16
290 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
291 Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
292 assert(HasSampler &&
293 "Only image instructions with a sampler can have a bias");
294 if (!canSafelyConvertTo16Bit(V&: *Bias, IsFloat: HasSampler))
295 OnlyDerivatives = true;
296 }
297
298 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
299 ImageDimIntr->CoordStart))
300 return std::nullopt;
301
302 Type *CoordType = FloatCoord ? Type::getHalfTy(C&: II.getContext())
303 : Type::getInt16Ty(C&: II.getContext());
304
305 return modifyIntrinsicCall(
306 OldIntr&: II, InstToReplace&: II, NewIntr: II.getIntrinsicID(), IC, Func: [&](auto &Args, auto &ArgTys) {
307 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
308 if (!OnlyDerivatives) {
309 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
310
311 // Change the bias type
312 if (ImageDimIntr->NumBiasArgs != 0)
313 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(C&: II.getContext());
314 }
315
316 unsigned EndIndex =
317 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
318 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
319 OperandIndex < EndIndex; OperandIndex++) {
320 Args[OperandIndex] =
321 convertTo16Bit(V&: *II.getOperand(i_nocapture: OperandIndex), Builder&: IC.Builder);
322 }
323
324 // Convert the bias
325 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
326 Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
327 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(V&: *Bias, Builder&: IC.Builder);
328 }
329 });
330}
331
332bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
333 const Value *Op0, const Value *Op1,
334 InstCombiner &IC) const {
335 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
336 // infinity, gives +0.0. If we can prove we don't have one of the special
337 // cases then we can use a normal multiply instead.
338 // TODO: Create and use isKnownFiniteNonZero instead of just matching
339 // constants here.
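  // For example, if Op0 is the constant 2.0 (finite and non-zero), no operand
  // combination can hit the 0 * inf/NaN special case, so legacy and IEEE
  // multiplication agree for every value of Op1.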
340 if (match(V: Op0, P: PatternMatch::m_FiniteNonZero()) ||
341 match(V: Op1, P: PatternMatch::m_FiniteNonZero())) {
342 // One operand is not zero or infinity or NaN.
343 return true;
344 }
345
346 SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(I: &I);
347 if (isKnownNeverInfOrNaN(V: Op0, /*Depth=*/0, SQ) &&
348 isKnownNeverInfOrNaN(V: Op1, /*Depth=*/0, SQ)) {
349 // Neither operand is infinity or NaN.
350 return true;
351 }
352 return false;
353}
354
355/// Match an fpext from half to float, or a constant we can convert.
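/// For example, "fpext half %x to float" matches with FPExtSrc = %x, and the
/// float constant 0.5 (exactly representable in half) is rebuilt as half 0.5.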
356static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
357 if (match(V: Arg, P: m_OneUse(SubPattern: m_FPExt(Op: m_Value(V&: FPExtSrc)))))
358 return FPExtSrc->getType()->isHalfTy();
359
360 ConstantFP *CFP;
361 if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
362 bool LosesInfo;
363 APFloat Val(CFP->getValueAPF());
364 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
365 if (LosesInfo)
366 return false;
367
368 FPExtSrc = ConstantFP::get(Ty: Type::getHalfTy(C&: Arg->getContext()), V: Val);
369 return true;
370 }
371
372 return false;
373}
374
375// Trim all zero components from the end of the vector \p UseV and return
376// an appropriate bitset with known elements.
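// For example, a stored value <4 x float> <a, b, 0.0, 0.0> yields a demanded
// mask of 0b0011: the two trailing zero (or undef) lanes are dropped, while
// element 0 is always kept.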
377static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
378 Instruction *I) {
379 auto *VTy = cast<FixedVectorType>(Val: UseV->getType());
380 unsigned VWidth = VTy->getNumElements();
381 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
382
383 for (int i = VWidth - 1; i > 0; --i) {
384 auto *Elt = findScalarElement(V: UseV, EltNo: i);
385 if (!Elt)
386 break;
387
388 if (auto *ConstElt = dyn_cast<Constant>(Val: Elt)) {
389 if (!ConstElt->isNullValue() && !isa<UndefValue>(Val: Elt))
390 break;
391 } else {
392 break;
393 }
394
395 DemandedElts.clearBit(BitPosition: i);
396 }
397
398 return DemandedElts;
399}
400
401// Trim elements of the end of the vector \p V, if they are
402// equal to the first element of the vector.
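// For example, <4 x float> <x, y, x, x> yields a demanded mask of 0b0011, since
// the two trailing lanes repeat element 0 (element 0 itself is always kept).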
403static APInt defaultComponentBroadcast(Value *V) {
404 auto *VTy = cast<FixedVectorType>(Val: V->getType());
405 unsigned VWidth = VTy->getNumElements();
406 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
407 Value *FirstComponent = findScalarElement(V, EltNo: 0);
408
409 SmallVector<int> ShuffleMask;
410 if (auto *SVI = dyn_cast<ShuffleVectorInst>(Val: V))
411 SVI->getShuffleMask(Result&: ShuffleMask);
412
413 for (int I = VWidth - 1; I > 0; --I) {
414 if (ShuffleMask.empty()) {
415 auto *Elt = findScalarElement(V, EltNo: I);
416 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Val: Elt)))
417 break;
418 } else {
419 // Detect identical elements in the shufflevector result, even though
420 // findScalarElement cannot tell us what that element is.
421 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
422 break;
423 }
424 DemandedElts.clearBit(BitPosition: I);
425 }
426
427 return DemandedElts;
428}
429
430static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
431 IntrinsicInst &II,
432 APInt DemandedElts,
433 int DMaskIdx = -1,
434 bool IsLoad = true);
435
436/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
437static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
438 return (SqrtOp->getType()->isFloatTy() &&
439 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
440 SqrtOp->getType()->isHalfTy();
441}
442
443std::optional<Instruction *>
444GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
445 Intrinsic::ID IID = II.getIntrinsicID();
446 switch (IID) {
447 case Intrinsic::amdgcn_rcp: {
448 Value *Src = II.getArgOperand(i: 0);
449
450 // TODO: Move to ConstantFolding/InstSimplify?
451 if (isa<UndefValue>(Val: Src)) {
452 Type *Ty = II.getType();
453 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
454 return IC.replaceInstUsesWith(I&: II, V: QNaN);
455 }
456
457 if (II.isStrictFP())
458 break;
459
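    // Constant fold the reciprocal of a constant, e.g. rcp(2.0) --> 0.5.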
460 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
461 const APFloat &ArgVal = C->getValueAPF();
462 APFloat Val(ArgVal.getSemantics(), 1);
463 Val.divide(RHS: ArgVal, RM: APFloat::rmNearestTiesToEven);
464
465 // This is more precise than the instruction may give.
466 //
467 // TODO: The instruction always flushes denormal results (except for f16),
468 // should this also?
469 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Val));
470 }
471
472 FastMathFlags FMF = cast<FPMathOperator>(Val&: II).getFastMathFlags();
473 if (!FMF.allowContract())
474 break;
475 auto *SrcCI = dyn_cast<IntrinsicInst>(Val: Src);
476 if (!SrcCI)
477 break;
478
479 auto IID = SrcCI->getIntrinsicID();
480 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
481 //
482 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
483 // relaxed.
484 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
485 const FPMathOperator *SqrtOp = cast<FPMathOperator>(Val: SrcCI);
486 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
487 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
488 break;
489
490 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
491 break;
492
493 Function *NewDecl = Intrinsic::getDeclaration(
494 M: SrcCI->getModule(), Intrinsic::id: amdgcn_rsq, Tys: {SrcCI->getType()});
495
496 InnerFMF |= FMF;
497 II.setFastMathFlags(InnerFMF);
498
499 II.setCalledFunction(NewDecl);
500 return IC.replaceOperand(I&: II, OpNum: 0, V: SrcCI->getArgOperand(i: 0));
501 }
502
503 break;
504 }
505 case Intrinsic::amdgcn_sqrt:
506 case Intrinsic::amdgcn_rsq: {
507 Value *Src = II.getArgOperand(i: 0);
508
509 // TODO: Move to ConstantFolding/InstSimplify?
510 if (isa<UndefValue>(Val: Src)) {
511 Type *Ty = II.getType();
512 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
513 return IC.replaceInstUsesWith(I&: II, V: QNaN);
514 }
515
516 // f16 amdgcn.sqrt is identical to regular sqrt.
517 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
518 Function *NewDecl = Intrinsic::getDeclaration(
519 M: II.getModule(), Intrinsic::id: sqrt, Tys: {II.getType()});
520 II.setCalledFunction(NewDecl);
521 return &II;
522 }
523
524 break;
525 }
526 case Intrinsic::amdgcn_log:
527 case Intrinsic::amdgcn_exp2: {
528 const bool IsLog = IID == Intrinsic::amdgcn_log;
529 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
530 Value *Src = II.getArgOperand(i: 0);
531 Type *Ty = II.getType();
532
533 if (isa<PoisonValue>(Val: Src))
534 return IC.replaceInstUsesWith(I&: II, V: Src);
535
536 if (IC.getSimplifyQuery().isUndefValue(V: Src))
537 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
538
539 if (ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
540 if (C->isInfinity()) {
541 // exp2(+inf) -> +inf
542 // log2(+inf) -> +inf
543 if (!C->isNegative())
544 return IC.replaceInstUsesWith(I&: II, V: C);
545
546 // exp2(-inf) -> 0
547 if (IsExp && C->isNegative())
548 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty));
549 }
550
551 if (II.isStrictFP())
552 break;
553
554 if (C->isNaN()) {
555 Constant *Quieted = ConstantFP::get(Ty, V: C->getValue().makeQuiet());
556 return IC.replaceInstUsesWith(I&: II, V: Quieted);
557 }
558
559 // f32 instruction doesn't handle denormals, f16 does.
560 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
561 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, Negative: true)
562 : ConstantFP::get(Ty, V: 1.0);
563 return IC.replaceInstUsesWith(I&: II, V: FoldedValue);
564 }
565
566 if (IsLog && C->isNegative())
567 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
568
569 // TODO: Full constant folding matching hardware behavior.
570 }
571
572 break;
573 }
574 case Intrinsic::amdgcn_frexp_mant:
575 case Intrinsic::amdgcn_frexp_exp: {
576 Value *Src = II.getArgOperand(i: 0);
577 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
578 int Exp;
579 APFloat Significand =
580 frexp(X: C->getValueAPF(), Exp, RM: APFloat::rmNearestTiesToEven);
581
582 if (IID == Intrinsic::amdgcn_frexp_mant) {
583 return IC.replaceInstUsesWith(
584 I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Significand));
585 }
586
587 // Match instruction special case behavior.
588 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
589 Exp = 0;
590
591 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: Exp));
592 }
593
594 if (isa<UndefValue>(Val: Src)) {
595 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
596 }
597
598 break;
599 }
600 case Intrinsic::amdgcn_class: {
601 Value *Src0 = II.getArgOperand(i: 0);
602 Value *Src1 = II.getArgOperand(i: 1);
603 const ConstantInt *CMask = dyn_cast<ConstantInt>(Val: Src1);
604 if (CMask) {
605 II.setCalledOperand(Intrinsic::getDeclaration(
606 M: II.getModule(), Intrinsic::id: is_fpclass, Tys: Src0->getType()));
607
608 // Clamp any excess bits, as they're illegal for the generic intrinsic.
609 II.setArgOperand(i: 1, v: ConstantInt::get(Ty: Src1->getType(),
610 V: CMask->getZExtValue() & fcAllFlags));
611 return &II;
612 }
613
614 // Propagate poison.
615 if (isa<PoisonValue>(Val: Src0) || isa<PoisonValue>(Val: Src1))
616 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
617
618 // llvm.amdgcn.class(_, undef) -> false
619 if (IC.getSimplifyQuery().isUndefValue(V: Src1))
620 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: false));
621
622 // llvm.amdgcn.class(undef, mask) -> mask != 0
623 if (IC.getSimplifyQuery().isUndefValue(V: Src0)) {
624 Value *CmpMask = IC.Builder.CreateICmpNE(
625 LHS: Src1, RHS: ConstantInt::getNullValue(Ty: Src1->getType()));
626 return IC.replaceInstUsesWith(I&: II, V: CmpMask);
627 }
628 break;
629 }
630 case Intrinsic::amdgcn_cvt_pkrtz: {
631 Value *Src0 = II.getArgOperand(i: 0);
632 Value *Src1 = II.getArgOperand(i: 1);
633 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
634 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
635 const fltSemantics &HalfSem =
636 II.getType()->getScalarType()->getFltSemantics();
637 bool LosesInfo;
638 APFloat Val0 = C0->getValueAPF();
639 APFloat Val1 = C1->getValueAPF();
640 Val0.convert(ToSemantics: HalfSem, RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
641 Val1.convert(ToSemantics: HalfSem, RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
642
643 Constant *Folded =
644 ConstantVector::get(V: {ConstantFP::get(Context&: II.getContext(), V: Val0),
645 ConstantFP::get(Context&: II.getContext(), V: Val1)});
646 return IC.replaceInstUsesWith(I&: II, V: Folded);
647 }
648 }
649
650 if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
651 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
652 }
653
654 break;
655 }
656 case Intrinsic::amdgcn_cvt_pknorm_i16:
657 case Intrinsic::amdgcn_cvt_pknorm_u16:
658 case Intrinsic::amdgcn_cvt_pk_i16:
659 case Intrinsic::amdgcn_cvt_pk_u16: {
660 Value *Src0 = II.getArgOperand(i: 0);
661 Value *Src1 = II.getArgOperand(i: 1);
662
663 if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
664 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
665 }
666
667 break;
668 }
669 case Intrinsic::amdgcn_ubfe:
670 case Intrinsic::amdgcn_sbfe: {
671 // Decompose simple cases into standard shifts.
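    // For example, with i32 operands, ubfe(%x, 8, 8) extracts bits 8..15 and
    // becomes lshr(shl(%x, 16), 24); sbfe uses ashr instead so the extracted
    // field is sign-extended.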
672 Value *Src = II.getArgOperand(i: 0);
673 if (isa<UndefValue>(Val: Src)) {
674 return IC.replaceInstUsesWith(I&: II, V: Src);
675 }
676
677 unsigned Width;
678 Type *Ty = II.getType();
679 unsigned IntSize = Ty->getIntegerBitWidth();
680
681 ConstantInt *CWidth = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 2));
682 if (CWidth) {
683 Width = CWidth->getZExtValue();
684 if ((Width & (IntSize - 1)) == 0) {
685 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getNullValue(Ty));
686 }
687
688 // Hardware ignores high bits, so remove those.
689 if (Width >= IntSize) {
690 return IC.replaceOperand(
691 I&: II, OpNum: 2, V: ConstantInt::get(Ty: CWidth->getType(), V: Width & (IntSize - 1)));
692 }
693 }
694
695 unsigned Offset;
696 ConstantInt *COffset = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 1));
697 if (COffset) {
698 Offset = COffset->getZExtValue();
699 if (Offset >= IntSize) {
700 return IC.replaceOperand(
701 I&: II, OpNum: 1,
702 V: ConstantInt::get(Ty: COffset->getType(), V: Offset & (IntSize - 1)));
703 }
704 }
705
706 bool Signed = IID == Intrinsic::amdgcn_sbfe;
707
708 if (!CWidth || !COffset)
709 break;
710
711 // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr instructions would produce poison
    // values, since the shift amount would equal the bit size.
714 assert(Width != 0);
715
716 // TODO: This allows folding to undef when the hardware has specific
717 // behavior?
718 if (Offset + Width < IntSize) {
719 Value *Shl = IC.Builder.CreateShl(LHS: Src, RHS: IntSize - Offset - Width);
720 Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Shl, RHS: IntSize - Width)
721 : IC.Builder.CreateLShr(LHS: Shl, RHS: IntSize - Width);
722 RightShift->takeName(V: &II);
723 return IC.replaceInstUsesWith(I&: II, V: RightShift);
724 }
725
726 Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Src, RHS: Offset)
727 : IC.Builder.CreateLShr(LHS: Src, RHS: Offset);
728
729 RightShift->takeName(V: &II);
730 return IC.replaceInstUsesWith(I&: II, V: RightShift);
731 }
732 case Intrinsic::amdgcn_exp:
733 case Intrinsic::amdgcn_exp_row:
734 case Intrinsic::amdgcn_exp_compr: {
735 ConstantInt *En = cast<ConstantInt>(Val: II.getArgOperand(i: 1));
736 unsigned EnBits = En->getZExtValue();
737 if (EnBits == 0xf)
738 break; // All inputs enabled.
739
740 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
741 bool Changed = false;
742 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
743 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
744 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
745 Value *Src = II.getArgOperand(i: I + 2);
746 if (!isa<UndefValue>(Val: Src)) {
747 IC.replaceOperand(I&: II, OpNum: I + 2, V: UndefValue::get(T: Src->getType()));
748 Changed = true;
749 }
750 }
751 }
752
753 if (Changed) {
754 return &II;
755 }
756
757 break;
758 }
759 case Intrinsic::amdgcn_fmed3: {
760 // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
761 // for the shader.
762
763 Value *Src0 = II.getArgOperand(i: 0);
764 Value *Src1 = II.getArgOperand(i: 1);
765 Value *Src2 = II.getArgOperand(i: 2);
766
767 // Checking for NaN before canonicalization provides better fidelity when
768 // mapping other operations onto fmed3 since the order of operands is
769 // unchanged.
770 Value *V = nullptr;
771 if (match(V: Src0, P: PatternMatch::m_NaN()) || isa<UndefValue>(Val: Src0)) {
772 V = IC.Builder.CreateMinNum(LHS: Src1, RHS: Src2);
773 } else if (match(V: Src1, P: PatternMatch::m_NaN()) || isa<UndefValue>(Val: Src1)) {
774 V = IC.Builder.CreateMinNum(LHS: Src0, RHS: Src2);
775 } else if (match(V: Src2, P: PatternMatch::m_NaN()) || isa<UndefValue>(Val: Src2)) {
776 V = IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src1);
777 }
778
779 if (V) {
780 if (auto *CI = dyn_cast<CallInst>(Val: V)) {
781 CI->copyFastMathFlags(I: &II);
782 CI->takeName(V: &II);
783 }
784 return IC.replaceInstUsesWith(I&: II, V);
785 }
786
787 bool Swap = false;
788 // Canonicalize constants to RHS operands.
789 //
790 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
791 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
792 std::swap(a&: Src0, b&: Src1);
793 Swap = true;
794 }
795
796 if (isa<Constant>(Val: Src1) && !isa<Constant>(Val: Src2)) {
797 std::swap(a&: Src1, b&: Src2);
798 Swap = true;
799 }
800
801 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
802 std::swap(a&: Src0, b&: Src1);
803 Swap = true;
804 }
805
806 if (Swap) {
807 II.setArgOperand(i: 0, v: Src0);
808 II.setArgOperand(i: 1, v: Src1);
809 II.setArgOperand(i: 2, v: Src2);
810 return &II;
811 }
812
813 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
814 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
815 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Val: Src2)) {
816 APFloat Result = fmed3AMDGCN(Src0: C0->getValueAPF(), Src1: C1->getValueAPF(),
817 Src2: C2->getValueAPF());
818 return IC.replaceInstUsesWith(
819 I&: II, V: ConstantFP::get(Context&: IC.Builder.getContext(), V: Result));
820 }
821 }
822 }
823
824 if (!ST->hasMed3_16())
825 break;
826
827 Value *X, *Y, *Z;
828
829 // Repeat floating-point width reduction done for minnum/maxnum.
830 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
831 if (matchFPExtFromF16(Arg: Src0, FPExtSrc&: X) && matchFPExtFromF16(Arg: Src1, FPExtSrc&: Y) &&
832 matchFPExtFromF16(Arg: Src2, FPExtSrc&: Z)) {
833 Value *NewCall = IC.Builder.CreateIntrinsic(ID: IID, Types: {X->getType()},
834 Args: {X, Y, Z}, FMFSource: &II, Name: II.getName());
835 return new FPExtInst(NewCall, II.getType());
836 }
837
838 break;
839 }
840 case Intrinsic::amdgcn_icmp:
841 case Intrinsic::amdgcn_fcmp: {
842 const ConstantInt *CC = cast<ConstantInt>(Val: II.getArgOperand(i: 2));
843 // Guard against invalid arguments.
844 int64_t CCVal = CC->getZExtValue();
845 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
846 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
847 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
848 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
849 CCVal > CmpInst::LAST_FCMP_PREDICATE)))
850 break;
851
852 Value *Src0 = II.getArgOperand(i: 0);
853 Value *Src1 = II.getArgOperand(i: 1);
854
855 if (auto *CSrc0 = dyn_cast<Constant>(Val: Src0)) {
856 if (auto *CSrc1 = dyn_cast<Constant>(Val: Src1)) {
857 Constant *CCmp = ConstantExpr::getCompare(pred: CCVal, C1: CSrc0, C2: CSrc1);
858 if (CCmp->isNullValue()) {
859 return IC.replaceInstUsesWith(
860 I&: II, V: IC.Builder.CreateSExt(V: CCmp, DestTy: II.getType()));
861 }
862
863 // The result of V_ICMP/V_FCMP assembly instructions (which this
864 // intrinsic exposes) is one bit per thread, masked with the EXEC
865 // register (which contains the bitmask of live threads). So a
866 // comparison that always returns true is the same as a read of the
867 // EXEC register.
868 Function *NewF = Intrinsic::getDeclaration(
869 M: II.getModule(), Intrinsic::id: read_register, Tys: II.getType());
870 Metadata *MDArgs[] = {MDString::get(Context&: II.getContext(), Str: "exec")};
871 MDNode *MD = MDNode::get(Context&: II.getContext(), MDs: MDArgs);
872 Value *Args[] = {MetadataAsValue::get(Context&: II.getContext(), MD)};
873 CallInst *NewCall = IC.Builder.CreateCall(Callee: NewF, Args);
874 NewCall->addFnAttr(Attribute::Convergent);
875 NewCall->takeName(V: &II);
876 return IC.replaceInstUsesWith(I&: II, V: NewCall);
877 }
878
879 // Canonicalize constants to RHS.
880 CmpInst::Predicate SwapPred =
881 CmpInst::getSwappedPredicate(pred: static_cast<CmpInst::Predicate>(CCVal));
882 II.setArgOperand(i: 0, v: Src1);
883 II.setArgOperand(i: 1, v: Src0);
884 II.setArgOperand(
885 i: 2, v: ConstantInt::get(Ty: CC->getType(), V: static_cast<int>(SwapPred)));
886 return &II;
887 }
888
889 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
890 break;
891
892 // Canonicalize compare eq with true value to compare != 0
893 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
894 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
895 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
896 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
897 Value *ExtSrc;
898 if (CCVal == CmpInst::ICMP_EQ &&
899 ((match(V: Src1, P: PatternMatch::m_One()) &&
900 match(V: Src0, P: m_ZExt(Op: PatternMatch::m_Value(V&: ExtSrc)))) ||
901 (match(V: Src1, P: PatternMatch::m_AllOnes()) &&
902 match(V: Src0, P: m_SExt(Op: PatternMatch::m_Value(V&: ExtSrc))))) &&
903 ExtSrc->getType()->isIntegerTy(Bitwidth: 1)) {
904 IC.replaceOperand(I&: II, OpNum: 1, V: ConstantInt::getNullValue(Ty: Src1->getType()));
905 IC.replaceOperand(I&: II, OpNum: 2,
906 V: ConstantInt::get(Ty: CC->getType(), V: CmpInst::ICMP_NE));
907 return &II;
908 }
909
910 CmpInst::Predicate SrcPred;
911 Value *SrcLHS;
912 Value *SrcRHS;
913
914 // Fold compare eq/ne with 0 from a compare result as the predicate to the
915 // intrinsic. The typical use is a wave vote function in the library, which
916 // will be fed from a user code condition compared with 0. Fold in the
917 // redundant compare.
918
919 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
920 // -> llvm.amdgcn.[if]cmp(a, b, pred)
921 //
922 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
923 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
924 if (match(V: Src1, P: PatternMatch::m_Zero()) &&
925 match(V: Src0, P: PatternMatch::m_ZExtOrSExt(
926 Op: m_Cmp(Pred&: SrcPred, L: PatternMatch::m_Value(V&: SrcLHS),
927 R: PatternMatch::m_Value(V&: SrcRHS))))) {
928 if (CCVal == CmpInst::ICMP_EQ)
929 SrcPred = CmpInst::getInversePredicate(pred: SrcPred);
930
931 Intrinsic::ID NewIID = CmpInst::isFPPredicate(P: SrcPred)
932 ? Intrinsic::amdgcn_fcmp
933 : Intrinsic::amdgcn_icmp;
934
935 Type *Ty = SrcLHS->getType();
936 if (auto *CmpType = dyn_cast<IntegerType>(Val: Ty)) {
937 // Promote to next legal integer type.
938 unsigned Width = CmpType->getBitWidth();
939 unsigned NewWidth = Width;
940
941 // Don't do anything for i1 comparisons.
942 if (Width == 1)
943 break;
944
945 if (Width <= 16)
946 NewWidth = 16;
947 else if (Width <= 32)
948 NewWidth = 32;
949 else if (Width <= 64)
950 NewWidth = 64;
951 else
952 break; // Can't handle this.
953
954 if (Width != NewWidth) {
955 IntegerType *CmpTy = IC.Builder.getIntNTy(N: NewWidth);
956 if (CmpInst::isSigned(predicate: SrcPred)) {
957 SrcLHS = IC.Builder.CreateSExt(V: SrcLHS, DestTy: CmpTy);
958 SrcRHS = IC.Builder.CreateSExt(V: SrcRHS, DestTy: CmpTy);
959 } else {
960 SrcLHS = IC.Builder.CreateZExt(V: SrcLHS, DestTy: CmpTy);
961 SrcRHS = IC.Builder.CreateZExt(V: SrcRHS, DestTy: CmpTy);
962 }
963 }
964 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
965 break;
966
967 Function *NewF = Intrinsic::getDeclaration(
968 M: II.getModule(), id: NewIID, Tys: {II.getType(), SrcLHS->getType()});
969 Value *Args[] = {SrcLHS, SrcRHS,
970 ConstantInt::get(Ty: CC->getType(), V: SrcPred)};
971 CallInst *NewCall = IC.Builder.CreateCall(Callee: NewF, Args);
972 NewCall->takeName(V: &II);
973 return IC.replaceInstUsesWith(I&: II, V: NewCall);
974 }
975
976 break;
977 }
978 case Intrinsic::amdgcn_mbcnt_hi: {
979 // exec_hi is all 0, so this is just a copy.
980 if (ST->isWave32())
981 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 1));
982 break;
983 }
984 case Intrinsic::amdgcn_ballot: {
985 if (auto *Src = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0))) {
986 if (Src->isZero()) {
987 // amdgcn.ballot(i1 0) is zero.
988 return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
989 }
990 }
991 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
992 // %b64 = call i64 ballot.i64(...)
993 // =>
994 // %b32 = call i32 ballot.i32(...)
995 // %b64 = zext i32 %b32 to i64
996 Value *Call = IC.Builder.CreateZExt(
997 V: IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
998 {IC.Builder.getInt32Ty()},
999 {II.getArgOperand(i: 0)}),
1000 DestTy: II.getType());
1001 Call->takeName(V: &II);
1002 return IC.replaceInstUsesWith(I&: II, V: Call);
1003 }
1004 break;
1005 }
1006 case Intrinsic::amdgcn_wqm_vote: {
1007 // wqm_vote is identity when the argument is constant.
1008 if (!isa<Constant>(Val: II.getArgOperand(i: 0)))
1009 break;
1010
1011 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 0));
1012 }
1013 case Intrinsic::amdgcn_kill: {
1014 const ConstantInt *C = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
1015 if (!C || !C->getZExtValue())
1016 break;
1017
1018 // amdgcn.kill(i1 1) is a no-op
1019 return IC.eraseInstFromFunction(I&: II);
1020 }
1021 case Intrinsic::amdgcn_update_dpp: {
1022 Value *Old = II.getArgOperand(i: 0);
1023
1024 auto *BC = cast<ConstantInt>(Val: II.getArgOperand(i: 5));
1025 auto *RM = cast<ConstantInt>(Val: II.getArgOperand(i: 3));
1026 auto *BM = cast<ConstantInt>(Val: II.getArgOperand(i: 4));
1027 if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1028 BM->getZExtValue() != 0xF || isa<UndefValue>(Val: Old))
1029 break;
1030
1031 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1032 return IC.replaceOperand(I&: II, OpNum: 0, V: UndefValue::get(T: Old->getType()));
1033 }
1034 case Intrinsic::amdgcn_permlane16:
1035 case Intrinsic::amdgcn_permlane16_var:
1036 case Intrinsic::amdgcn_permlanex16:
1037 case Intrinsic::amdgcn_permlanex16_var: {
1038 // Discard vdst_in if it's not going to be read.
1039 Value *VDstIn = II.getArgOperand(i: 0);
1040 if (isa<UndefValue>(Val: VDstIn))
1041 break;
1042
1043 // FetchInvalid operand idx.
1044 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1045 IID == Intrinsic::amdgcn_permlanex16)
1046 ? 4 /* for permlane16 and permlanex16 */
1047 : 3; /* for permlane16_var and permlanex16_var */
1048
1049 // BoundCtrl operand idx.
1050 // For permlane16 and permlanex16 it should be 5
1051 // For Permlane16_var and permlanex16_var it should be 4
1052 unsigned int BcIdx = FiIdx + 1;
1053
1054 ConstantInt *FetchInvalid = cast<ConstantInt>(Val: II.getArgOperand(i: FiIdx));
1055 ConstantInt *BoundCtrl = cast<ConstantInt>(Val: II.getArgOperand(i: BcIdx));
1056 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1057 break;
1058
1059 return IC.replaceOperand(I&: II, OpNum: 0, V: UndefValue::get(T: VDstIn->getType()));
1060 }
1061 case Intrinsic::amdgcn_permlane64:
1062 // A constant value is trivially uniform.
1063 if (Constant *C = dyn_cast<Constant>(Val: II.getArgOperand(i: 0))) {
1064 return IC.replaceInstUsesWith(I&: II, V: C);
1065 }
1066 break;
1067 case Intrinsic::amdgcn_readfirstlane:
1068 case Intrinsic::amdgcn_readlane: {
1069 // A constant value is trivially uniform.
1070 if (Constant *C = dyn_cast<Constant>(Val: II.getArgOperand(i: 0))) {
1071 return IC.replaceInstUsesWith(I&: II, V: C);
1072 }
1073
1074 // The rest of these may not be safe if the exec may not be the same between
1075 // the def and use.
1076 Value *Src = II.getArgOperand(i: 0);
1077 Instruction *SrcInst = dyn_cast<Instruction>(Val: Src);
1078 if (SrcInst && SrcInst->getParent() != II.getParent())
1079 break;
1080
1081 // readfirstlane (readfirstlane x) -> readfirstlane x
1082 // readlane (readfirstlane x), y -> readfirstlane x
1083 if (match(Src,
1084 PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
1085 return IC.replaceInstUsesWith(I&: II, V: Src);
1086 }
1087
1088 if (IID == Intrinsic::amdgcn_readfirstlane) {
1089 // readfirstlane (readlane x, y) -> readlane x, y
1090 if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
1091 return IC.replaceInstUsesWith(I&: II, V: Src);
1092 }
1093 } else {
1094 // readlane (readlane x, y), y -> readlane x, y
1095 if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
1096 PatternMatch::m_Value(),
1097 PatternMatch::m_Specific(II.getArgOperand(1))))) {
1098 return IC.replaceInstUsesWith(I&: II, V: Src);
1099 }
1100 }
1101
1102 break;
1103 }
1104 case Intrinsic::amdgcn_fmul_legacy: {
1105 Value *Op0 = II.getArgOperand(i: 0);
1106 Value *Op1 = II.getArgOperand(i: 1);
1107
1108 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1109 // infinity, gives +0.0.
1110 // TODO: Move to InstSimplify?
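    // For example, fmul_legacy(%x, -0.0) folds to +0.0 here regardless of %x,
    // matching the legacy hardware rule above.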
1111 if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
1112 match(V: Op1, P: PatternMatch::m_AnyZeroFP()))
1113 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
1114
1115 // If we can prove we don't have one of the special cases then we can use a
1116 // normal fmul instruction instead.
1117 if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
1118 auto *FMul = IC.Builder.CreateFMulFMF(L: Op0, R: Op1, FMFSource: &II);
1119 FMul->takeName(V: &II);
1120 return IC.replaceInstUsesWith(I&: II, V: FMul);
1121 }
1122 break;
1123 }
1124 case Intrinsic::amdgcn_fma_legacy: {
1125 Value *Op0 = II.getArgOperand(i: 0);
1126 Value *Op1 = II.getArgOperand(i: 1);
1127 Value *Op2 = II.getArgOperand(i: 2);
1128
1129 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1130 // infinity, gives +0.0.
1131 // TODO: Move to InstSimplify?
1132 if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
1133 match(V: Op1, P: PatternMatch::m_AnyZeroFP())) {
1134 // It's tempting to just return Op2 here, but that would give the wrong
1135 // result if Op2 was -0.0.
1136 auto *Zero = ConstantFP::getZero(Ty: II.getType());
1137 auto *FAdd = IC.Builder.CreateFAddFMF(L: Zero, R: Op2, FMFSource: &II);
1138 FAdd->takeName(V: &II);
1139 return IC.replaceInstUsesWith(I&: II, V: FAdd);
1140 }
1141
1142 // If we can prove we don't have one of the special cases then we can use a
1143 // normal fma instead.
1144 if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
1145 II.setCalledOperand(Intrinsic::getDeclaration(
1146 II.getModule(), Intrinsic::fma, II.getType()));
1147 return &II;
1148 }
1149 break;
1150 }
1151 case Intrinsic::amdgcn_is_shared:
1152 case Intrinsic::amdgcn_is_private: {
1153 if (isa<UndefValue>(Val: II.getArgOperand(i: 0)))
1154 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
1155
1156 if (isa<ConstantPointerNull>(Val: II.getArgOperand(i: 0)))
1157 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getFalse(Ty: II.getType()));
1158 break;
1159 }
1160 case Intrinsic::amdgcn_buffer_store_format:
1161 case Intrinsic::amdgcn_raw_buffer_store_format:
1162 case Intrinsic::amdgcn_struct_buffer_store_format:
1163 case Intrinsic::amdgcn_raw_tbuffer_store:
1164 case Intrinsic::amdgcn_struct_tbuffer_store:
1165 case Intrinsic::amdgcn_tbuffer_store:
1166 case Intrinsic::amdgcn_image_store_1d:
1167 case Intrinsic::amdgcn_image_store_1darray:
1168 case Intrinsic::amdgcn_image_store_2d:
1169 case Intrinsic::amdgcn_image_store_2darray:
1170 case Intrinsic::amdgcn_image_store_2darraymsaa:
1171 case Intrinsic::amdgcn_image_store_2dmsaa:
1172 case Intrinsic::amdgcn_image_store_3d:
1173 case Intrinsic::amdgcn_image_store_cube:
1174 case Intrinsic::amdgcn_image_store_mip_1d:
1175 case Intrinsic::amdgcn_image_store_mip_1darray:
1176 case Intrinsic::amdgcn_image_store_mip_2d:
1177 case Intrinsic::amdgcn_image_store_mip_2darray:
1178 case Intrinsic::amdgcn_image_store_mip_3d:
1179 case Intrinsic::amdgcn_image_store_mip_cube: {
1180 if (!isa<FixedVectorType>(Val: II.getArgOperand(i: 0)->getType()))
1181 break;
1182
1183 APInt DemandedElts;
1184 if (ST->hasDefaultComponentBroadcast())
1185 DemandedElts = defaultComponentBroadcast(V: II.getArgOperand(i: 0));
1186 else if (ST->hasDefaultComponentZero())
1187 DemandedElts = trimTrailingZerosInVector(IC, UseV: II.getArgOperand(i: 0), I: &II);
1188 else
1189 break;
1190
1191 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1192 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1193 IsLoad: false)) {
1194 return IC.eraseInstFromFunction(I&: II);
1195 }
1196
1197 break;
1198 }
1199 }
1200 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1201 AMDGPU::getImageDimIntrinsicInfo(Intr: II.getIntrinsicID())) {
1202 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1203 }
1204 return std::nullopt;
1205}
1206
1207/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1208///
/// Simplifying an amdgcn image or buffer store intrinsic updates the definition
/// of the intrinsic's vector argument, rather than the uses of the result as is
/// done for image and buffer loads.
1212/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1213/// struct returns.
1214static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1215 IntrinsicInst &II,
1216 APInt DemandedElts,
1217 int DMaskIdx, bool IsLoad) {
1218
1219 auto *IIVTy = cast<FixedVectorType>(Val: IsLoad ? II.getType()
1220 : II.getOperand(i_nocapture: 0)->getType());
1221 unsigned VWidth = IIVTy->getNumElements();
1222 if (VWidth == 1)
1223 return nullptr;
1224 Type *EltTy = IIVTy->getElementType();
1225
1226 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1227 IC.Builder.SetInsertPoint(&II);
1228
1229 // Assume the arguments are unchanged and later override them, if needed.
1230 SmallVector<Value *, 16> Args(II.args());
1231
1232 if (DMaskIdx < 0) {
1233 // Buffer case.
1234
1235 const unsigned ActiveBits = DemandedElts.getActiveBits();
1236 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1237
1238 // Start assuming the prefix of elements is demanded, but possibly clear
1239 // some other bits if there are trailing zeros (unused components at front)
1240 // and update offset.
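    // For example, if only elements 2 and 3 of a <4 x float> raw.buffer.load
    // are used, the demanded mask becomes 0b1100, the load is shrunk to
    // <2 x float>, and 8 bytes (two unused leading components) are folded into
    // the offset operand.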
1241 DemandedElts = (1 << ActiveBits) - 1;
1242
1243 if (UnusedComponentsAtFront > 0) {
1244 static const unsigned InvalidOffsetIdx = 0xf;
1245
1246 unsigned OffsetIdx;
1247 switch (II.getIntrinsicID()) {
1248 case Intrinsic::amdgcn_raw_buffer_load:
1249 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1250 OffsetIdx = 1;
1251 break;
1252 case Intrinsic::amdgcn_s_buffer_load:
1253 // If resulting type is vec3, there is no point in trimming the
1254 // load with updated offset, as the vec3 would most likely be widened to
1255 // vec4 anyway during lowering.
1256 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1257 OffsetIdx = InvalidOffsetIdx;
1258 else
1259 OffsetIdx = 1;
1260 break;
1261 case Intrinsic::amdgcn_struct_buffer_load:
1262 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1263 OffsetIdx = 2;
1264 break;
1265 default:
1266 // TODO: handle tbuffer* intrinsics.
1267 OffsetIdx = InvalidOffsetIdx;
1268 break;
1269 }
1270
1271 if (OffsetIdx != InvalidOffsetIdx) {
1272 // Clear demanded bits and update the offset.
1273 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1274 auto *Offset = Args[OffsetIdx];
1275 unsigned SingleComponentSizeInBits =
1276 IC.getDataLayout().getTypeSizeInBits(Ty: EltTy);
1277 unsigned OffsetAdd =
1278 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1279 auto *OffsetAddVal = ConstantInt::get(Ty: Offset->getType(), V: OffsetAdd);
1280 Args[OffsetIdx] = IC.Builder.CreateAdd(LHS: Offset, RHS: OffsetAddVal);
1281 }
1282 }
1283 } else {
1284 // Image case.
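    // For example, a load with dmask = 0b0111 of which only result element 0
    // is used is rewritten to a scalar load with dmask = 0b0001, and the scalar
    // result is inserted back into a vector for the remaining users.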
1285
1286 ConstantInt *DMask = cast<ConstantInt>(Val: Args[DMaskIdx]);
1287 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1288
1289 // dmask 0 has special semantics, do not simplify.
1290 if (DMaskVal == 0)
1291 return nullptr;
1292
1293 // Mask off values that are undefined because the dmask doesn't cover them
1294 DemandedElts &= (1 << llvm::popcount(Value: DMaskVal)) - 1;
1295
1296 unsigned NewDMaskVal = 0;
1297 unsigned OrigLdStIdx = 0;
1298 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1299 const unsigned Bit = 1 << SrcIdx;
1300 if (!!(DMaskVal & Bit)) {
1301 if (!!DemandedElts[OrigLdStIdx])
1302 NewDMaskVal |= Bit;
1303 OrigLdStIdx++;
1304 }
1305 }
1306
1307 if (DMaskVal != NewDMaskVal)
1308 Args[DMaskIdx] = ConstantInt::get(Ty: DMask->getType(), V: NewDMaskVal);
1309 }
1310
1311 unsigned NewNumElts = DemandedElts.popcount();
1312 if (!NewNumElts)
1313 return PoisonValue::get(T: IIVTy);
1314
1315 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1316 if (DMaskIdx >= 0)
1317 II.setArgOperand(i: DMaskIdx, v: Args[DMaskIdx]);
1318 return nullptr;
1319 }
1320
1321 // Validate function argument and return types, extracting overloaded types
1322 // along the way.
1323 SmallVector<Type *, 6> OverloadTys;
1324 if (!Intrinsic::getIntrinsicSignature(F: II.getCalledFunction(), ArgTys&: OverloadTys))
1325 return nullptr;
1326
1327 Type *NewTy =
1328 (NewNumElts == 1) ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: NewNumElts);
1329 OverloadTys[0] = NewTy;
1330
1331 if (!IsLoad) {
1332 SmallVector<int, 8> EltMask;
1333 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1334 if (DemandedElts[OrigStoreIdx])
1335 EltMask.push_back(Elt: OrigStoreIdx);
1336
1337 if (NewNumElts == 1)
1338 Args[0] = IC.Builder.CreateExtractElement(Vec: II.getOperand(i_nocapture: 0), Idx: EltMask[0]);
1339 else
1340 Args[0] = IC.Builder.CreateShuffleVector(V: II.getOperand(i_nocapture: 0), Mask: EltMask);
1341 }
1342
1343 Function *NewIntrin = Intrinsic::getDeclaration(
1344 M: II.getModule(), id: II.getIntrinsicID(), Tys: OverloadTys);
1345 CallInst *NewCall = IC.Builder.CreateCall(Callee: NewIntrin, Args);
1346 NewCall->takeName(V: &II);
1347 NewCall->copyMetadata(SrcInst: II);
1348
1349 if (IsLoad) {
1350 if (NewNumElts == 1) {
1351 return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: IIVTy), NewElt: NewCall,
1352 Idx: DemandedElts.countr_zero());
1353 }
1354
1355 SmallVector<int, 8> EltMask;
1356 unsigned NewLoadIdx = 0;
1357 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1358 if (!!DemandedElts[OrigLoadIdx])
1359 EltMask.push_back(Elt: NewLoadIdx++);
1360 else
1361 EltMask.push_back(Elt: NewNumElts);
1362 }
1363
1364 auto *Shuffle = IC.Builder.CreateShuffleVector(V: NewCall, Mask: EltMask);
1365
1366 return Shuffle;
1367 }
1368
1369 return NewCall;
1370}
1371
1372std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1373 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1374 APInt &UndefElts2, APInt &UndefElts3,
1375 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1376 SimplifyAndSetOp) const {
1377 switch (II.getIntrinsicID()) {
1378 case Intrinsic::amdgcn_buffer_load:
1379 case Intrinsic::amdgcn_buffer_load_format:
1380 case Intrinsic::amdgcn_raw_buffer_load:
1381 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1382 case Intrinsic::amdgcn_raw_buffer_load_format:
1383 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1384 case Intrinsic::amdgcn_raw_tbuffer_load:
1385 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1386 case Intrinsic::amdgcn_s_buffer_load:
1387 case Intrinsic::amdgcn_struct_buffer_load:
1388 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1389 case Intrinsic::amdgcn_struct_buffer_load_format:
1390 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1391 case Intrinsic::amdgcn_struct_tbuffer_load:
1392 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1393 case Intrinsic::amdgcn_tbuffer_load:
1394 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1395 default: {
1396 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1397 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx: 0);
1398 }
1399 break;
1400 }
1401 }
1402 return std::nullopt;
1403}
1404
