//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUTargetTransformInfo.h"
19#include "GCNSubtarget.h"
20#include "llvm/ADT/FloatingPointMode.h"
21#include "llvm/IR/IntrinsicsAMDGPU.h"
22#include "llvm/Transforms/InstCombine/InstCombiner.h"
23#include <optional>
24
25using namespace llvm;
26using namespace llvm::PatternMatch;
27
28#define DEBUG_TYPE "AMDGPUtti"
29
30namespace {
31
32struct AMDGPUImageDMaskIntrinsic {
33 unsigned Intr;
34};
35
36#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
37#include "InstCombineTables.inc"
38
39} // end anonymous namespace
40
41// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
42//
43// A single NaN input is folded to minnum, so we rely on that folding for
44// handling NaNs.
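// For example (illustrative): with inputs 1.0, 3.0, 2.0 the triple maximum is
// 3.0, which compares equal to Src1, so the function returns
// maxnum(Src0, Src2) = 2.0, the median of the three values.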
45static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
46 const APFloat &Src2) {
47 APFloat Max3 = maxnum(A: maxnum(A: Src0, B: Src1), B: Src2);
48
49 APFloat::cmpResult Cmp0 = Max3.compare(RHS: Src0);
50 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
51 if (Cmp0 == APFloat::cmpEqual)
52 return maxnum(A: Src1, B: Src2);
53
54 APFloat::cmpResult Cmp1 = Max3.compare(RHS: Src1);
55 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
56 if (Cmp1 == APFloat::cmpEqual)
57 return maxnum(A: Src0, B: Src2);
58
59 return maxnum(A: Src0, B: Src1);
60}
61
62// Check if a value can be converted to a 16-bit value without losing
63// precision.
64// The value is expected to be either a float (IsFloat = true) or an unsigned
65// integer (IsFloat = false).
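// For example, a value defined by "fpext half %x to float" (IsFloat = true) or
// "zext i16 %x to i32" (IsFloat = false) is safe, as is a constant that fits in
// the narrow type (e.g. float 0.5 or i32 1234); an i32 constant such as 70000
// needs more than 16 bits and is rejected.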
66static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
67 Type *VTy = V.getType();
68 if (VTy->isHalfTy() || VTy->isIntegerTy(Bitwidth: 16)) {
69 // The value is already 16-bit, so we don't want to convert to 16-bit again!
70 return false;
71 }
72 if (IsFloat) {
73 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(Val: &V)) {
      // We need to check that casting the value down to half does not lose
      // precision.
76 APFloat FloatValue(ConstFloat->getValueAPF());
77 bool LosesInfo = true;
78 FloatValue.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmTowardZero,
79 losesInfo: &LosesInfo);
80 return !LosesInfo;
81 }
82 } else {
83 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(Val: &V)) {
      // We need to check that casting the value down to i16 does not lose
      // precision.
86 APInt IntValue(ConstInt->getValue());
87 return IntValue.getActiveBits() <= 16;
88 }
89 }
90
91 Value *CastSrc;
92 bool IsExt = IsFloat ? match(V: &V, P: m_FPExt(Op: PatternMatch::m_Value(V&: CastSrc)))
93 : match(V: &V, P: m_ZExt(Op: PatternMatch::m_Value(V&: CastSrc)));
94 if (IsExt) {
95 Type *CastSrcTy = CastSrc->getType();
96 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(Bitwidth: 16))
97 return true;
98 }
99
100 return false;
101}
102
103// Convert a value to 16-bit.
104static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
105 Type *VTy = V.getType();
106 if (isa<FPExtInst>(Val: &V) || isa<SExtInst>(Val: &V) || isa<ZExtInst>(Val: &V))
107 return cast<Instruction>(Val: &V)->getOperand(i: 0);
108 if (VTy->isIntegerTy())
109 return Builder.CreateIntCast(V: &V, DestTy: Type::getInt16Ty(C&: V.getContext()), isSigned: false);
110 if (VTy->isFloatingPointTy())
111 return Builder.CreateFPCast(V: &V, DestTy: Type::getHalfTy(C&: V.getContext()));
112
113 llvm_unreachable("Should never be called!");
114}
115
116/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
117/// modified arguments (based on OldIntr) and replaces InstToReplace with
118/// this newly created intrinsic call.
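/// For example, the _L to _LZ rewrite below calls this with a lambda that just
/// erases the LOD operand from Args, producing an otherwise identical call to
/// the _LZ intrinsic that takes the place of the original instruction.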
119static std::optional<Instruction *> modifyIntrinsicCall(
120 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
121 InstCombiner &IC,
122 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
123 Func) {
124 SmallVector<Type *, 4> ArgTys;
125 if (!Intrinsic::getIntrinsicSignature(F: OldIntr.getCalledFunction(), ArgTys))
126 return std::nullopt;
127
128 SmallVector<Value *, 8> Args(OldIntr.args());
129
130 // Modify arguments and types
131 Func(Args, ArgTys);
132
133 Function *I = Intrinsic::getDeclaration(M: OldIntr.getModule(), id: NewIntr, Tys: ArgTys);
134
135 CallInst *NewCall = IC.Builder.CreateCall(Callee: I, Args);
136 NewCall->takeName(V: &OldIntr);
137 NewCall->copyMetadata(SrcInst: OldIntr);
138 if (isa<FPMathOperator>(Val: NewCall))
139 NewCall->copyFastMathFlags(I: &OldIntr);
140
141 // Erase and replace uses
142 if (!InstToReplace.getType()->isVoidTy())
143 IC.replaceInstUsesWith(I&: InstToReplace, V: NewCall);
144
145 bool RemoveOldIntr = &OldIntr != &InstToReplace;
146
147 auto RetValue = IC.eraseInstFromFunction(I&: InstToReplace);
148 if (RemoveOldIntr)
149 IC.eraseInstFromFunction(I&: OldIntr);
150
151 return RetValue;
152}
153
154static std::optional<Instruction *>
155simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
156 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
157 IntrinsicInst &II, InstCombiner &IC) {
158 // Optimize _L to _LZ when _L is zero
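  // For example (illustrative, most operands elided):
  //   %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d...(..., float 0.0, ...)
  // becomes
  //   %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d...(...)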
159 if (const auto *LZMappingInfo =
160 AMDGPU::getMIMGLZMappingInfo(L: ImageDimIntr->BaseOpcode)) {
161 if (auto *ConstantLod =
162 dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->LodIndex))) {
163 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
164 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
165 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: LZMappingInfo->LZ,
166 Dim: ImageDimIntr->Dim);
167 return modifyIntrinsicCall(
168 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
169 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
170 });
171 }
172 }
173 }
174
175 // Optimize _mip away, when 'lod' is zero
176 if (const auto *MIPMappingInfo =
177 AMDGPU::getMIMGMIPMappingInfo(MIP: ImageDimIntr->BaseOpcode)) {
178 if (auto *ConstantMip =
179 dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->MipIndex))) {
180 if (ConstantMip->isZero()) {
181 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
182 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: MIPMappingInfo->NONMIP,
183 Dim: ImageDimIntr->Dim);
184 return modifyIntrinsicCall(
185 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
186 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
187 });
188 }
189 }
190 }
191
192 // Optimize _bias away when 'bias' is zero
193 if (const auto *BiasMappingInfo =
194 AMDGPU::getMIMGBiasMappingInfo(Bias: ImageDimIntr->BaseOpcode)) {
195 if (auto *ConstantBias =
196 dyn_cast<ConstantFP>(Val: II.getOperand(i_nocapture: ImageDimIntr->BiasIndex))) {
197 if (ConstantBias->isZero()) {
198 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
199 AMDGPU::getImageDimIntrinsicByBaseOpcode(BaseOpcode: BiasMappingInfo->NoBias,
200 Dim: ImageDimIntr->Dim);
201 return modifyIntrinsicCall(
202 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
203 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
204 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
205 });
206 }
207 }
208 }
209
210 // Optimize _offset away when 'offset' is zero
211 if (const auto *OffsetMappingInfo =
212 AMDGPU::getMIMGOffsetMappingInfo(Offset: ImageDimIntr->BaseOpcode)) {
213 if (auto *ConstantOffset =
214 dyn_cast<ConstantInt>(Val: II.getOperand(i_nocapture: ImageDimIntr->OffsetIndex))) {
215 if (ConstantOffset->isZero()) {
216 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
217 AMDGPU::getImageDimIntrinsicByBaseOpcode(
218 BaseOpcode: OffsetMappingInfo->NoOffset, Dim: ImageDimIntr->Dim);
219 return modifyIntrinsicCall(
220 OldIntr&: II, InstToReplace&: II, NewIntr: NewImageDimIntr->Intr, IC, Func: [&](auto &Args, auto &ArgTys) {
221 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
222 });
223 }
224 }
225 }
226
227 // Try to use D16
228 if (ST->hasD16Images()) {
229
230 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
231 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode);
232
233 if (BaseOpcode->HasD16) {
234
      // If the only use of the image intrinsic is an fptrunc to half, replace
      // both the fptrunc and the image intrinsic with an equivalent image
      // intrinsic that has the D16 flag set.
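      // For example (illustrative, most operands elided):
      //   %v = call <4 x float> @llvm.amdgcn.image.sample.2d...(...)
      //   %h = fptrunc <4 x float> %v to <4 x half>
      // becomes the single D16 call
      //   %h = call <4 x half> @llvm.amdgcn.image.sample.2d...(...)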
238 if (II.hasOneUse()) {
239 Instruction *User = II.user_back();
240
241 if (User->getOpcode() == Instruction::FPTrunc &&
242 User->getType()->getScalarType()->isHalfTy()) {
243
244 return modifyIntrinsicCall(OldIntr&: II, InstToReplace&: *User, NewIntr: ImageDimIntr->Intr, IC,
245 Func: [&](auto &Args, auto &ArgTys) {
246 // Change return type of image intrinsic.
247 // Set it to return type of fptrunc.
248 ArgTys[0] = User->getType();
249 });
250 }
251 }
252 }
253 }
254
255 // Try to use A16 or G16
256 if (!ST->hasA16() && !ST->hasG16())
257 return std::nullopt;
258
259 // Address is interpreted as float if the instruction has a sampler or as
260 // unsigned int if there is no sampler.
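  // For example, 32-bit coordinates produced by "fpext half" (sampler case) or
  // "zext i16" (no-sampler case) can be narrowed back to 16 bits; if only the
  // gradient operands qualify, just those are converted (G16).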
261 bool HasSampler =
262 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: ImageDimIntr->BaseOpcode)->Sampler;
263 bool FloatCoord = false;
264 // true means derivatives can be converted to 16 bit, coordinates not
265 bool OnlyDerivatives = false;
266
267 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
268 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
269 Value *Coord = II.getOperand(i_nocapture: OperandIndex);
270 // If the values are not derived from 16-bit values, we cannot optimize.
271 if (!canSafelyConvertTo16Bit(V&: *Coord, IsFloat: HasSampler)) {
272 if (OperandIndex < ImageDimIntr->CoordStart ||
273 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
274 return std::nullopt;
275 }
276 // All gradients can be converted, so convert only them
277 OnlyDerivatives = true;
278 break;
279 }
280
281 assert(OperandIndex == ImageDimIntr->GradientStart ||
282 FloatCoord == Coord->getType()->isFloatingPointTy());
283 FloatCoord = Coord->getType()->isFloatingPointTy();
284 }
285
286 if (!OnlyDerivatives && !ST->hasA16())
287 OnlyDerivatives = true; // Only supports G16
288
289 // Check if there is a bias parameter and if it can be converted to f16
290 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
291 Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
292 assert(HasSampler &&
293 "Only image instructions with a sampler can have a bias");
294 if (!canSafelyConvertTo16Bit(V&: *Bias, IsFloat: HasSampler))
295 OnlyDerivatives = true;
296 }
297
298 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
299 ImageDimIntr->CoordStart))
300 return std::nullopt;
301
302 Type *CoordType = FloatCoord ? Type::getHalfTy(C&: II.getContext())
303 : Type::getInt16Ty(C&: II.getContext());
304
305 return modifyIntrinsicCall(
306 OldIntr&: II, InstToReplace&: II, NewIntr: II.getIntrinsicID(), IC, Func: [&](auto &Args, auto &ArgTys) {
307 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
308 if (!OnlyDerivatives) {
309 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
310
311 // Change the bias type
312 if (ImageDimIntr->NumBiasArgs != 0)
313 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(C&: II.getContext());
314 }
315
316 unsigned EndIndex =
317 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
318 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
319 OperandIndex < EndIndex; OperandIndex++) {
320 Args[OperandIndex] =
321 convertTo16Bit(V&: *II.getOperand(i_nocapture: OperandIndex), Builder&: IC.Builder);
322 }
323
324 // Convert the bias
325 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
326 Value *Bias = II.getOperand(i_nocapture: ImageDimIntr->BiasIndex);
327 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(V&: *Bias, Builder&: IC.Builder);
328 }
329 });
330}
331
332bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
333 const Value *Op0, const Value *Op1,
334 InstCombiner &IC) const {
335 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
336 // infinity, gives +0.0. If we can prove we don't have one of the special
337 // cases then we can use a normal multiply instead.
338 // TODO: Create and use isKnownFiniteNonZero instead of just matching
339 // constants here.
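  // For example, if Op0 is the constant 2.0 (finite and non-zero), no operand
  // combination can hit the 0 * inf/NaN special case, so legacy and IEEE
  // multiplication agree for every value of Op1.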
340 if (match(V: Op0, P: PatternMatch::m_FiniteNonZero()) ||
341 match(V: Op1, P: PatternMatch::m_FiniteNonZero())) {
342 // One operand is not zero or infinity or NaN.
343 return true;
344 }
345
346 SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(I: &I);
347 if (isKnownNeverInfOrNaN(V: Op0, /*Depth=*/0, SQ) &&
348 isKnownNeverInfOrNaN(V: Op1, /*Depth=*/0, SQ)) {
349 // Neither operand is infinity or NaN.
350 return true;
351 }
352 return false;
353}
354
355/// Match an fpext from half to float, or a constant we can convert.
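/// For example, "fpext half %x to float" matches with FPExtSrc = %x, and the
/// float constant 0.5 (exactly representable in half) is rebuilt as half 0.5.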
356static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
357 if (match(V: Arg, P: m_OneUse(SubPattern: m_FPExt(Op: m_Value(V&: FPExtSrc)))))
358 return FPExtSrc->getType()->isHalfTy();
359
360 ConstantFP *CFP;
361 if (match(V: Arg, P: m_ConstantFP(C&: CFP))) {
362 bool LosesInfo;
363 APFloat Val(CFP->getValueAPF());
364 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
365 if (LosesInfo)
366 return false;
367
368 FPExtSrc = ConstantFP::get(Ty: Type::getHalfTy(C&: Arg->getContext()), V: Val);
369 return true;
370 }
371
372 return false;
373}
374
375// Trim all zero components from the end of the vector \p UseV and return
376// an appropriate bitset with known elements.
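// For example, a stored value <4 x float> <a, b, 0.0, 0.0> yields a demanded
// mask of 0b0011: the two trailing zero (or undef) lanes are dropped, while
// element 0 is always kept.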
377static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
378 Instruction *I) {
379 auto *VTy = cast<FixedVectorType>(Val: UseV->getType());
380 unsigned VWidth = VTy->getNumElements();
381 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
382
383 for (int i = VWidth - 1; i > 0; --i) {
384 auto *Elt = findScalarElement(V: UseV, EltNo: i);
385 if (!Elt)
386 break;
387
388 if (auto *ConstElt = dyn_cast<Constant>(Val: Elt)) {
389 if (!ConstElt->isNullValue() && !isa<UndefValue>(Val: Elt))
390 break;
391 } else {
392 break;
393 }
394
395 DemandedElts.clearBit(BitPosition: i);
396 }
397
398 return DemandedElts;
399}
400
401// Trim elements of the end of the vector \p V, if they are
402// equal to the first element of the vector.
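// For example, <4 x float> <x, y, x, x> yields a demanded mask of 0b0011, since
// the two trailing lanes repeat element 0 (element 0 itself is always kept).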
403static APInt defaultComponentBroadcast(Value *V) {
404 auto *VTy = cast<FixedVectorType>(Val: V->getType());
405 unsigned VWidth = VTy->getNumElements();
406 APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
407 Value *FirstComponent = findScalarElement(V, EltNo: 0);
408
409 SmallVector<int> ShuffleMask;
410 if (auto *SVI = dyn_cast<ShuffleVectorInst>(Val: V))
411 SVI->getShuffleMask(Result&: ShuffleMask);
412
413 for (int I = VWidth - 1; I > 0; --I) {
414 if (ShuffleMask.empty()) {
415 auto *Elt = findScalarElement(V, EltNo: I);
416 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Val: Elt)))
417 break;
418 } else {
419 // Detect identical elements in the shufflevector result, even though
420 // findScalarElement cannot tell us what that element is.
421 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
422 break;
423 }
424 DemandedElts.clearBit(BitPosition: I);
425 }
426
427 return DemandedElts;
428}
429
430static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
431 IntrinsicInst &II,
432 APInt DemandedElts,
433 int DMaskIdx = -1,
434 bool IsLoad = true);
435
436/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
437static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
438 return (SqrtOp->getType()->isFloatTy() &&
439 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
440 SqrtOp->getType()->isHalfTy();
441}
442
443std::optional<Instruction *>
444GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
445 Intrinsic::ID IID = II.getIntrinsicID();
446 switch (IID) {
447 case Intrinsic::amdgcn_rcp: {
448 Value *Src = II.getArgOperand(i: 0);
449
450 // TODO: Move to ConstantFolding/InstSimplify?
451 if (isa<UndefValue>(Val: Src)) {
452 Type *Ty = II.getType();
453 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
454 return IC.replaceInstUsesWith(I&: II, V: QNaN);
455 }
456
457 if (II.isStrictFP())
458 break;
459
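    // Constant fold the reciprocal of a constant, e.g. rcp(2.0) --> 0.5.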
460 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
461 const APFloat &ArgVal = C->getValueAPF();
462 APFloat Val(ArgVal.getSemantics(), 1);
463 Val.divide(RHS: ArgVal, RM: APFloat::rmNearestTiesToEven);
464
465 // This is more precise than the instruction may give.
466 //
467 // TODO: The instruction always flushes denormal results (except for f16),
468 // should this also?
469 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Val));
470 }
471
472 FastMathFlags FMF = cast<FPMathOperator>(Val&: II).getFastMathFlags();
473 if (!FMF.allowContract())
474 break;
475 auto *SrcCI = dyn_cast<IntrinsicInst>(Val: Src);
476 if (!SrcCI)
477 break;
478
479 auto IID = SrcCI->getIntrinsicID();
480 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
481 //
482 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
483 // relaxed.
484 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
485 const FPMathOperator *SqrtOp = cast<FPMathOperator>(Val: SrcCI);
486 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
487 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
488 break;
489
490 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
491 break;
492
493 Function *NewDecl = Intrinsic::getDeclaration(
494 M: SrcCI->getModule(), Intrinsic::id: amdgcn_rsq, Tys: {SrcCI->getType()});
495
496 InnerFMF |= FMF;
497 II.setFastMathFlags(InnerFMF);
498
499 II.setCalledFunction(NewDecl);
500 return IC.replaceOperand(I&: II, OpNum: 0, V: SrcCI->getArgOperand(i: 0));
501 }
502
503 break;
504 }
505 case Intrinsic::amdgcn_sqrt:
506 case Intrinsic::amdgcn_rsq: {
507 Value *Src = II.getArgOperand(i: 0);
508
509 // TODO: Move to ConstantFolding/InstSimplify?
510 if (isa<UndefValue>(Val: Src)) {
511 Type *Ty = II.getType();
512 auto *QNaN = ConstantFP::get(Ty, V: APFloat::getQNaN(Sem: Ty->getFltSemantics()));
513 return IC.replaceInstUsesWith(I&: II, V: QNaN);
514 }
515
516 // f16 amdgcn.sqrt is identical to regular sqrt.
517 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
518 Function *NewDecl = Intrinsic::getDeclaration(
519 M: II.getModule(), Intrinsic::id: sqrt, Tys: {II.getType()});
520 II.setCalledFunction(NewDecl);
521 return &II;
522 }
523
524 break;
525 }
526 case Intrinsic::amdgcn_log:
527 case Intrinsic::amdgcn_exp2: {
528 const bool IsLog = IID == Intrinsic::amdgcn_log;
529 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
530 Value *Src = II.getArgOperand(i: 0);
531 Type *Ty = II.getType();
532
533 if (isa<PoisonValue>(Val: Src))
534 return IC.replaceInstUsesWith(I&: II, V: Src);
535
536 if (IC.getSimplifyQuery().isUndefValue(V: Src))
537 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
538
539 if (ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
540 if (C->isInfinity()) {
541 // exp2(+inf) -> +inf
542 // log2(+inf) -> +inf
543 if (!C->isNegative())
544 return IC.replaceInstUsesWith(I&: II, V: C);
545
546 // exp2(-inf) -> 0
547 if (IsExp && C->isNegative())
548 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty));
549 }
550
551 if (II.isStrictFP())
552 break;
553
554 if (C->isNaN()) {
555 Constant *Quieted = ConstantFP::get(Ty, V: C->getValue().makeQuiet());
556 return IC.replaceInstUsesWith(I&: II, V: Quieted);
557 }
558
559 // f32 instruction doesn't handle denormals, f16 does.
560 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
561 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, Negative: true)
562 : ConstantFP::get(Ty, V: 1.0);
563 return IC.replaceInstUsesWith(I&: II, V: FoldedValue);
564 }
565
566 if (IsLog && C->isNegative())
567 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getNaN(Ty));
568
569 // TODO: Full constant folding matching hardware behavior.
570 }
571
572 break;
573 }
574 case Intrinsic::amdgcn_frexp_mant:
575 case Intrinsic::amdgcn_frexp_exp: {
576 Value *Src = II.getArgOperand(i: 0);
577 if (const ConstantFP *C = dyn_cast<ConstantFP>(Val: Src)) {
578 int Exp;
579 APFloat Significand =
580 frexp(X: C->getValueAPF(), Exp, RM: APFloat::rmNearestTiesToEven);
581
582 if (IID == Intrinsic::amdgcn_frexp_mant) {
583 return IC.replaceInstUsesWith(
584 I&: II, V: ConstantFP::get(Context&: II.getContext(), V: Significand));
585 }
586
587 // Match instruction special case behavior.
588 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
589 Exp = 0;
590
591 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: Exp));
592 }
593
594 if (isa<UndefValue>(Val: Src)) {
595 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
596 }
597
598 break;
599 }
600 case Intrinsic::amdgcn_class: {
601 Value *Src0 = II.getArgOperand(i: 0);
602 Value *Src1 = II.getArgOperand(i: 1);
603 const ConstantInt *CMask = dyn_cast<ConstantInt>(Val: Src1);
604 if (CMask) {
605 II.setCalledOperand(Intrinsic::getDeclaration(
606 M: II.getModule(), Intrinsic::id: is_fpclass, Tys: Src0->getType()));
607
608 // Clamp any excess bits, as they're illegal for the generic intrinsic.
609 II.setArgOperand(i: 1, v: ConstantInt::get(Ty: Src1->getType(),
610 V: CMask->getZExtValue() & fcAllFlags));
611 return &II;
612 }
613
614 // Propagate poison.
615 if (isa<PoisonValue>(Val: Src0) || isa<PoisonValue>(Val: Src1))
616 return IC.replaceInstUsesWith(I&: II, V: PoisonValue::get(T: II.getType()));
617
618 // llvm.amdgcn.class(_, undef) -> false
619 if (IC.getSimplifyQuery().isUndefValue(V: Src1))
620 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::get(Ty: II.getType(), V: false));
621
622 // llvm.amdgcn.class(undef, mask) -> mask != 0
623 if (IC.getSimplifyQuery().isUndefValue(V: Src0)) {
624 Value *CmpMask = IC.Builder.CreateICmpNE(
625 LHS: Src1, RHS: ConstantInt::getNullValue(Ty: Src1->getType()));
626 return IC.replaceInstUsesWith(I&: II, V: CmpMask);
627 }
628 break;
629 }
630 case Intrinsic::amdgcn_cvt_pkrtz: {
631 Value *Src0 = II.getArgOperand(i: 0);
632 Value *Src1 = II.getArgOperand(i: 1);
633 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
634 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
635 const fltSemantics &HalfSem =
636 II.getType()->getScalarType()->getFltSemantics();
637 bool LosesInfo;
638 APFloat Val0 = C0->getValueAPF();
639 APFloat Val1 = C1->getValueAPF();
640 Val0.convert(ToSemantics: HalfSem, RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
641 Val1.convert(ToSemantics: HalfSem, RM: APFloat::rmTowardZero, losesInfo: &LosesInfo);
642
643 Constant *Folded =
644 ConstantVector::get(V: {ConstantFP::get(Context&: II.getContext(), V: Val0),
645 ConstantFP::get(Context&: II.getContext(), V: Val1)});
646 return IC.replaceInstUsesWith(I&: II, V: Folded);
647 }
648 }
649
650 if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
651 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
652 }
653
654 break;
655 }
656 case Intrinsic::amdgcn_cvt_pknorm_i16:
657 case Intrinsic::amdgcn_cvt_pknorm_u16:
658 case Intrinsic::amdgcn_cvt_pk_i16:
659 case Intrinsic::amdgcn_cvt_pk_u16: {
660 Value *Src0 = II.getArgOperand(i: 0);
661 Value *Src1 = II.getArgOperand(i: 1);
662
663 if (isa<UndefValue>(Val: Src0) && isa<UndefValue>(Val: Src1)) {
664 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
665 }
666
667 break;
668 }
669 case Intrinsic::amdgcn_ubfe:
670 case Intrinsic::amdgcn_sbfe: {
671 // Decompose simple cases into standard shifts.
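    // For example, with i32 operands, ubfe(%x, 8, 8) extracts bits 8..15 and
    // becomes lshr(shl(%x, 16), 24); sbfe uses ashr instead so the extracted
    // field is sign-extended.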
672 Value *Src = II.getArgOperand(i: 0);
673 if (isa<UndefValue>(Val: Src)) {
674 return IC.replaceInstUsesWith(I&: II, V: Src);
675 }
676
677 unsigned Width;
678 Type *Ty = II.getType();
679 unsigned IntSize = Ty->getIntegerBitWidth();
680
681 ConstantInt *CWidth = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 2));
682 if (CWidth) {
683 Width = CWidth->getZExtValue();
684 if ((Width & (IntSize - 1)) == 0) {
685 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getNullValue(Ty));
686 }
687
688 // Hardware ignores high bits, so remove those.
689 if (Width >= IntSize) {
690 return IC.replaceOperand(
691 I&: II, OpNum: 2, V: ConstantInt::get(Ty: CWidth->getType(), V: Width & (IntSize - 1)));
692 }
693 }
694
695 unsigned Offset;
696 ConstantInt *COffset = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 1));
697 if (COffset) {
698 Offset = COffset->getZExtValue();
699 if (Offset >= IntSize) {
700 return IC.replaceOperand(
701 I&: II, OpNum: 1,
702 V: ConstantInt::get(Ty: COffset->getType(), V: Offset & (IntSize - 1)));
703 }
704 }
705
706 bool Signed = IID == Intrinsic::amdgcn_sbfe;
707
708 if (!CWidth || !COffset)
709 break;
710
711 // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr instructions would produce poison
    // values, since the shift amount would equal the bit size.
714 assert(Width != 0);
715
716 // TODO: This allows folding to undef when the hardware has specific
717 // behavior?
718 if (Offset + Width < IntSize) {
719 Value *Shl = IC.Builder.CreateShl(LHS: Src, RHS: IntSize - Offset - Width);
720 Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Shl, RHS: IntSize - Width)
721 : IC.Builder.CreateLShr(LHS: Shl, RHS: IntSize - Width);
722 RightShift->takeName(V: &II);
723 return IC.replaceInstUsesWith(I&: II, V: RightShift);
724 }
725
726 Value *RightShift = Signed ? IC.Builder.CreateAShr(LHS: Src, RHS: Offset)
727 : IC.Builder.CreateLShr(LHS: Src, RHS: Offset);
728
729 RightShift->takeName(V: &II);
730 return IC.replaceInstUsesWith(I&: II, V: RightShift);
731 }
732 case Intrinsic::amdgcn_exp:
733 case Intrinsic::amdgcn_exp_row:
734 case Intrinsic::amdgcn_exp_compr: {
735 ConstantInt *En = cast<ConstantInt>(Val: II.getArgOperand(i: 1));
736 unsigned EnBits = En->getZExtValue();
737 if (EnBits == 0xf)
738 break; // All inputs enabled.
739
740 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
741 bool Changed = false;
742 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
743 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
744 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
745 Value *Src = II.getArgOperand(i: I + 2);
746 if (!isa<UndefValue>(Val: Src)) {
747 IC.replaceOperand(I&: II, OpNum: I + 2, V: UndefValue::get(T: Src->getType()));
748 Changed = true;
749 }
750 }
751 }
752
753 if (Changed) {
754 return &II;
755 }
756
757 break;
758 }
759 case Intrinsic::amdgcn_fmed3: {
760 // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
761 // for the shader.
762
763 Value *Src0 = II.getArgOperand(i: 0);
764 Value *Src1 = II.getArgOperand(i: 1);
765 Value *Src2 = II.getArgOperand(i: 2);
766
767 // Checking for NaN before canonicalization provides better fidelity when
768 // mapping other operations onto fmed3 since the order of operands is
769 // unchanged.
770 Value *V = nullptr;
771 if (match(V: Src0, P: PatternMatch::m_NaN()) || isa<UndefValue>(Val: Src0)) {
772 V = IC.Builder.CreateMinNum(LHS: Src1, RHS: Src2);
773 } else if (match(V: Src1, P: PatternMatch::m_NaN()) || isa<UndefValue>(Val: Src1)) {
774 V = IC.Builder.CreateMinNum(LHS: Src0, RHS: Src2);
775 } else if (match(V: Src2, P: PatternMatch::m_NaN()) || isa<UndefValue>(Val: Src2)) {
776 V = IC.Builder.CreateMaxNum(LHS: Src0, RHS: Src1);
777 }
778
779 if (V) {
780 if (auto *CI = dyn_cast<CallInst>(Val: V)) {
781 CI->copyFastMathFlags(I: &II);
782 CI->takeName(V: &II);
783 }
784 return IC.replaceInstUsesWith(I&: II, V);
785 }
786
787 bool Swap = false;
788 // Canonicalize constants to RHS operands.
789 //
790 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
791 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
792 std::swap(a&: Src0, b&: Src1);
793 Swap = true;
794 }
795
796 if (isa<Constant>(Val: Src1) && !isa<Constant>(Val: Src2)) {
797 std::swap(a&: Src1, b&: Src2);
798 Swap = true;
799 }
800
801 if (isa<Constant>(Val: Src0) && !isa<Constant>(Val: Src1)) {
802 std::swap(a&: Src0, b&: Src1);
803 Swap = true;
804 }
805
806 if (Swap) {
807 II.setArgOperand(i: 0, v: Src0);
808 II.setArgOperand(i: 1, v: Src1);
809 II.setArgOperand(i: 2, v: Src2);
810 return &II;
811 }
812
813 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Val: Src0)) {
814 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Val: Src1)) {
815 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Val: Src2)) {
816 APFloat Result = fmed3AMDGCN(Src0: C0->getValueAPF(), Src1: C1->getValueAPF(),
817 Src2: C2->getValueAPF());
818 return IC.replaceInstUsesWith(
819 I&: II, V: ConstantFP::get(Context&: IC.Builder.getContext(), V: Result));
820 }
821 }
822 }
823
824 if (!ST->hasMed3_16())
825 break;
826
827 Value *X, *Y, *Z;
828
829 // Repeat floating-point width reduction done for minnum/maxnum.
830 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
831 if (matchFPExtFromF16(Arg: Src0, FPExtSrc&: X) && matchFPExtFromF16(Arg: Src1, FPExtSrc&: Y) &&
832 matchFPExtFromF16(Arg: Src2, FPExtSrc&: Z)) {
833 Value *NewCall = IC.Builder.CreateIntrinsic(ID: IID, Types: {X->getType()},
834 Args: {X, Y, Z}, FMFSource: &II, Name: II.getName());
835 return new FPExtInst(NewCall, II.getType());
836 }
837
838 break;
839 }
840 case Intrinsic::amdgcn_icmp:
841 case Intrinsic::amdgcn_fcmp: {
842 const ConstantInt *CC = cast<ConstantInt>(Val: II.getArgOperand(i: 2));
843 // Guard against invalid arguments.
844 int64_t CCVal = CC->getZExtValue();
845 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
846 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
847 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
848 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
849 CCVal > CmpInst::LAST_FCMP_PREDICATE)))
850 break;
851
852 Value *Src0 = II.getArgOperand(i: 0);
853 Value *Src1 = II.getArgOperand(i: 1);
854
855 if (auto *CSrc0 = dyn_cast<Constant>(Val: Src0)) {
856 if (auto *CSrc1 = dyn_cast<Constant>(Val: Src1)) {
857 Constant *CCmp = ConstantExpr::getCompare(pred: CCVal, C1: CSrc0, C2: CSrc1);
858 if (CCmp->isNullValue()) {
859 return IC.replaceInstUsesWith(
860 I&: II, V: IC.Builder.CreateSExt(V: CCmp, DestTy: II.getType()));
861 }
862
863 // The result of V_ICMP/V_FCMP assembly instructions (which this
864 // intrinsic exposes) is one bit per thread, masked with the EXEC
865 // register (which contains the bitmask of live threads). So a
866 // comparison that always returns true is the same as a read of the
867 // EXEC register.
868 Function *NewF = Intrinsic::getDeclaration(
869 M: II.getModule(), Intrinsic::id: read_register, Tys: II.getType());
870 Metadata *MDArgs[] = {MDString::get(Context&: II.getContext(), Str: "exec")};
871 MDNode *MD = MDNode::get(Context&: II.getContext(), MDs: MDArgs);
872 Value *Args[] = {MetadataAsValue::get(Context&: II.getContext(), MD)};
873 CallInst *NewCall = IC.Builder.CreateCall(Callee: NewF, Args);
874 NewCall->addFnAttr(Attribute::Convergent);
875 NewCall->takeName(V: &II);
876 return IC.replaceInstUsesWith(I&: II, V: NewCall);
877 }
878
879 // Canonicalize constants to RHS.
880 CmpInst::Predicate SwapPred =
881 CmpInst::getSwappedPredicate(pred: static_cast<CmpInst::Predicate>(CCVal));
882 II.setArgOperand(i: 0, v: Src1);
883 II.setArgOperand(i: 1, v: Src0);
884 II.setArgOperand(
885 i: 2, v: ConstantInt::get(Ty: CC->getType(), V: static_cast<int>(SwapPred)));
886 return &II;
887 }
888
889 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
890 break;
891
892 // Canonicalize compare eq with true value to compare != 0
893 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
894 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
895 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
896 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
897 Value *ExtSrc;
898 if (CCVal == CmpInst::ICMP_EQ &&
899 ((match(V: Src1, P: PatternMatch::m_One()) &&
900 match(V: Src0, P: m_ZExt(Op: PatternMatch::m_Value(V&: ExtSrc)))) ||
901 (match(V: Src1, P: PatternMatch::m_AllOnes()) &&
902 match(V: Src0, P: m_SExt(Op: PatternMatch::m_Value(V&: ExtSrc))))) &&
903 ExtSrc->getType()->isIntegerTy(Bitwidth: 1)) {
904 IC.replaceOperand(I&: II, OpNum: 1, V: ConstantInt::getNullValue(Ty: Src1->getType()));
905 IC.replaceOperand(I&: II, OpNum: 2,
906 V: ConstantInt::get(Ty: CC->getType(), V: CmpInst::ICMP_NE));
907 return &II;
908 }
909
910 CmpInst::Predicate SrcPred;
911 Value *SrcLHS;
912 Value *SrcRHS;
913
914 // Fold compare eq/ne with 0 from a compare result as the predicate to the
915 // intrinsic. The typical use is a wave vote function in the library, which
916 // will be fed from a user code condition compared with 0. Fold in the
917 // redundant compare.
918
919 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
920 // -> llvm.amdgcn.[if]cmp(a, b, pred)
921 //
922 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
923 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
924 if (match(V: Src1, P: PatternMatch::m_Zero()) &&
925 match(V: Src0, P: PatternMatch::m_ZExtOrSExt(
926 Op: m_Cmp(Pred&: SrcPred, L: PatternMatch::m_Value(V&: SrcLHS),
927 R: PatternMatch::m_Value(V&: SrcRHS))))) {
928 if (CCVal == CmpInst::ICMP_EQ)
929 SrcPred = CmpInst::getInversePredicate(pred: SrcPred);
930
931 Intrinsic::ID NewIID = CmpInst::isFPPredicate(P: SrcPred)
932 ? Intrinsic::amdgcn_fcmp
933 : Intrinsic::amdgcn_icmp;
934
935 Type *Ty = SrcLHS->getType();
936 if (auto *CmpType = dyn_cast<IntegerType>(Val: Ty)) {
937 // Promote to next legal integer type.
938 unsigned Width = CmpType->getBitWidth();
939 unsigned NewWidth = Width;
940
941 // Don't do anything for i1 comparisons.
942 if (Width == 1)
943 break;
944
945 if (Width <= 16)
946 NewWidth = 16;
947 else if (Width <= 32)
948 NewWidth = 32;
949 else if (Width <= 64)
950 NewWidth = 64;
951 else
952 break; // Can't handle this.
953
954 if (Width != NewWidth) {
955 IntegerType *CmpTy = IC.Builder.getIntNTy(N: NewWidth);
956 if (CmpInst::isSigned(predicate: SrcPred)) {
957 SrcLHS = IC.Builder.CreateSExt(V: SrcLHS, DestTy: CmpTy);
958 SrcRHS = IC.Builder.CreateSExt(V: SrcRHS, DestTy: CmpTy);
959 } else {
960 SrcLHS = IC.Builder.CreateZExt(V: SrcLHS, DestTy: CmpTy);
961 SrcRHS = IC.Builder.CreateZExt(V: SrcRHS, DestTy: CmpTy);
962 }
963 }
964 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
965 break;
966
967 Function *NewF = Intrinsic::getDeclaration(
968 M: II.getModule(), id: NewIID, Tys: {II.getType(), SrcLHS->getType()});
969 Value *Args[] = {SrcLHS, SrcRHS,
970 ConstantInt::get(Ty: CC->getType(), V: SrcPred)};
971 CallInst *NewCall = IC.Builder.CreateCall(Callee: NewF, Args);
972 NewCall->takeName(V: &II);
973 return IC.replaceInstUsesWith(I&: II, V: NewCall);
974 }
975
976 break;
977 }
978 case Intrinsic::amdgcn_mbcnt_hi: {
979 // exec_hi is all 0, so this is just a copy.
980 if (ST->isWave32())
981 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 1));
982 break;
983 }
984 case Intrinsic::amdgcn_ballot: {
985 if (auto *Src = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0))) {
986 if (Src->isZero()) {
987 // amdgcn.ballot(i1 0) is zero.
988 return IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
989 }
990 }
991 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
992 // %b64 = call i64 ballot.i64(...)
993 // =>
994 // %b32 = call i32 ballot.i32(...)
995 // %b64 = zext i32 %b32 to i64
996 Value *Call = IC.Builder.CreateZExt(
997 V: IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
998 {IC.Builder.getInt32Ty()},
999 {II.getArgOperand(i: 0)}),
1000 DestTy: II.getType());
1001 Call->takeName(V: &II);
1002 return IC.replaceInstUsesWith(I&: II, V: Call);
1003 }
1004 break;
1005 }
1006 case Intrinsic::amdgcn_wqm_vote: {
1007 // wqm_vote is identity when the argument is constant.
1008 if (!isa<Constant>(Val: II.getArgOperand(i: 0)))
1009 break;
1010
1011 return IC.replaceInstUsesWith(I&: II, V: II.getArgOperand(i: 0));
1012 }
1013 case Intrinsic::amdgcn_kill: {
1014 const ConstantInt *C = dyn_cast<ConstantInt>(Val: II.getArgOperand(i: 0));
1015 if (!C || !C->getZExtValue())
1016 break;
1017
1018 // amdgcn.kill(i1 1) is a no-op
1019 return IC.eraseInstFromFunction(I&: II);
1020 }
1021 case Intrinsic::amdgcn_update_dpp: {
1022 Value *Old = II.getArgOperand(i: 0);
1023
1024 auto *BC = cast<ConstantInt>(Val: II.getArgOperand(i: 5));
1025 auto *RM = cast<ConstantInt>(Val: II.getArgOperand(i: 3));
1026 auto *BM = cast<ConstantInt>(Val: II.getArgOperand(i: 4));
1027 if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1028 BM->getZExtValue() != 0xF || isa<UndefValue>(Val: Old))
1029 break;
1030
1031 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1032 return IC.replaceOperand(I&: II, OpNum: 0, V: UndefValue::get(T: Old->getType()));
1033 }
1034 case Intrinsic::amdgcn_permlane16:
1035 case Intrinsic::amdgcn_permlane16_var:
1036 case Intrinsic::amdgcn_permlanex16:
1037 case Intrinsic::amdgcn_permlanex16_var: {
1038 // Discard vdst_in if it's not going to be read.
1039 Value *VDstIn = II.getArgOperand(i: 0);
1040 if (isa<UndefValue>(Val: VDstIn))
1041 break;
1042
1043 // FetchInvalid operand idx.
1044 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1045 IID == Intrinsic::amdgcn_permlanex16)
1046 ? 4 /* for permlane16 and permlanex16 */
1047 : 3; /* for permlane16_var and permlanex16_var */
1048
1049 // BoundCtrl operand idx.
1050 // For permlane16 and permlanex16 it should be 5
1051 // For Permlane16_var and permlanex16_var it should be 4
1052 unsigned int BcIdx = FiIdx + 1;
1053
1054 ConstantInt *FetchInvalid = cast<ConstantInt>(Val: II.getArgOperand(i: FiIdx));
1055 ConstantInt *BoundCtrl = cast<ConstantInt>(Val: II.getArgOperand(i: BcIdx));
1056 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1057 break;
1058
1059 return IC.replaceOperand(I&: II, OpNum: 0, V: UndefValue::get(T: VDstIn->getType()));
1060 }
1061 case Intrinsic::amdgcn_permlane64:
1062 // A constant value is trivially uniform.
1063 if (Constant *C = dyn_cast<Constant>(Val: II.getArgOperand(i: 0))) {
1064 return IC.replaceInstUsesWith(I&: II, V: C);
1065 }
1066 break;
1067 case Intrinsic::amdgcn_readfirstlane:
1068 case Intrinsic::amdgcn_readlane: {
1069 // A constant value is trivially uniform.
1070 if (Constant *C = dyn_cast<Constant>(Val: II.getArgOperand(i: 0))) {
1071 return IC.replaceInstUsesWith(I&: II, V: C);
1072 }
1073
1074 // The rest of these may not be safe if the exec may not be the same between
1075 // the def and use.
1076 Value *Src = II.getArgOperand(i: 0);
1077 Instruction *SrcInst = dyn_cast<Instruction>(Val: Src);
1078 if (SrcInst && SrcInst->getParent() != II.getParent())
1079 break;
1080
1081 // readfirstlane (readfirstlane x) -> readfirstlane x
1082 // readlane (readfirstlane x), y -> readfirstlane x
1083 if (match(Src,
1084 PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
1085 return IC.replaceInstUsesWith(I&: II, V: Src);
1086 }
1087
1088 if (IID == Intrinsic::amdgcn_readfirstlane) {
1089 // readfirstlane (readlane x, y) -> readlane x, y
1090 if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
1091 return IC.replaceInstUsesWith(I&: II, V: Src);
1092 }
1093 } else {
1094 // readlane (readlane x, y), y -> readlane x, y
1095 if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
1096 PatternMatch::m_Value(),
1097 PatternMatch::m_Specific(II.getArgOperand(1))))) {
1098 return IC.replaceInstUsesWith(I&: II, V: Src);
1099 }
1100 }
1101
1102 break;
1103 }
1104 case Intrinsic::amdgcn_fmul_legacy: {
1105 Value *Op0 = II.getArgOperand(i: 0);
1106 Value *Op1 = II.getArgOperand(i: 1);
1107
1108 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1109 // infinity, gives +0.0.
1110 // TODO: Move to InstSimplify?
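    // For example, fmul_legacy(%x, -0.0) folds to +0.0 here regardless of %x,
    // matching the legacy hardware rule above.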
1111 if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
1112 match(V: Op1, P: PatternMatch::m_AnyZeroFP()))
1113 return IC.replaceInstUsesWith(I&: II, V: ConstantFP::getZero(Ty: II.getType()));
1114
1115 // If we can prove we don't have one of the special cases then we can use a
1116 // normal fmul instruction instead.
1117 if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
1118 auto *FMul = IC.Builder.CreateFMulFMF(L: Op0, R: Op1, FMFSource: &II);
1119 FMul->takeName(V: &II);
1120 return IC.replaceInstUsesWith(I&: II, V: FMul);
1121 }
1122 break;
1123 }
1124 case Intrinsic::amdgcn_fma_legacy: {
1125 Value *Op0 = II.getArgOperand(i: 0);
1126 Value *Op1 = II.getArgOperand(i: 1);
1127 Value *Op2 = II.getArgOperand(i: 2);
1128
1129 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1130 // infinity, gives +0.0.
1131 // TODO: Move to InstSimplify?
1132 if (match(V: Op0, P: PatternMatch::m_AnyZeroFP()) ||
1133 match(V: Op1, P: PatternMatch::m_AnyZeroFP())) {
1134 // It's tempting to just return Op2 here, but that would give the wrong
1135 // result if Op2 was -0.0.
1136 auto *Zero = ConstantFP::getZero(Ty: II.getType());
1137 auto *FAdd = IC.Builder.CreateFAddFMF(L: Zero, R: Op2, FMFSource: &II);
1138 FAdd->takeName(V: &II);
1139 return IC.replaceInstUsesWith(I&: II, V: FAdd);
1140 }
1141
1142 // If we can prove we don't have one of the special cases then we can use a
1143 // normal fma instead.
1144 if (canSimplifyLegacyMulToMul(I: II, Op0, Op1, IC)) {
1145 II.setCalledOperand(Intrinsic::getDeclaration(
1146 II.getModule(), Intrinsic::fma, II.getType()));
1147 return &II;
1148 }
1149 break;
1150 }
1151 case Intrinsic::amdgcn_is_shared:
1152 case Intrinsic::amdgcn_is_private: {
1153 if (isa<UndefValue>(Val: II.getArgOperand(i: 0)))
1154 return IC.replaceInstUsesWith(I&: II, V: UndefValue::get(T: II.getType()));
1155
1156 if (isa<ConstantPointerNull>(Val: II.getArgOperand(i: 0)))
1157 return IC.replaceInstUsesWith(I&: II, V: ConstantInt::getFalse(Ty: II.getType()));
1158 break;
1159 }
1160 case Intrinsic::amdgcn_buffer_store_format:
1161 case Intrinsic::amdgcn_raw_buffer_store_format:
1162 case Intrinsic::amdgcn_struct_buffer_store_format:
1163 case Intrinsic::amdgcn_raw_tbuffer_store:
1164 case Intrinsic::amdgcn_struct_tbuffer_store:
1165 case Intrinsic::amdgcn_tbuffer_store:
1166 case Intrinsic::amdgcn_image_store_1d:
1167 case Intrinsic::amdgcn_image_store_1darray:
1168 case Intrinsic::amdgcn_image_store_2d:
1169 case Intrinsic::amdgcn_image_store_2darray:
1170 case Intrinsic::amdgcn_image_store_2darraymsaa:
1171 case Intrinsic::amdgcn_image_store_2dmsaa:
1172 case Intrinsic::amdgcn_image_store_3d:
1173 case Intrinsic::amdgcn_image_store_cube:
1174 case Intrinsic::amdgcn_image_store_mip_1d:
1175 case Intrinsic::amdgcn_image_store_mip_1darray:
1176 case Intrinsic::amdgcn_image_store_mip_2d:
1177 case Intrinsic::amdgcn_image_store_mip_2darray:
1178 case Intrinsic::amdgcn_image_store_mip_3d:
1179 case Intrinsic::amdgcn_image_store_mip_cube: {
1180 if (!isa<FixedVectorType>(Val: II.getArgOperand(i: 0)->getType()))
1181 break;
1182
1183 APInt DemandedElts;
1184 if (ST->hasDefaultComponentBroadcast())
1185 DemandedElts = defaultComponentBroadcast(V: II.getArgOperand(i: 0));
1186 else if (ST->hasDefaultComponentZero())
1187 DemandedElts = trimTrailingZerosInVector(IC, UseV: II.getArgOperand(i: 0), I: &II);
1188 else
1189 break;
1190
1191 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1192 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1193 IsLoad: false)) {
1194 return IC.eraseInstFromFunction(I&: II);
1195 }
1196
1197 break;
1198 }
1199 }
1200 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1201 AMDGPU::getImageDimIntrinsicInfo(Intr: II.getIntrinsicID())) {
1202 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1203 }
1204 return std::nullopt;
1205}
1206
1207/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1208///
/// Simplifying an amdgcn image or buffer store intrinsic updates the definition
/// of the intrinsic's vector argument, rather than the uses of the result as is
/// done for image and buffer loads.
1212/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1213/// struct returns.
1214static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1215 IntrinsicInst &II,
1216 APInt DemandedElts,
1217 int DMaskIdx, bool IsLoad) {
1218
1219 auto *IIVTy = cast<FixedVectorType>(Val: IsLoad ? II.getType()
1220 : II.getOperand(i_nocapture: 0)->getType());
1221 unsigned VWidth = IIVTy->getNumElements();
1222 if (VWidth == 1)
1223 return nullptr;
1224 Type *EltTy = IIVTy->getElementType();
1225
1226 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1227 IC.Builder.SetInsertPoint(&II);
1228
1229 // Assume the arguments are unchanged and later override them, if needed.
1230 SmallVector<Value *, 16> Args(II.args());
1231
1232 if (DMaskIdx < 0) {
1233 // Buffer case.
1234
1235 const unsigned ActiveBits = DemandedElts.getActiveBits();
1236 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1237
1238 // Start assuming the prefix of elements is demanded, but possibly clear
1239 // some other bits if there are trailing zeros (unused components at front)
1240 // and update offset.
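    // For example, if only elements 2 and 3 of a <4 x float> raw.buffer.load
    // are used, the demanded mask becomes 0b1100, the load is shrunk to
    // <2 x float>, and 8 bytes (two unused leading components) are folded into
    // the offset operand.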
1241 DemandedElts = (1 << ActiveBits) - 1;
1242
1243 if (UnusedComponentsAtFront > 0) {
1244 static const unsigned InvalidOffsetIdx = 0xf;
1245
1246 unsigned OffsetIdx;
1247 switch (II.getIntrinsicID()) {
1248 case Intrinsic::amdgcn_raw_buffer_load:
1249 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1250 OffsetIdx = 1;
1251 break;
1252 case Intrinsic::amdgcn_s_buffer_load:
1253 // If resulting type is vec3, there is no point in trimming the
1254 // load with updated offset, as the vec3 would most likely be widened to
1255 // vec4 anyway during lowering.
1256 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1257 OffsetIdx = InvalidOffsetIdx;
1258 else
1259 OffsetIdx = 1;
1260 break;
1261 case Intrinsic::amdgcn_struct_buffer_load:
1262 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1263 OffsetIdx = 2;
1264 break;
1265 default:
1266 // TODO: handle tbuffer* intrinsics.
1267 OffsetIdx = InvalidOffsetIdx;
1268 break;
1269 }
1270
1271 if (OffsetIdx != InvalidOffsetIdx) {
1272 // Clear demanded bits and update the offset.
1273 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1274 auto *Offset = Args[OffsetIdx];
1275 unsigned SingleComponentSizeInBits =
1276 IC.getDataLayout().getTypeSizeInBits(Ty: EltTy);
1277 unsigned OffsetAdd =
1278 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1279 auto *OffsetAddVal = ConstantInt::get(Ty: Offset->getType(), V: OffsetAdd);
1280 Args[OffsetIdx] = IC.Builder.CreateAdd(LHS: Offset, RHS: OffsetAddVal);
1281 }
1282 }
1283 } else {
1284 // Image case.
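    // For example, a load with dmask = 0b0111 of which only result element 0
    // is used is rewritten to a scalar load with dmask = 0b0001, and the scalar
    // result is inserted back into a vector for the remaining users.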
1285
1286 ConstantInt *DMask = cast<ConstantInt>(Val: Args[DMaskIdx]);
1287 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1288
1289 // dmask 0 has special semantics, do not simplify.
1290 if (DMaskVal == 0)
1291 return nullptr;
1292
1293 // Mask off values that are undefined because the dmask doesn't cover them
1294 DemandedElts &= (1 << llvm::popcount(Value: DMaskVal)) - 1;
1295
1296 unsigned NewDMaskVal = 0;
1297 unsigned OrigLdStIdx = 0;
1298 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1299 const unsigned Bit = 1 << SrcIdx;
1300 if (!!(DMaskVal & Bit)) {
1301 if (!!DemandedElts[OrigLdStIdx])
1302 NewDMaskVal |= Bit;
1303 OrigLdStIdx++;
1304 }
1305 }
1306
1307 if (DMaskVal != NewDMaskVal)
1308 Args[DMaskIdx] = ConstantInt::get(Ty: DMask->getType(), V: NewDMaskVal);
1309 }
1310
1311 unsigned NewNumElts = DemandedElts.popcount();
1312 if (!NewNumElts)
1313 return PoisonValue::get(T: IIVTy);
1314
1315 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1316 if (DMaskIdx >= 0)
1317 II.setArgOperand(i: DMaskIdx, v: Args[DMaskIdx]);
1318 return nullptr;
1319 }
1320
1321 // Validate function argument and return types, extracting overloaded types
1322 // along the way.
1323 SmallVector<Type *, 6> OverloadTys;
1324 if (!Intrinsic::getIntrinsicSignature(F: II.getCalledFunction(), ArgTys&: OverloadTys))
1325 return nullptr;
1326
1327 Type *NewTy =
1328 (NewNumElts == 1) ? EltTy : FixedVectorType::get(ElementType: EltTy, NumElts: NewNumElts);
1329 OverloadTys[0] = NewTy;
1330
1331 if (!IsLoad) {
1332 SmallVector<int, 8> EltMask;
1333 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1334 if (DemandedElts[OrigStoreIdx])
1335 EltMask.push_back(Elt: OrigStoreIdx);
1336
1337 if (NewNumElts == 1)
1338 Args[0] = IC.Builder.CreateExtractElement(Vec: II.getOperand(i_nocapture: 0), Idx: EltMask[0]);
1339 else
1340 Args[0] = IC.Builder.CreateShuffleVector(V: II.getOperand(i_nocapture: 0), Mask: EltMask);
1341 }
1342
1343 Function *NewIntrin = Intrinsic::getDeclaration(
1344 M: II.getModule(), id: II.getIntrinsicID(), Tys: OverloadTys);
1345 CallInst *NewCall = IC.Builder.CreateCall(Callee: NewIntrin, Args);
1346 NewCall->takeName(V: &II);
1347 NewCall->copyMetadata(SrcInst: II);
1348
1349 if (IsLoad) {
1350 if (NewNumElts == 1) {
1351 return IC.Builder.CreateInsertElement(Vec: PoisonValue::get(T: IIVTy), NewElt: NewCall,
1352 Idx: DemandedElts.countr_zero());
1353 }
1354
1355 SmallVector<int, 8> EltMask;
1356 unsigned NewLoadIdx = 0;
1357 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1358 if (!!DemandedElts[OrigLoadIdx])
1359 EltMask.push_back(Elt: NewLoadIdx++);
1360 else
1361 EltMask.push_back(Elt: NewNumElts);
1362 }
1363
1364 auto *Shuffle = IC.Builder.CreateShuffleVector(V: NewCall, Mask: EltMask);
1365
1366 return Shuffle;
1367 }
1368
1369 return NewCall;
1370}
1371
1372std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1373 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1374 APInt &UndefElts2, APInt &UndefElts3,
1375 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1376 SimplifyAndSetOp) const {
1377 switch (II.getIntrinsicID()) {
1378 case Intrinsic::amdgcn_buffer_load:
1379 case Intrinsic::amdgcn_buffer_load_format:
1380 case Intrinsic::amdgcn_raw_buffer_load:
1381 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1382 case Intrinsic::amdgcn_raw_buffer_load_format:
1383 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1384 case Intrinsic::amdgcn_raw_tbuffer_load:
1385 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1386 case Intrinsic::amdgcn_s_buffer_load:
1387 case Intrinsic::amdgcn_struct_buffer_load:
1388 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1389 case Intrinsic::amdgcn_struct_buffer_load_format:
1390 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1391 case Intrinsic::amdgcn_struct_tbuffer_load:
1392 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1393 case Intrinsic::amdgcn_tbuffer_load:
1394 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1395 default: {
1396 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1397 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx: 0);
1398 }
1399 break;
1400 }
1401 }
1402 return std::nullopt;
1403}
1404
