//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

static cl::opt<bool> Widen16BitOps(
  "amdgpu-codegenprepare-widen-16-bit-ops",
  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool> UseMul24Intrin(
  "amdgpu-codegenprepare-mul24",
  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
  "amdgpu-codegenprepare-expand-div64",
  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
  "amdgpu-codegenprepare-disable-idiv-expansion",
  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;
  bool HasFP32Denormals = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting the
  /// result of the 32 bit 'bitreverse' intrinsic to the right with zero fill
  /// (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// an unsigned integer. Truncating to this size and then zero-extending to
  /// the original size will not change the value.
  unsigned numBitsUnsigned(Value *Op) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// a signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
  unsigned numBitsSigned(Value *Op) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
  /// This is done here in IR because SelectionDAG's known-bits analysis is
  /// per-block and can miss facts established in other blocks (such as an
  /// 'and' asserting which bits are known).
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Perform the same fold as the equivalently named function in DAGCombiner.
  /// Since we expand some divisions here, we need to perform this before the
  /// expansion obscures the pattern.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;

  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  int getDivNumBits(BinaryOperator &I,
                    Value *Num, Value *Den,
                    unsigned AtLeast, bool Signed) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;

  /// Check if a scalar load can be widened.
  ///
  /// \details Sub-dword, uniform loads from constant memory are widened to a
  /// full 32 bits and the result is truncated back, which allows a scalar load
  /// to be used instead of a vector load.
  ///
  /// \returns True if load \p I can be widened this way.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);
  bool visitXor(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();

    // FIXME: Division expansion needs to preserve the dominator tree.
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  if (!Widen16BitOps)
    return false;

  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

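// The promoted operands are sign or zero extended from a type of at most 16
// bits, which is what makes it safe to put wrap flags on the widened 32-bit
// operation in the cases below.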
// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op) const {
  return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
}

unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op) const {
  return ComputeMaxSignificantBits(Op, *DL, 0, AC);
}

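// Helpers to break a (possibly vector) value into its scalar elements and to
// rebuild the vector afterwards, so the mul24 replacement below can operate
// element-wise.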
static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  auto *VT = dyn_cast<FixedVectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder,
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (Values.size() == 1)
    return Values[0];

  Value *NewVal = UndefValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}

// Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and
// `RHS`. `NumBits` is the number of KnownBits of the result and `Size` is the
// width of the original destination.
static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                       unsigned Size, unsigned NumBits, bool IsSigned) {
  if (Size <= 32 || NumBits <= 32) {
    Intrinsic::ID ID =
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    return Builder.CreateIntrinsic(ID, {}, {LHS, RHS});
  }

  assert(NumBits <= 48);

  Intrinsic::ID LoID =
      IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
  Intrinsic::ID HiID =
      IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;

  Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
  Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});

  IntegerType *I64Ty = Builder.getInt64Ty();
  Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
  Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);

  return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
}

bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (DA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS)) <= 24) {
    IsSigned = false;

  } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
             (RHSBits = numBitsSigned(RHS)) <= 24) {
    IsSigned = true;

  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS, *RHS;
    if (IsSigned) {
      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
    } else {
      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    }

    Value *Result =
        getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);

    if (IsSigned) {
      ResultVals.push_back(
          Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
    } else {
      ResultVals.push_back(
          Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
    }
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  I.eraseFromParent();

  return true;
}

// Find a select instruction, which may have been cast. This is mostly to deal
// with cases where i16 selects were promoted here to i32.
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
  Cast = nullptr;
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
    return Sel;

  if ((Cast = dyn_cast<CastInst>(V))) {
    if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
      return Sel;
  }

  return nullptr;
}

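// Fold a binary operation with a constant operand into a one-use select (or
// cast of a select) whose arms are constants, by folding the constant into
// each arm, e.g. ((select c, 2, 8) * 4) becomes (select c, 8, 32).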
bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;

  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    SelOpNo = 1;
    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  if (CastOp) {
    if (!CastOp->hasOneUse())
      return false;
    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  Constant *FoldedT = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
  if (!FoldedT || isa<ConstantExpr>(FoldedT))
    return false;

  Constant *FoldedF = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
  if (!FoldedF || isa<ConstantExpr>(FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
                                          FoldedT, FoldedF);
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  BO.eraseFromParent();
  if (CastOp)
    CastOp->eraseFromParent();
  Sel->eraseFromParent();
  return true;
}

// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//               allowed with unsafe-fp-math or afn.
//
// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
                              bool RcpIsAccurate, IRBuilder<> &Builder,
                              Module *Mod) {

  if (!AllowInaccurateRcp && !RcpIsAccurate)
    return nullptr;

  Type *Ty = Den->getType();
  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    if (AllowInaccurateRcp || RcpIsAccurate) {
      if (CLHS->isExactlyValue(1.0)) {
        Function *Decl = Intrinsic::getDeclaration(
          Mod, Intrinsic::amdgcn_rcp, Ty);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation they have a worst-case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
        // use rcp as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        //       insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateCall(Decl, { Den });
      }

      // Same as for 1.0, but expand the sign out of the constant.
      if (CLHS->isExactlyValue(-1.0)) {
        Function *Decl = Intrinsic::getDeclaration(
          Mod, Intrinsic::amdgcn_rcp, Ty);

        // -1.0 / x -> rcp (fneg x)
        Value *FNeg = Builder.CreateFNeg(Den);
        return Builder.CreateCall(Decl, { FNeg });
      }
    }
  }

  if (AllowInaccurateRcp) {
    Function *Decl = Intrinsic::getDeclaration(
      Mod, Intrinsic::amdgcn_rcp, Ty);

    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    Value *Recip = Builder.CreateCall(Decl, { Den });
    return Builder.CreateFMul(Num, Recip);
  }
  return nullptr;
}

// optimize with fdiv.fast:
//
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
                                   bool HasDenormals, IRBuilder<> &Builder,
                                   Module *Mod) {
  // fdiv.fast can achieve 2.5 ULP accuracy.
  if (ReqdAccuracy < 2.5f)
    return nullptr;

  // Only have fdiv.fast for f32.
  Type *Ty = Den->getType();
  if (!Ty->isFloatTy())
    return nullptr;

  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
      NumIsOne = true;
  }

  // fdiv.fast does not support denormals, but 1.0/x is always fine to use.
  if (HasDenormals && !NumIsOne)
    return nullptr;

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
  return Builder.CreateCall(Decl, { Num, Den });
}

// Optimization is performed based on fpmath, fast-math flags, and the denormal
// mode to optimize fdiv with either rcp or fdiv.fast.
//
// With rcp:
//   1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//          allowed with unsafe-fp-math or afn.
//
//   a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
//
// With fdiv.fast:
//   a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
//   1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
// NOTE: rcp is the preference in cases where both are legal.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {

  Type *Ty = FDiv.getType()->getScalarType();

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen.
  if (Ty->isDoubleTy())
    return false;

  // No intrinsic for fdiv16 if target does not support f16.
  if (Ty->isHalfTy() && !ST->has16BitInsts())
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();

  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  const bool RcpIsAccurate =
      (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
      (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;
  if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      // Try rcp first.
      Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
                                      RcpIsAccurate, Builder, Mod);
      if (!NewElt) // Try fdiv.fast.
        NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
                                      HasFP32Denormals, Builder, Mod);
      if (!NewElt) // Keep the original.
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else { // Scalar FDiv.
    // Try rcp first.
    NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
                              Builder, Mod);
    if (!NewFDiv) { // Try fdiv.fast.
      NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
                                     Builder, Mod);
    }
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

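// Fold (xor (llvm.amdgcn.class x, mask), -1) into a single llvm.amdgcn.class
// call with the 10-bit class mask inverted (mask ^ 0x3ff), provided the
// intrinsic call has no other uses.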
bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
  // Match the Xor instruction, its type and its operands
  IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0));
  ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1));
  if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
    return visitBinaryOperator(I);

  // Check that the call is to the amdgcn.class intrinsic and that it has only
  // one use.
  if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
      !IntrinsicCall->hasOneUse())
    return visitBinaryOperator(I);

  // "Not" the second argument of the intrinsic call
  ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
  if (!Arg)
    return visitBinaryOperator(I);

  IntrinsicCall->setOperand(
      1, ConstantInt::get(Arg->getType(), Arg->getZExtValue() ^ 0x3ff));
  I.replaceAllUsesWith(IntrinsicCall);
  I.eraseFromParent();
  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsBool();
}

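// Multiply two 32-bit integers via a 64-bit multiply and return the {low,
// high} 32-bit halves of the product.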
static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}

/// Figure out how many bits are really needed for this division. \p AtLeast is
/// an optimization hint to bypass the second ComputeNumSignBits call if the
/// first one is insufficient. Returns -1 on failure.
int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
                                        Value *Num, Value *Den,
                                        unsigned AtLeast, bool IsSigned) const {
  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < AtLeast)
    return -1;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < AtLeast)
    return -1;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
  if (IsSigned)
    ++DivBits;
  return DivBits;
}

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
  if (DivBits == -1)
    return nullptr;
  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
}

Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
                                                BinaryOperator &I,
                                                Value *Num, Value *Den,
                                                unsigned DivBits,
                                                bool IsDiv, bool IsSigned) const {
  Type *I32Ty = Builder.getInt32Ty();
  Num = Builder.CreateTrunc(Num, I32Ty);
  Den = Builder.CreateTrunc(Den, I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
                                                Builder.getFloatTy());
  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  auto FMAD = !ST->hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(FMAD,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);
      Res = Builder.CreateAShr(Res, InRegBits);
    } else {
      ConstantInt *TruncMask
        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    }
  }

  return Res;
}

// Try to recognize special cases for which the DAG will emit better expansions
// than the general expansion we do here.

// TODO: It would be better to just directly handle those optimizations here.
bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
    BinaryOperator &I, Value *Num, Value *Den) const {
  if (Constant *C = dyn_cast<Constant>(Den)) {
    // Arbitrary constants get a better expansion as long as a wider mulhi is
    // legal.
    if (C->getType()->getScalarSizeInBits() <= 32)
      return true;

    // TODO: Sdiv check for not exact for some reason.

    // If there's no wider mulhi, there's only a better expansion for powers of
    // two.
    // TODO: Should really know for each vector element.
    if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
      return true;

    return false;
  }

  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
    // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&
        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
                               0, AC, &I, DT)) {
      return true;
    }
  }

  return false;
}

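// Materialize the 32-bit sign mask of V (all-ones if negative, zero
// otherwise), folding to a constant when the sign is statically known.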
static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
  // Check whether the sign can be determined statically.
  KnownBits Known = computeKnownBits(V, *DL);
  if (Known.isNegative())
    return Constant::getAllOnesValue(V->getType());
  if (Known.isNonNegative())
    return Constant::getNullValue(V->getType());
  return Builder.CreateAShr(V, Builder.getInt32(31));
}

Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I, Value *X,
                                            Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, X, Y))
    return nullptr;  // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      X = Builder.CreateSExt(X, I32Ty);
      Y = Builder.CreateSExt(Y, I32Ty);
    } else {
      X = Builder.CreateZExt(X, I32Ty);
      Y = Builder.CreateZExt(Y, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
                      Builder.CreateZExtOrTrunc(Res, Ty);
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);

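  // For signed operations, divide the absolute values (computed with the
  // usual xor/add-by-sign-mask trick) and re-apply the expected sign to the
  // result at the end.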
  Value *Sign = nullptr;
  if (IsSigned) {
    Value *SignX = getSign32(X, Builder, DL);
    Value *SignY = getSign32(Y, Builder, DL);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

    X = Builder.CreateAdd(X, SignX);
    Y = Builder.CreateAdd(Y, SignY);

    X = Builder.CreateXor(X, SignX);
    Y = Builder.CreateXor(Y, SignY);
  }

  // The algorithm here is based on ideas from "Software Integer Division", Tom
  // Rodeheffer, August 2008.
  //
  // unsigned udiv(unsigned x, unsigned y) {
  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
  //   // that this is a lower bound on inv(y), even if some of the calculations
  //   // round up.
  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
  //
  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
  //   // Empirically this is guaranteed to give a "two-y" lower bound on
  //   // inv(y).
  //   z += umulh(z, -y * z);
  //
  //   // Quotient/remainder estimate.
  //   unsigned q = umulh(x, z);
  //   unsigned r = x - q * y;
  //
  //   // Two rounds of quotient/remainder refinement.
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //
  //   return q;
  // }

  // Initial estimate of inv(y).
  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
  Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
  Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);

  // One round of UNR.
  Value *NegY = Builder.CreateSub(Zero, Y);
  Value *NegYZ = Builder.CreateMul(NegY, Z);
  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));

  // Quotient/remainder estimate.
  Value *Q = getMulHu(Builder, X, Z);
  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));

  // First quotient/remainder refinement.
  Value *Cond = Builder.CreateICmpUGE(R, Y);
  if (IsDiv)
    Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  // Second quotient/remainder refinement.
  Cond = Builder.CreateICmpUGE(R, Y);
  Value *Res;
  if (IsDiv)
    Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  else
    Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

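// Try to narrow a 64-bit division/remainder to 32 (or 24) bits when the
// operands are known to be small enough, then widen the result back. Returns
// nullptr when the operation should be left alone (it cannot be narrowed, or
// a better DAG expansion exists).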
Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
    return nullptr;  // Keep it for later optimization.

  Instruction::BinaryOps Opc = I.getOpcode();

  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits == -1)
    return nullptr;

  Value *Narrowed = nullptr;
  if (NumDivBits <= 24) {
    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
                                  IsDiv, IsSigned);
  } else if (NumDivBits <= 32) {
    Narrowed = expandDivRem32(Builder, I, Num, Den);
  }

  if (Narrowed) {
    return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
                      Builder.CreateZExt(Narrowed, Num->getType());
  }

  return nullptr;
}

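// Expand a 64-bit division or remainder using the generic IR expansion from
// Transforms/Utils/IntegerDivision; this introduces control flow.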
void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  // Do the general expansion.
  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
    expandDivisionUpTo64Bits(&I);
    return;
  }

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
    expandRemainderUpTo64Bits(&I);
    return;
  }

  llvm_unreachable("not a division");
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24-bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow and doesn't
            // return the new value. Just insert a scalar copy and defer
            // expanding it.
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
          }
        }

        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(&I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      Changed = true;
    }
  }

  return Changed;
}

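// When enabled via -amdgpu-codegenprepare-widen-constant-loads, widen
// sub-dword uniform loads from constant address spaces to full 32-bit scalar
// loads, truncating the result back to the original type (see
// canWidenScalarExtLoad).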
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
          mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTWP ? &DTWP->getDomTree() : nullptr;

  HasUnsafeFPMath = hasUnsafeFPMath(F);

  AMDGPU::SIModeRegisterDefaults Mode(F);
  HasFP32Denormals = Mode.allFP32Denormals();

  bool MadeChange = false;

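  // Iterate with explicitly maintained iterators because the visitors may
  // erase the current instruction, and 64-bit division expansion can split
  // the current block.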
  Function::iterator NextBB;
  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
    BasicBlock *BB = &*FI;
    NextBB = std::next(FI);

    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
         I = Next) {
      Next = std::next(I);

      MadeChange |= visit(*I);

      if (Next != E) { // Control flow changed
        BasicBlock *NextInstBB = Next->getParent();
        if (NextInstBB != BB) {
          BB = NextInstBB;
          E = BB->end();
          FE = F.end();
        }
      }
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
