//===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass converts selects to conditional jumps when profitable.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/SelectOptimize.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/ScaledNumber.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <memory>
#include <queue>
#include <stack>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "select-optimize"

STATISTIC(NumSelectOptAnalyzed,
          "Number of select groups considered for conversion to branch");
STATISTIC(NumSelectConvertedExpColdOperand,
          "Number of select groups converted due to expensive cold operand");
STATISTIC(NumSelectConvertedHighPred,
          "Number of select groups converted due to high-predictability");
STATISTIC(NumSelectUnPred,
          "Number of select groups not converted due to unpredictability");
STATISTIC(NumSelectColdBB,
          "Number of select groups not converted due to cold basic block");
STATISTIC(NumSelectConvertedLoop,
          "Number of select groups converted due to loop-level analysis");
STATISTIC(NumSelectsConverted, "Number of selects converted");

static cl::opt<unsigned> ColdOperandThreshold(
    "cold-operand-threshold",
    cl::desc("Maximum frequency of path for an operand to be considered cold."),
    cl::init(20), cl::Hidden);

static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
    "cold-operand-max-cost-multiplier",
    cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
             "slice of a cold operand to be considered inexpensive."),
    cl::init(1), cl::Hidden);

static cl::opt<unsigned>
    GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
                          cl::desc("Gradient gain threshold (%)."),
                          cl::init(25), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<unsigned> GainRelativeThreshold(
    "select-opti-loop-relative-gain-threshold",
    cl::desc(
        "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
    cl::init(8), cl::Hidden);

static cl::opt<unsigned> MispredictDefaultRate(
    "mispredict-default-rate", cl::Hidden, cl::init(25),
    cl::desc("Default mispredict rate (initialized to 25%)."));

static cl::opt<bool>
    DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
                               cl::init(false),
                               cl::desc("Disable loop-level heuristics."));

namespace {

class SelectOptimizeImpl {
  const TargetMachine *TM = nullptr;
  const TargetSubtargetInfo *TSI = nullptr;
  const TargetLowering *TLI = nullptr;
  const TargetTransformInfo *TTI = nullptr;
  const LoopInfo *LI = nullptr;
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI = nullptr;
  OptimizationRemarkEmitter *ORE = nullptr;
  TargetSchedModel TSchedModel;

public:
  SelectOptimizeImpl() = default;
  SelectOptimizeImpl(const TargetMachine *TM) : TM(TM) {}
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
  bool runOnFunction(Function &F, Pass &P);

  using Scaled64 = ScaledNumber<uint64_t>;

  struct CostInfo {
    /// Predicated cost (with selects as conditional moves).
    Scaled64 PredCost;
    /// Non-predicated cost (with selects converted to branches).
    Scaled64 NonPredCost;
  };

  /// SelectLike is an abstraction over SelectInst and other operations that
  /// can act like selects. For example Or(Zext(icmp), X) can be treated like
  /// select(icmp, X|1, X).
  class SelectLike {
    SelectLike(Instruction *I) : I(I) {}

    Instruction *I;

  public:
    /// Match a select or select-like instruction, returning a SelectLike.
    static SelectLike match(Instruction *I) {
      // Select instructions are what we are usually looking for.
      if (isa<SelectInst>(I))
        return SelectLike(I);

      // An Or(zext(i1 X), Y) can also be treated like a select, with condition
      // X and values Y|1 and Y.
      Value *X;
      if (PatternMatch::match(
              I, m_c_Or(m_OneUse(m_ZExt(m_Value(X))), m_Value())) &&
          X->getType()->isIntegerTy(1))
        return SelectLike(I);

      return SelectLike(nullptr);
    }

    bool isValid() { return I; }
    operator bool() { return isValid(); }

    Instruction *getI() { return I; }
    const Instruction *getI() const { return I; }

    Type *getType() const { return I->getType(); }

    /// Return the condition for the SelectLike instruction. For example, the
    /// condition of a select, or `c` in `or(zext(c), x)`.
    Value *getCondition() const {
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->getCondition();
      // Or(zext) case
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        Value *X;
        if (PatternMatch::match(BO->getOperand(0),
                                m_OneUse(m_ZExt(m_Value(X)))))
          return X;
        if (PatternMatch::match(BO->getOperand(1),
                                m_OneUse(m_ZExt(m_Value(X)))))
          return X;
      }

      llvm_unreachable("Unhandled case in getCondition");
    }

    /// Return the true value for the SelectLike instruction. Note this may not
    /// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
    /// the true value would be `or(x,1)`. As this value does not exist, nullptr
    /// is returned.
    Value *getTrueValue() const {
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->getTrueValue();
      // Or(zext) case - the true value is `or(x, 1)`, which does not yet
      // exist, so return nullptr.
      if (isa<BinaryOperator>(I))
        return nullptr;

      llvm_unreachable("Unhandled case in getTrueValue");
    }

    /// Return the false value for the SelectLike instruction. For example, the
    /// getFalseValue of a select, or `x` in `or(zext(c), x)` (which is
    /// `select(c, x|1, x)`).
    Value *getFalseValue() const {
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->getFalseValue();
      // Or(zext) case - return the operand which is not the zext.
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        Value *X;
        if (PatternMatch::match(BO->getOperand(0),
                                m_OneUse(m_ZExt(m_Value(X)))))
          return BO->getOperand(1);
        if (PatternMatch::match(BO->getOperand(1),
                                m_OneUse(m_ZExt(m_Value(X)))))
          return BO->getOperand(0);
      }

      llvm_unreachable("Unhandled case in getFalseValue");
    }

    /// Return the NonPredCost cost of the true op, given the costs in
    /// InstCostMap. This may need to be generated for select-like instructions.
    Scaled64 getTrueOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
                           const TargetTransformInfo *TTI) {
      if (auto *Sel = dyn_cast<SelectInst>(I))
        if (auto *I = dyn_cast<Instruction>(Sel->getTrueValue()))
          return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
                                         : Scaled64::getZero();

      // Or case - add the cost of an extra Or to the cost of the False case.
      if (isa<BinaryOperator>(I))
        if (auto I = dyn_cast<Instruction>(getFalseValue()))
          if (InstCostMap.contains(I)) {
            InstructionCost OrCost = TTI->getArithmeticInstrCost(
                Instruction::Or, I->getType(),
                TargetTransformInfo::TCK_Latency,
                {TargetTransformInfo::OK_AnyValue,
                 TargetTransformInfo::OP_None},
                {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2});
            return InstCostMap[I].NonPredCost +
                   Scaled64::get(*OrCost.getValue());
          }

      return Scaled64::getZero();
    }

    /// Return the NonPredCost cost of the false op, given the costs in
    /// InstCostMap. This may need to be generated for select-like instructions.
    Scaled64
    getFalseOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
                   const TargetTransformInfo *TTI) {
      if (auto *Sel = dyn_cast<SelectInst>(I))
        if (auto *I = dyn_cast<Instruction>(Sel->getFalseValue()))
          return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
                                         : Scaled64::getZero();

      // Or case - return the cost of the false case.
      if (isa<BinaryOperator>(I))
        if (auto I = dyn_cast<Instruction>(getFalseValue()))
          if (InstCostMap.contains(I))
            return InstCostMap[I].NonPredCost;

      return Scaled64::getZero();
    }
  };

private:
  // Select groups consist of consecutive select instructions with the same
  // condition.
  using SelectGroup = SmallVector<SelectLike, 2>;
  using SelectGroups = SmallVector<SelectGroup, 2>;

  // Converts select instructions of a function to conditional jumps when
  // deemed profitable. Returns true if at least one select was converted.
  bool optimizeSelects(Function &F);

  // Heuristics for determining which select instructions can be profitably
  // converted to branches. Separate heuristics for selects in inner-most loops
  // and the rest of code regions (base heuristics for non-inner-most loop
  // regions).
  void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
  void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);

  // Converts to branches the select groups that were deemed
  // profitable-to-convert.
  void convertProfitableSIGroups(SelectGroups &ProfSIGroups);

  // Splits selects of a given basic block into select groups.
  void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);

  // Determines for which select groups it is profitable to convert to branches
  // (base and inner-most-loop heuristics).
  void findProfitableSIGroupsBase(SelectGroups &SIGroups,
                                  SelectGroups &ProfSIGroups);
  void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
                                        SelectGroups &ProfSIGroups);

  // Determines if a select group should be converted to a branch (base
  // heuristics).
  bool isConvertToBranchProfitableBase(const SelectGroup &ASI);

  // Returns true if there are expensive instructions in the cold value
  // operand's (if any) dependence slice of any of the selects of the given
  // group.
  bool hasExpensiveColdOperand(const SelectGroup &ASI);

  // For a given source instruction, collect its backwards dependence slice
  // consisting of instructions exclusively computed for producing the operands
  // of the source instruction.
  void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
                             Instruction *SI, bool ForSinking = false);

  // Returns true if the condition of the select is highly predictable.
  bool isSelectHighlyPredictable(const SelectLike SI);

  // Loop-level checks to determine if a non-predicated version (with branches)
  // of the given loop is more profitable than its predicated version.
  bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);

  // Computes instruction and loop-critical-path costs for both the predicated
  // and non-predicated version of the given loop.
  bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
                        DenseMap<const Instruction *, CostInfo> &InstCostMap,
                        CostInfo *LoopCost);

  // Returns a map from instruction to SelectLike for all the select
  // instructions in the given select groups.
  SmallDenseMap<const Instruction *, SelectLike, 2>
  getSImap(const SelectGroups &SIGroups);

  // Returns the latency cost of a given instruction.
  std::optional<uint64_t> computeInstCost(const Instruction *I);

  // Returns the misprediction cost of a given select when converted to branch.
  Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost);

  // Returns the cost of a branch when the prediction is correct.
  Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
                                const SelectLike SI);

  // Returns true if the target architecture supports lowering a given select.
  bool isSelectKindSupported(const SelectLike SI);
};

class SelectOptimize : public FunctionPass {
  SelectOptimizeImpl Impl;

public:
  static char ID;

  SelectOptimize() : FunctionPass(ID) {
    initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    return Impl.runOnFunction(F, *this);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
  }
};

} // namespace

PreservedAnalyses SelectOptimizePass::run(Function &F,
                                          FunctionAnalysisManager &FAM) {
  SelectOptimizeImpl Impl(TM);
  return Impl.run(F, FAM);
}

char SelectOptimize::ID = 0;

INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                    false)

FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }

PreservedAnalyses SelectOptimizeImpl::run(Function &F,
                                          FunctionAnalysisManager &FAM) {
  TSI = TM->getSubtargetImpl(F);
  TLI = TSI->getTargetLowering();

  // If none of the select types are supported then skip this pass.
  // This is an optimization pass. Legality issues will be handled by
  // instruction selection.
  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
      !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
      !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
    return PreservedAnalyses::all();

  TTI = &FAM.getResult<TargetIRAnalysis>(F);
  if (!TTI->enableSelectOptimize())
    return PreservedAnalyses::all();

  PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)
            .getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  assert(PSI && "This pass requires module analysis pass `profile-summary`!");
  BFI = &FAM.getResult<BlockFrequencyAnalysis>(F);

  // When optimizing for size, selects are preferable over branches.
  if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
    return PreservedAnalyses::all();

  LI = &FAM.getResult<LoopAnalysis>(F);
  ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  TSchedModel.init(TSI);

  bool Changed = optimizeSelects(F);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) {
  TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  TSI = TM->getSubtargetImpl(F);
  TLI = TSI->getTargetLowering();

  // If none of the select types are supported then skip this pass.
  // This is an optimization pass. Legality issues will be handled by
  // instruction selection.
  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
      !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
      !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
    return false;

  TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  if (!TTI->enableSelectOptimize())
    return false;

  LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
  PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
  TSchedModel.init(TSI);

  // When optimizing for size, selects are preferable over branches.
  if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
    return false;

  return optimizeSelects(F);
}

bool SelectOptimizeImpl::optimizeSelects(Function &F) {
  // Determine for which select groups it is profitable to convert to branches.
  SelectGroups ProfSIGroups;
  // Base heuristics apply only to non-loops and outer loops.
  optimizeSelectsBase(F, ProfSIGroups);
  // Separate heuristics for inner-most loops.
  optimizeSelectsInnerLoops(F, ProfSIGroups);

  // Convert to branches the select groups that were deemed
  // profitable-to-convert.
  convertProfitableSIGroups(ProfSIGroups);

  // Code modified if at least one select group was converted.
  return !ProfSIGroups.empty();
}

void SelectOptimizeImpl::optimizeSelectsBase(Function &F,
                                             SelectGroups &ProfSIGroups) {
  // Collect all the select groups.
  SelectGroups SIGroups;
  for (BasicBlock &BB : F) {
    // Base heuristics apply only to non-loops and outer loops.
    Loop *L = LI->getLoopFor(&BB);
    if (L && L->isInnermost())
      continue;
    collectSelectGroups(BB, SIGroups);
  }

  // Determine for which select groups it is profitable to convert to branches.
  findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
}

void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F,
                                                   SelectGroups &ProfSIGroups) {
  SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
  // Need to check size on each iteration as we accumulate child loops.
  for (unsigned long i = 0; i < Loops.size(); ++i)
    for (Loop *ChildL : Loops[i]->getSubLoops())
      Loops.push_back(ChildL);

  for (Loop *L : Loops) {
    if (!L->isInnermost())
      continue;

    SelectGroups SIGroups;
    for (BasicBlock *BB : L->getBlocks())
      collectSelectGroups(*BB, SIGroups);

    findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
  }
}

/// If \p isTrue is true, return the true value of \p SI, otherwise return the
/// false value of \p SI. If the true/false value of \p SI is defined by any
/// select instructions in \p Selects, look through the defining select
/// instruction until the true/false value is not defined in \p Selects.
static Value *
getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue,
                    const SmallPtrSet<const Instruction *, 2> &Selects,
                    IRBuilder<> &IB) {
  Value *V = nullptr;
  for (SelectInst *DefSI = dyn_cast<SelectInst>(SI.getI());
       DefSI != nullptr && Selects.count(DefSI);
       DefSI = dyn_cast<SelectInst>(V)) {
    assert(DefSI->getCondition() == SI.getCondition() &&
           "The condition of DefSI does not match with SI");
    V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
  }

  if (isa<BinaryOperator>(SI.getI())) {
    assert(SI.getI()->getOpcode() == Instruction::Or &&
           "Only currently handling Or instructions.");
    V = SI.getFalseValue();
    if (isTrue)
      V = IB.CreateOr(V, ConstantInt::get(V->getType(), 1));
  }

  assert(V && "Failed to get select true/false value");
  return V;
}

void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
  for (SelectGroup &ASI : ProfSIGroups) {
    // The code transformation here is a modified version of the sinking
    // transformation in CodeGenPrepare::optimizeSelectInst with a more
    // aggressive strategy of which instructions to sink.
    //
    // TODO: eliminate the redundancy of logic transforming selects to branches
    // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
    // selects for all cases (with and without profile information).

    // Transform a sequence like this:
    //    start:
    //       %cmp = cmp uge i32 %a, %b
    //       %sel = select i1 %cmp, i32 %c, i32 %d
    //
    // Into:
    //    start:
    //       %cmp = cmp uge i32 %a, %b
    //       %cmp.frozen = freeze %cmp
    //       br i1 %cmp.frozen, label %select.true, label %select.false
    //    select.true:
    //       br label %select.end
    //    select.false:
    //       br label %select.end
    //    select.end:
    //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
    //
    // %cmp should be frozen, otherwise it may introduce undefined behavior.
    // In addition, we may sink instructions that produce %c or %d into the
    // destination(s) of the new branch.
    // If the true or false blocks do not contain a sunken instruction, that
    // block and its branch may be optimized away. In that case, one side of
    // the first branch will point directly to select.end, and the
    // corresponding PHI predecessor block will be the start block.

    // Find all the instructions that can be soundly sunk to the true/false
    // blocks. These are instructions that are computed solely for producing
    // the operands of the select instructions in the group and can be sunk
    // without breaking the semantics of the LLVM IR (e.g., cannot sink
    // instructions with side effects).
    SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
    typedef std::stack<Instruction *>::size_type StackSizeType;
    StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
    for (SelectLike SI : ASI) {
      // For each select, compute the sinkable dependence chains of the true
      // and false operands.
      if (auto *TI = dyn_cast_or_null<Instruction>(SI.getTrueValue())) {
        std::stack<Instruction *> TrueSlice;
        getExclBackwardsSlice(TI, TrueSlice, SI.getI(), true);
        maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
        TrueSlices.push_back(TrueSlice);
      }
      if (auto *FI = dyn_cast_or_null<Instruction>(SI.getFalseValue())) {
        if (isa<SelectInst>(SI.getI()) || !FI->hasOneUse()) {
          std::stack<Instruction *> FalseSlice;
          getExclBackwardsSlice(FI, FalseSlice, SI.getI(), true);
          maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
          FalseSlices.push_back(FalseSlice);
        }
      }
    }
    // In the case of multiple select instructions in the same group, the order
    // of non-dependent instructions (instructions of different dependence
    // slices) in the true/false blocks appears to affect performance.
    // Interleaving the slices seems to experimentally be the optimal approach.
    // This interleaving scheduling allows for more ILP (with the natural
    // downside of slightly increasing register pressure) compared to a simple
    // ordering of one whole chain after another. One would expect that this
    // ordering would not matter since the scheduling in the backend of the
    // compiler would take care of it, but apparently the scheduler fails to
    // deliver optimal ILP with a naive ordering here.
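    // As an illustration of the interleaving below: for two slices [a1, a2]
    // and [b1, b2] (each already in def-before-use order once popped), the
    // sunk order becomes a1, b1, a2, b2 rather than a1, a2, b1, b2.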
    SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
    for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
      for (auto &S : TrueSlices) {
        if (!S.empty()) {
          TrueSlicesInterleaved.push_back(S.top());
          S.pop();
        }
      }
    }
    for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
      for (auto &S : FalseSlices) {
        if (!S.empty()) {
          FalseSlicesInterleaved.push_back(S.top());
          S.pop();
        }
      }
    }

    // We split the block containing the select(s) into two blocks.
    SelectLike SI = ASI.front();
    SelectLike LastSI = ASI.back();
    BasicBlock *StartBlock = SI.getI()->getParent();
    BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI()));
    // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With
    // RemoveDIs turned on, SplitPt would instead point to the next
    // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs,
    // tell splitBasicBlock that we want to include any DbgVariableRecords
    // attached to SplitPt in the splice.
    SplitPt.setHeadBit(true);
    BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
    BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
    // Delete the unconditional branch that was just created by the split.
    StartBlock->getTerminator()->eraseFromParent();

    // Move any debug/pseudo instructions that were in-between the select
    // group to the newly-created end block.
    SmallVector<Instruction *, 2> DebugPseudoINS;
    auto DIt = SI.getI()->getIterator();
    while (&*DIt != LastSI.getI()) {
      if (DIt->isDebugOrPseudoInst())
        DebugPseudoINS.push_back(&*DIt);
      DIt++;
    }
    for (auto *DI : DebugPseudoINS) {
      DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
    }

    // Duplicate implementation for DbgRecords, the non-instruction debug-info
    // format. Helper lambda for moving DbgRecords to the end block.
    auto TransferDbgRecords = [&](Instruction &I) {
      for (auto &DbgRecord :
           llvm::make_early_inc_range(I.getDbgRecordRange())) {
        DbgRecord.removeFromParent();
        EndBlock->insertDbgRecordBefore(&DbgRecord,
                                        EndBlock->getFirstInsertionPt());
      }
    };

    // Iterate over all instructions in between SI and LastSI, not including
    // SI itself. These are all the variable assignments that happen "in the
    // middle" of the select group.
    auto R = make_range(std::next(SI.getI()->getIterator()),
                        std::next(LastSI.getI()->getIterator()));
    llvm::for_each(R, TransferDbgRecords);

    // These are the new basic blocks for the conditional branch.
    // At least one will become an actual new basic block.
    BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
    BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
    if (!TrueSlicesInterleaved.empty()) {
      TrueBlock = BasicBlock::Create(EndBlock->getContext(), "select.true.sink",
                                     EndBlock->getParent(), EndBlock);
      TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
      TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
      for (Instruction *TrueInst : TrueSlicesInterleaved)
        TrueInst->moveBefore(TrueBranch);
    }
    if (!FalseSlicesInterleaved.empty()) {
      FalseBlock =
          BasicBlock::Create(EndBlock->getContext(), "select.false.sink",
                             EndBlock->getParent(), EndBlock);
      FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
      FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
      for (Instruction *FalseInst : FalseSlicesInterleaved)
        FalseInst->moveBefore(FalseBranch);
    }
    // If there was nothing to sink, then arbitrarily choose the 'false' side
    // for a new input value to the PHI.
    if (TrueBlock == FalseBlock) {
      assert(TrueBlock == nullptr &&
             "Unexpected basic block transform while optimizing select");

      FalseBlock = BasicBlock::Create(StartBlock->getContext(), "select.false",
                                      EndBlock->getParent(), EndBlock);
      auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
      FalseBranch->setDebugLoc(SI.getI()->getDebugLoc());
    }

    // Insert the real conditional branch based on the original condition.
    // If we did not create a new block for one of the 'true' or 'false' paths
    // of the condition, it means that side of the branch goes to the end block
    // directly and the path originates from the start block from the point of
    // view of the new PHI.
    BasicBlock *TT, *FT;
    if (TrueBlock == nullptr) {
      TT = EndBlock;
      FT = FalseBlock;
      TrueBlock = StartBlock;
    } else if (FalseBlock == nullptr) {
      TT = TrueBlock;
      FT = EndBlock;
      FalseBlock = StartBlock;
    } else {
      TT = TrueBlock;
      FT = FalseBlock;
    }
    IRBuilder<> IB(SI.getI());
    auto *CondFr = IB.CreateFreeze(SI.getCondition(),
                                   SI.getCondition()->getName() + ".frozen");

    SmallPtrSet<const Instruction *, 2> INS;
    for (auto SI : ASI)
      INS.insert(SI.getI());

    // Use a reverse iterator because a later select may use the value of an
    // earlier select, and we need to propagate the value through the earlier
    // select to get the PHI operand.
    for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
      SelectLike SI = *It;
      // The select itself is replaced with a PHI Node.
      PHINode *PN = PHINode::Create(SI.getType(), 2, "");
      PN->insertBefore(EndBlock->begin());
      PN->takeName(SI.getI());
      PN->addIncoming(getTrueOrFalseValue(SI, true, INS, IB), TrueBlock);
      PN->addIncoming(getTrueOrFalseValue(SI, false, INS, IB), FalseBlock);
      PN->setDebugLoc(SI.getI()->getDebugLoc());
      SI.getI()->replaceAllUsesWith(PN);
      INS.erase(SI.getI());
      ++NumSelectsConverted;
    }
    IB.CreateCondBr(CondFr, TT, FT, SI.getI());

    // Remove the old select instructions, now that they are no longer used.
    for (auto SI : ASI)
      SI.getI()->eraseFromParent();
  }
}

void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
                                             SelectGroups &SIGroups) {
  BasicBlock::iterator BBIt = BB.begin();
  while (BBIt != BB.end()) {
    Instruction *I = &*BBIt++;
    if (SelectLike SI = SelectLike::match(I)) {
      if (!TTI->shouldTreatInstructionLikeSelect(I))
        continue;

      SelectGroup SIGroup;
      SIGroup.push_back(SI);
      while (BBIt != BB.end()) {
        Instruction *NI = &*BBIt;
        // Debug/pseudo instructions should be skipped and not prevent the
        // formation of a select group.
        if (NI->isDebugOrPseudoInst()) {
          ++BBIt;
          continue;
        }
        // We only allow selects in the same group, not other select-like
        // instructions.
        if (!isa<SelectInst>(NI))
          break;

        SelectLike NSI = SelectLike::match(NI);
        if (NSI && SI.getCondition() == NSI.getCondition()) {
          SIGroup.push_back(NSI);
        } else
          break;
        ++BBIt;
      }

      // If the select type is not supported, there is no point optimizing it.
      // Instruction selection will take care of it.
      if (!isSelectKindSupported(SI))
        continue;

      SIGroups.push_back(SIGroup);
    }
  }
}

void SelectOptimizeImpl::findProfitableSIGroupsBase(
    SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
  for (SelectGroup &ASI : SIGroups) {
    ++NumSelectOptAnalyzed;
    if (isConvertToBranchProfitableBase(ASI))
      ProfSIGroups.push_back(ASI);
  }
}

static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
                               DiagnosticInfoOptimizationBase &Rem) {
  LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
  ORE->emit(Rem);
}

void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
    const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
  NumSelectOptAnalyzed += SIGroups.size();
  // For each select group in an inner-most loop,
  // a branch is preferable to a select/conditional-move if:
  // i) conversion to branches for all the select groups of the loop satisfies
  //    loop-level heuristics, including reducing the loop's critical path by
  //    some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and
  // ii) the total cost of the select group is cheaper with a branch compared
  //     to its predicated version. The cost is in terms of latency and the
  //     cost of a select group is the cost of its most expensive select
  //     instruction (assuming infinite resources and thus fully leveraging
  //     available ILP).

  DenseMap<const Instruction *, CostInfo> InstCostMap;
  CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
                          {Scaled64::getZero(), Scaled64::getZero()}};
  if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
      !checkLoopHeuristics(L, LoopCost)) {
    return;
  }

  for (SelectGroup &ASI : SIGroups) {
    // Assuming infinite resources, the cost of a group of instructions is the
    // cost of the most expensive instruction of the group.
    Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
    for (SelectLike SI : ASI) {
      SelectCost = std::max(SelectCost, InstCostMap[SI.getI()].PredCost);
      BranchCost = std::max(BranchCost, InstCostMap[SI.getI()].NonPredCost);
    }
    if (BranchCost < SelectCost) {
      OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front().getI());
      OR << "Profitable to convert to branch (loop analysis). BranchCost="
         << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
         << ". ";
      EmitAndPrintRemark(ORE, OR);
      ++NumSelectConvertedLoop;
      ProfSIGroups.push_back(ASI);
    } else {
      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
                                      ASI.front().getI());
      ORmiss << "Select is more profitable (loop analysis). BranchCost="
             << BranchCost.toString()
             << ", SelectCost=" << SelectCost.toString() << ". ";
      EmitAndPrintRemark(ORE, ORmiss);
    }
  }
}

bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
    const SelectGroup &ASI) {
  SelectLike SI = ASI.front();
  LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI.getI()
                    << "\n");
  OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI.getI());
  OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI.getI());

  // Skip cold basic blocks. Better to optimize for size for cold blocks.
  if (PSI->isColdBlock(SI.getI()->getParent(), BFI)) {
    ++NumSelectColdBB;
    ORmiss << "Not converted to branch because of cold basic block. ";
    EmitAndPrintRemark(ORE, ORmiss);
    return false;
  }

  // If unpredictable, branch form is less profitable.
  if (SI.getI()->getMetadata(LLVMContext::MD_unpredictable)) {
    ++NumSelectUnPred;
    ORmiss << "Not converted to branch because of unpredictable branch. ";
    EmitAndPrintRemark(ORE, ORmiss);
    return false;
  }

  // If highly predictable, branch form is more profitable, unless a
  // predictable select is inexpensive in the target architecture.
  if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
    ++NumSelectConvertedHighPred;
    OR << "Converted to branch because of highly predictable branch. ";
    EmitAndPrintRemark(ORE, OR);
    return true;
  }

  // Look for expensive instructions in the cold operand's (if any) dependence
  // slice of any of the selects in the group.
  if (hasExpensiveColdOperand(ASI)) {
    ++NumSelectConvertedExpColdOperand;
    OR << "Converted to branch because of expensive cold operand.";
    EmitAndPrintRemark(ORE, OR);
    return true;
  }

  ORmiss << "Not profitable to convert to branch (base heuristic).";
  EmitAndPrintRemark(ORE, ORmiss);
  return false;
}

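// Round-to-nearest integer division; e.g., divideNearest(7, 2) == 4.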
static InstructionCost divideNearest(InstructionCost Numerator,
                                     uint64_t Denominator) {
  return (Numerator + (Denominator / 2)) / Denominator;
}

static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI,
                                 uint64_t &TrueVal, uint64_t &FalseVal) {
  if (isa<SelectInst>(SI.getI()))
    return extractBranchWeights(*SI.getI(), TrueVal, FalseVal);
  return false;
}

bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) {
  bool ColdOperand = false;
  uint64_t TrueWeight, FalseWeight, TotalWeight;
  if (extractBranchWeights(ASI.front(), TrueWeight, FalseWeight)) {
    uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
    TotalWeight = TrueWeight + FalseWeight;
    // Is there a path with frequency <ColdOperandThreshold% (default: 20%)?
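    // E.g., with branch weights TrueWeight=1 and FalseWeight=9 and the default
    // threshold of 20, 10 * 20 > 100 * 1 holds, so the true path is cold.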
    ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
  } else if (PSI->hasProfileSummary()) {
    OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
                                    ASI.front().getI());
    ORmiss << "Profile data available but missing branch-weights metadata for "
              "select instruction. ";
    EmitAndPrintRemark(ORE, ORmiss);
  }
  if (!ColdOperand)
    return false;
  // Check if the cold path's dependence slice is expensive for any of the
  // selects of the group.
  for (SelectLike SI : ASI) {
    Instruction *ColdI = nullptr;
    uint64_t HotWeight;
    if (TrueWeight < FalseWeight) {
      ColdI = dyn_cast_or_null<Instruction>(SI.getTrueValue());
      HotWeight = FalseWeight;
    } else {
      ColdI = dyn_cast_or_null<Instruction>(SI.getFalseValue());
      HotWeight = TrueWeight;
    }
    if (ColdI) {
      std::stack<Instruction *> ColdSlice;
      getExclBackwardsSlice(ColdI, ColdSlice, SI.getI());
      InstructionCost SliceCost = 0;
      while (!ColdSlice.empty()) {
        SliceCost += TTI->getInstructionCost(ColdSlice.top(),
                                             TargetTransformInfo::TCK_Latency);
        ColdSlice.pop();
      }
      // The colder the cold value operand of the select is, the more expensive
      // the cmov becomes for computing that value operand every time. Thus,
      // the colder the cold operand is, the more its cost counts.
      // Get the nearest integer cost adjusted for coldness.
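      // E.g., a slice with total latency 4 whose result is needed only 10% of
      // the time (HotWeight = 90, TotalWeight = 100) is charged
      // divideNearest(4 * 90, 100) = 4, i.e., nearly its full latency.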
      InstructionCost AdjSliceCost =
          divideNearest(SliceCost * HotWeight, TotalWeight);
      if (AdjSliceCost >=
          ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
        return true;
    }
  }
  return false;
}

// Check if it is safe to move LoadI next to the SI.
// Conservatively assume it is safe only if there is no instruction
// modifying memory in-between the load and the select instruction.
static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
  // Assume loads from different basic blocks are unsafe to move.
  if (LoadI->getParent() != SI->getParent())
    return false;
  auto It = LoadI->getIterator();
  while (&*It != SI) {
    if (It->mayWriteToMemory())
      return false;
    It++;
  }
  return true;
}

// For a given source instruction, collect its backwards dependence slice
// consisting of instructions exclusively computed for the purpose of producing
// the operands of the source instruction. As an approximation
// (sufficiently accurate in practice), we populate this set with the
// instructions of the backwards dependence slice that only have one use and
// form a one-use chain that leads to the source instruction.
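// For example, if the true operand of a select is `%t = shl i32 %x, 1` and
// both %t and %x are single-use, the slice for %t contains %x and %t (popped
// in def-before-use order); a multi-use %x would end the chain at %t.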
void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I,
                                               std::stack<Instruction *> &Slice,
                                               Instruction *SI,
                                               bool ForSinking) {
  SmallPtrSet<Instruction *, 2> Visited;
  std::queue<Instruction *> Worklist;
  Worklist.push(I);
  while (!Worklist.empty()) {
    Instruction *II = Worklist.front();
    Worklist.pop();

    // Avoid cycles.
    if (!Visited.insert(II).second)
      continue;

    if (!II->hasOneUse())
      continue;

    // Cannot soundly sink instructions with side-effects.
    // Terminator or phi instructions cannot be sunk.
    // Avoid sinking other select instructions (should be handled separately).
    if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
                       isa<SelectInst>(II) || isa<PHINode>(II)))
      continue;

    // Avoid sinking loads in order not to skip state-modifying instructions
    // that may alias with the loaded address.
    // Only allow sinking of loads within the same basic block that are
    // conservatively proven to be safe.
    if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
      continue;

    // Avoid considering instructions with less frequency than the source
    // instruction (i.e., avoid colder code regions of the dependence slice).
    if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
      continue;

    // Eligible one-use instruction added to the dependence slice.
    Slice.push(II);

    // Explore all the operands of the current instruction to expand the slice.
    for (unsigned k = 0; k < II->getNumOperands(); ++k)
      if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
        Worklist.push(OpI);
  }
}

bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) {
  uint64_t TrueWeight, FalseWeight;
  if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
    uint64_t Max = std::max(TrueWeight, FalseWeight);
    uint64_t Sum = TrueWeight + FalseWeight;
    if (Sum != 0) {
      auto Probability = BranchProbability::getBranchProbability(Max, Sum);
      if (Probability > TTI->getPredictableBranchThreshold())
        return true;
    }
  }
  return false;
}

bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L,
                                             const CostInfo LoopCost[2]) {
  // Loop-level checks to determine if a non-predicated version (with branches)
  // of the loop is more profitable than its predicated version.

  if (DisableLoopLevelHeuristics)
    return true;

  OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
                                   L->getHeader()->getFirstNonPHI());

  if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
      LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
    ORmissL << "No select conversion in the loop due to no reduction of loop's "
               "critical path. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
                      LoopCost[1].PredCost - LoopCost[1].NonPredCost};

  // Profitably converting to branches needs to reduce the loop's critical path
  // by at least some threshold (absolute gain of GainCycleThreshold cycles and
  // relative gain of 12.5%).
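  // E.g., with the default GainCycleThreshold = 4 and GainRelativeThreshold =
  // 8, a loop whose predicated critical path is 40 cycles must shrink by at
  // least max(4, 40 / 8) = 5 cycles to be considered.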
  if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
      Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
    Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
    ORmissL << "No select conversion in the loop due to small reduction of "
               "loop's critical path. Gain="
            << Gain[1].toString()
            << ", RelativeGain=" << RelativeGain.toString() << "%. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  // If the loop's critical path involves loop-carried dependences, the gradient
  // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
  // This check ensures that the latency reduction for the loop's critical path
  // keeps decreasing with sufficient rate beyond the two analyzed loop
  // iterations.
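  // E.g., if the first analyzed iteration gains 4 cycles and the second gains
  // 5 while the predicated critical path grows by 20 cycles between the two
  // iterations, the gradient is 100 * (5 - 4) / 20 = 5% < 25%, so the
  // conversion is rejected.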
  if (Gain[1] > Gain[0]) {
    Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
                            (LoopCost[1].PredCost - LoopCost[0].PredCost);
    if (GradientGain < Scaled64::get(GainGradientThreshold)) {
      ORmissL << "No select conversion in the loop due to small gradient gain. "
                 "GradientGain="
              << GradientGain.toString() << "%. ";
      EmitAndPrintRemark(ORE, ORmissL);
      return false;
    }
  }
  // If the gain decreases it is not profitable to convert.
  else if (Gain[1] < Gain[0]) {
    ORmissL
        << "No select conversion in the loop due to negative gradient gain. ";
    EmitAndPrintRemark(ORE, ORmissL);
    return false;
  }

  // Non-predicated version of the loop is more profitable than its
  // predicated version.
  return true;
}

// Computes instruction and loop-critical-path costs for both the predicated
// and non-predicated version of the given loop.
// Returns false if unable to compute these costs due to invalid cost of loop
// instruction(s).
bool SelectOptimizeImpl::computeLoopCosts(
    const Loop *L, const SelectGroups &SIGroups,
    DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
  LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
                    << L->getHeader()->getName() << "\n");
  const auto &SImap = getSImap(SIGroups);
  // Compute instruction and loop-critical-path costs across two iterations for
  // both predicated and non-predicated version.
  const unsigned Iterations = 2;
  for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
    // Cost of the loop's critical path.
    CostInfo &MaxCost = LoopCost[Iter];
    for (BasicBlock *BB : L->getBlocks()) {
      for (const Instruction &I : *BB) {
        if (I.isDebugOrPseudoInst())
          continue;
        // Compute the predicated and non-predicated cost of the instruction.
        Scaled64 IPredCost = Scaled64::getZero(),
                 INonPredCost = Scaled64::getZero();

        // Assume infinite resources that allow to fully exploit the available
        // instruction-level parallelism.
        // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
        for (const Use &U : I.operands()) {
          auto UI = dyn_cast<Instruction>(U.get());
          if (!UI)
            continue;
          if (InstCostMap.count(UI)) {
            IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
            INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
          }
        }
        auto ILatency = computeInstCost(&I);
        if (!ILatency) {
          OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
          ORmissL << "Invalid instruction cost preventing analysis and "
                     "optimization of the inner-most loop containing this "
                     "instruction. ";
          EmitAndPrintRemark(ORE, ORmissL);
          return false;
        }
        IPredCost += Scaled64::get(*ILatency);
        INonPredCost += Scaled64::get(*ILatency);

        // For a select that can be converted to branch,
        // compute its cost as a branch (non-predicated cost).
        //
        // BranchCost = PredictedPathCost + MispredictCost
        // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
        // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
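        // E.g., with a 14-cycle MispredictPenalty, the default 25% mispredict
        // rate, and a negligible condition cost, the branch form is charged an
        // extra max(14, 0) * 25 / 100 = 3.5 cycles of misprediction cost.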
        if (SImap.contains(&I)) {
          auto SI = SImap.at(&I);
          Scaled64 TrueOpCost = SI.getTrueOpCost(InstCostMap, TTI);
          Scaled64 FalseOpCost = SI.getFalseOpCost(InstCostMap, TTI);
          Scaled64 PredictedPathCost =
              getPredictedPathCost(TrueOpCost, FalseOpCost, SI);

          Scaled64 CondCost = Scaled64::getZero();
          if (auto *CI = dyn_cast<Instruction>(SI.getCondition()))
            if (InstCostMap.count(CI))
              CondCost = InstCostMap[CI].NonPredCost;
          Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);

          INonPredCost = PredictedPathCost + MispredictCost;
        }
        LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
                          << INonPredCost << " for " << I << "\n");

        InstCostMap[&I] = {IPredCost, INonPredCost};
        MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
        MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
      }
    }
    LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
                      << " MaxCost = " << MaxCost.PredCost << " "
                      << MaxCost.NonPredCost << "\n");
  }
  return true;
}

SmallDenseMap<const Instruction *, SelectOptimizeImpl::SelectLike, 2>
SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) {
  SmallDenseMap<const Instruction *, SelectLike, 2> SImap;
  for (const SelectGroup &ASI : SIGroups)
    for (SelectLike SI : ASI)
      SImap.try_emplace(SI.getI(), SI);
  return SImap;
}

std::optional<uint64_t>
SelectOptimizeImpl::computeInstCost(const Instruction *I) {
  InstructionCost ICost =
      TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
  if (auto OC = ICost.getValue())
    return std::optional<uint64_t>(*OC);
  return std::nullopt;
}

ScaledNumber<uint64_t>
SelectOptimizeImpl::getMispredictionCost(const SelectLike SI,
                                         const Scaled64 CondCost) {
  uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;

  // Account for the default misprediction rate when using a branch
  // (conservatively set to 25% by default).
  uint64_t MispredictRate = MispredictDefaultRate;
  // If the select condition is obviously predictable, then the misprediction
  // rate is zero.
  if (isSelectHighlyPredictable(SI))
    MispredictRate = 0;

  // CondCost is included to account for cases where the computation of the
  // condition is part of a long dependence chain (potentially loop-carried)
  // that would delay detection of a misprediction and increase its cost.
  Scaled64 MispredictCost =
      std::max(Scaled64::get(MispredictPenalty), CondCost) *
      Scaled64::get(MispredictRate);
  MispredictCost /= Scaled64::get(100);

  return MispredictCost;
}

// Returns the cost of a branch when the prediction is correct.
// TrueCost * TrueProbability + FalseCost * FalseProbability.
ScaledNumber<uint64_t>
SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
                                         const SelectLike SI) {
  Scaled64 PredPathCost;
  uint64_t TrueWeight, FalseWeight;
  if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
    uint64_t SumWeight = TrueWeight + FalseWeight;
    if (SumWeight != 0) {
      PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
                     FalseCost * Scaled64::get(FalseWeight);
      PredPathCost /= Scaled64::get(SumWeight);
      return PredPathCost;
    }
  }
  // Without branch weight metadata, we assume 75% for the one path and 25% for
  // the other, and pick the result with the biggest cost.
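  // E.g., TrueCost = 8 and FalseCost = 2 give
  // max(8 * 3 + 2, 2 * 3 + 8) / 4 = 26 / 4 = 6.5.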
  PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
                          FalseCost * Scaled64::get(3) + TrueCost);
  PredPathCost /= Scaled64::get(4);
  return PredPathCost;
}

bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) {
  bool VectorCond = !SI.getCondition()->getType()->isIntegerTy(1);
  if (VectorCond)
    return false;
  TargetLowering::SelectSupportKind SelectKind;
  if (SI.getType()->isVectorTy())
    SelectKind = TargetLowering::ScalarCondVectorVal;
  else
    SelectKind = TargetLowering::ScalarValSelect;
  return TLI->isSelectSupported(SelectKind);
}

