1 | //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass converts selects to conditional jumps when profitable. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/CodeGen/SelectOptimize.h" |
14 | #include "llvm/ADT/SmallVector.h" |
15 | #include "llvm/ADT/Statistic.h" |
16 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
17 | #include "llvm/Analysis/BranchProbabilityInfo.h" |
18 | #include "llvm/Analysis/LoopInfo.h" |
19 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
20 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
21 | #include "llvm/Analysis/TargetTransformInfo.h" |
22 | #include "llvm/CodeGen/Passes.h" |
23 | #include "llvm/CodeGen/TargetLowering.h" |
24 | #include "llvm/CodeGen/TargetPassConfig.h" |
25 | #include "llvm/CodeGen/TargetSchedule.h" |
26 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
27 | #include "llvm/IR/BasicBlock.h" |
28 | #include "llvm/IR/Dominators.h" |
29 | #include "llvm/IR/Function.h" |
30 | #include "llvm/IR/IRBuilder.h" |
31 | #include "llvm/IR/Instruction.h" |
32 | #include "llvm/IR/PatternMatch.h" |
33 | #include "llvm/IR/ProfDataUtils.h" |
34 | #include "llvm/InitializePasses.h" |
35 | #include "llvm/Pass.h" |
36 | #include "llvm/Support/ScaledNumber.h" |
37 | #include "llvm/Target/TargetMachine.h" |
38 | #include "llvm/Transforms/Utils/SizeOpts.h" |
39 | #include <algorithm> |
40 | #include <memory> |
41 | #include <queue> |
42 | #include <stack> |
43 | |
44 | using namespace llvm; |
45 | using namespace llvm::PatternMatch; |
46 | |
47 | #define DEBUG_TYPE "select-optimize" |
48 | |
49 | STATISTIC(NumSelectOptAnalyzed, |
50 | "Number of select groups considered for conversion to branch" ); |
51 | STATISTIC(NumSelectConvertedExpColdOperand, |
52 | "Number of select groups converted due to expensive cold operand" ); |
53 | STATISTIC(NumSelectConvertedHighPred, |
54 | "Number of select groups converted due to high-predictability" ); |
55 | STATISTIC(NumSelectUnPred, |
56 | "Number of select groups not converted due to unpredictability" ); |
57 | STATISTIC(NumSelectColdBB, |
58 | "Number of select groups not converted due to cold basic block" ); |
59 | STATISTIC(NumSelectConvertedLoop, |
60 | "Number of select groups converted due to loop-level analysis" ); |
61 | STATISTIC(NumSelectsConverted, "Number of selects converted" ); |
62 | |
static cl::opt<unsigned> ColdOperandThreshold(
    "cold-operand-threshold",
    cl::desc("Maximum frequency of path for an operand to be considered cold."),
    cl::init(20), cl::Hidden);

static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
    "cold-operand-max-cost-multiplier",
    cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
             "slice of a cold operand to be considered inexpensive."),
    cl::init(1), cl::Hidden);

static cl::opt<unsigned>
    GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
                          cl::desc("Gradient gain threshold (%)."),
                          cl::init(25), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<unsigned> GainRelativeThreshold(
    "select-opti-loop-relative-gain-threshold",
    cl::desc(
        "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
    cl::init(8), cl::Hidden);

static cl::opt<unsigned> MispredictDefaultRate(
    "mispredict-default-rate", cl::Hidden, cl::init(25),
    cl::desc("Default mispredict rate (initialized to 25%)."));

static cl::opt<bool>
    DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
                               cl::init(false),
                               cl::desc("Disable loop-level heuristics."));
98 | |
99 | namespace { |
100 | |
101 | class SelectOptimizeImpl { |
102 | const TargetMachine *TM = nullptr; |
103 | const TargetSubtargetInfo *TSI = nullptr; |
104 | const TargetLowering *TLI = nullptr; |
105 | const TargetTransformInfo *TTI = nullptr; |
106 | const LoopInfo *LI = nullptr; |
107 | BlockFrequencyInfo *BFI; |
108 | ProfileSummaryInfo *PSI = nullptr; |
109 | OptimizationRemarkEmitter *ORE = nullptr; |
110 | TargetSchedModel TSchedModel; |
111 | |
112 | public: |
113 | SelectOptimizeImpl() = default; |
114 | SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){}; |
115 | PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); |
116 | bool runOnFunction(Function &F, Pass &P); |
117 | |
118 | using Scaled64 = ScaledNumber<uint64_t>; |
119 | |
120 | struct CostInfo { |
121 | /// Predicated cost (with selects as conditional moves). |
122 | Scaled64 PredCost; |
123 | /// Non-predicated cost (with selects converted to branches). |
124 | Scaled64 NonPredCost; |
125 | }; |
126 | |
127 | /// SelectLike is an abstraction over SelectInst and other operations that can |
128 | /// act like selects. For example Or(Zext(icmp), X) can be treated like |
129 | /// select(icmp, X|1, X). |
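  /// For illustration, an IR sequence such as
  ///   %c = icmp eq i32 %a, 0
  ///   %z = zext i1 %c to i32
  ///   %r = or i32 %z, %x
  /// acts like a select on %c between the values %x|1 and %x, so it can be
  /// analyzed with the same machinery as a real select instruction.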
130 | class SelectLike { |
131 | SelectLike(Instruction *I) : I(I) {} |
132 | |
133 | Instruction *I; |
134 | |
135 | public: |
136 | /// Match a select or select-like instruction, returning a SelectLike. |
137 | static SelectLike match(Instruction *I) { |
      // Select instructions are what we are usually looking for.
      if (isa<SelectInst>(I))
        return SelectLike(I);

      // An Or(zext(i1 X), Y) can also be treated like a select, with
      // condition X and values Y|1 and Y.
      Value *X;
      if (PatternMatch::match(
              I, m_c_Or(m_OneUse(m_ZExt(m_Value(X))), m_Value())) &&
          X->getType()->isIntegerTy(1))
        return SelectLike(I);
149 | |
150 | return SelectLike(nullptr); |
151 | } |
152 | |
153 | bool isValid() { return I; } |
154 | operator bool() { return isValid(); } |
155 | |
156 | Instruction *getI() { return I; } |
157 | const Instruction *getI() const { return I; } |
158 | |
159 | Type *getType() const { return I->getType(); } |
160 | |
161 | /// Return the condition for the SelectLike instruction. For example the |
162 | /// condition of a select or c in `or(zext(c), x)` |
163 | Value *getCondition() const { |
164 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
165 | return Sel->getCondition(); |
166 | // Or(zext) case |
167 | if (auto *BO = dyn_cast<BinaryOperator>(Val: I)) { |
168 | Value *X; |
169 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 0), |
170 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
171 | return X; |
172 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 1), |
173 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
174 | return X; |
175 | } |
176 | |
177 | llvm_unreachable("Unhandled case in getCondition" ); |
178 | } |
179 | |
180 | /// Return the true value for the SelectLike instruction. Note this may not |
181 | /// exist for all SelectLike instructions. For example, for `or(zext(c), x)` |
182 | /// the true value would be `or(x,1)`. As this value does not exist, nullptr |
183 | /// is returned. |
184 | Value *getTrueValue() const { |
185 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
186 | return Sel->getTrueValue(); |
      // Or(zext) case - the true value would be the non-zext operand or'd
      // with 1; that value does not exist yet, so return nullptr.
189 | if (isa<BinaryOperator>(Val: I)) |
190 | return nullptr; |
191 | |
192 | llvm_unreachable("Unhandled case in getTrueValue" ); |
193 | } |
194 | |
195 | /// Return the false value for the SelectLike instruction. For example the |
196 | /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is |
197 | /// `select(c, x|1, x)`) |
198 | Value *getFalseValue() const { |
199 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
200 | return Sel->getFalseValue(); |
201 | // Or(zext) case - return the operand which is not the zext. |
202 | if (auto *BO = dyn_cast<BinaryOperator>(Val: I)) { |
203 | Value *X; |
204 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 0), |
205 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
206 | return BO->getOperand(i_nocapture: 1); |
207 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 1), |
208 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
209 | return BO->getOperand(i_nocapture: 0); |
210 | } |
211 | |
212 | llvm_unreachable("Unhandled case in getFalseValue" ); |
213 | } |
214 | |
215 | /// Return the NonPredCost cost of the true op, given the costs in |
216 | /// InstCostMap. This may need to be generated for select-like instructions. |
217 | Scaled64 getTrueOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap, |
218 | const TargetTransformInfo *TTI) { |
219 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
220 | if (auto *I = dyn_cast<Instruction>(Val: Sel->getTrueValue())) |
221 | return InstCostMap.contains(Val: I) ? InstCostMap[I].NonPredCost |
222 | : Scaled64::getZero(); |
223 | |
224 | // Or case - add the cost of an extra Or to the cost of the False case. |
225 | if (isa<BinaryOperator>(Val: I)) |
226 | if (auto I = dyn_cast<Instruction>(Val: getFalseValue())) |
227 | if (InstCostMap.contains(Val: I)) { |
            InstructionCost OrCost = TTI->getArithmeticInstrCost(
                Instruction::Or, I->getType(),
                TargetTransformInfo::TCK_Latency,
                {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
                {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2});
            return InstCostMap[I].NonPredCost +
                   Scaled64::get(*OrCost.getValue());
235 | } |
236 | |
237 | return Scaled64::getZero(); |
238 | } |
239 | |
240 | /// Return the NonPredCost cost of the false op, given the costs in |
241 | /// InstCostMap. This may need to be generated for select-like instructions. |
242 | Scaled64 |
243 | getFalseOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap, |
244 | const TargetTransformInfo *TTI) { |
245 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
246 | if (auto *I = dyn_cast<Instruction>(Val: Sel->getFalseValue())) |
247 | return InstCostMap.contains(Val: I) ? InstCostMap[I].NonPredCost |
248 | : Scaled64::getZero(); |
249 | |
250 | // Or case - return the cost of the false case |
251 | if (isa<BinaryOperator>(Val: I)) |
252 | if (auto I = dyn_cast<Instruction>(Val: getFalseValue())) |
253 | if (InstCostMap.contains(Val: I)) |
254 | return InstCostMap[I].NonPredCost; |
255 | |
256 | return Scaled64::getZero(); |
257 | } |
258 | }; |
259 | |
260 | private: |
261 | // Select groups consist of consecutive select instructions with the same |
262 | // condition. |
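  // For example, two consecutive selects sharing the condition %cond, such as
  //   %x = select i1 %cond, i32 %a, i32 %b
  //   %y = select i1 %cond, i64 %c, i64 %d
  // form a single group and are converted (or kept) together.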
263 | using SelectGroup = SmallVector<SelectLike, 2>; |
264 | using SelectGroups = SmallVector<SelectGroup, 2>; |
265 | |
266 | // Converts select instructions of a function to conditional jumps when deemed |
267 | // profitable. Returns true if at least one select was converted. |
268 | bool optimizeSelects(Function &F); |
269 | |
  // Heuristics for determining which select instructions can be profitably
  // converted to branches. Separate heuristics for selects in inner-most loops
  // and the rest of the code regions (base heuristics for non-inner-most loop
  // regions).
274 | void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups); |
275 | void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups); |
276 | |
277 | // Converts to branches the select groups that were deemed |
278 | // profitable-to-convert. |
279 | void convertProfitableSIGroups(SelectGroups &ProfSIGroups); |
280 | |
281 | // Splits selects of a given basic block into select groups. |
282 | void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups); |
283 | |
  // Determines which select groups are profitable to convert to branches
  // (base and inner-most-loop heuristics).
286 | void findProfitableSIGroupsBase(SelectGroups &SIGroups, |
287 | SelectGroups &ProfSIGroups); |
288 | void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups, |
289 | SelectGroups &ProfSIGroups); |
290 | |
291 | // Determines if a select group should be converted to a branch (base |
292 | // heuristics). |
293 | bool isConvertToBranchProfitableBase(const SelectGroup &ASI); |
294 | |
295 | // Returns true if there are expensive instructions in the cold value |
296 | // operand's (if any) dependence slice of any of the selects of the given |
297 | // group. |
298 | bool hasExpensiveColdOperand(const SelectGroup &ASI); |
299 | |
300 | // For a given source instruction, collect its backwards dependence slice |
301 | // consisting of instructions exclusively computed for producing the operands |
302 | // of the source instruction. |
303 | void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice, |
304 | Instruction *SI, bool ForSinking = false); |
305 | |
306 | // Returns true if the condition of the select is highly predictable. |
307 | bool isSelectHighlyPredictable(const SelectLike SI); |
308 | |
309 | // Loop-level checks to determine if a non-predicated version (with branches) |
310 | // of the given loop is more profitable than its predicated version. |
311 | bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]); |
312 | |
313 | // Computes instruction and loop-critical-path costs for both the predicated |
314 | // and non-predicated version of the given loop. |
315 | bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups, |
316 | DenseMap<const Instruction *, CostInfo> &InstCostMap, |
317 | CostInfo *LoopCost); |
318 | |
319 | // Returns a set of all the select instructions in the given select groups. |
320 | SmallDenseMap<const Instruction *, SelectLike, 2> |
321 | getSImap(const SelectGroups &SIGroups); |
322 | |
323 | // Returns the latency cost of a given instruction. |
324 | std::optional<uint64_t> computeInstCost(const Instruction *I); |
325 | |
326 | // Returns the misprediction cost of a given select when converted to branch. |
327 | Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost); |
328 | |
329 | // Returns the cost of a branch when the prediction is correct. |
330 | Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, |
331 | const SelectLike SI); |
332 | |
333 | // Returns true if the target architecture supports lowering a given select. |
334 | bool isSelectKindSupported(const SelectLike SI); |
335 | }; |
336 | |
337 | class SelectOptimize : public FunctionPass { |
338 | SelectOptimizeImpl Impl; |
339 | |
340 | public: |
341 | static char ID; |
342 | |
343 | SelectOptimize() : FunctionPass(ID) { |
344 | initializeSelectOptimizePass(*PassRegistry::getPassRegistry()); |
345 | } |
346 | |
347 | bool runOnFunction(Function &F) override { |
348 | return Impl.runOnFunction(F, P&: *this); |
349 | } |
350 | |
351 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
352 | AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
353 | AU.addRequired<TargetPassConfig>(); |
354 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
355 | AU.addRequired<LoopInfoWrapperPass>(); |
356 | AU.addRequired<BlockFrequencyInfoWrapperPass>(); |
357 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
358 | } |
359 | }; |
360 | |
361 | } // namespace |
362 | |
363 | PreservedAnalyses SelectOptimizePass::run(Function &F, |
364 | FunctionAnalysisManager &FAM) { |
365 | SelectOptimizeImpl Impl(TM); |
366 | return Impl.run(F, FAM); |
367 | } |
368 | |
369 | char SelectOptimize::ID = 0; |
370 | |
371 | INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects" , false, |
372 | false) |
373 | INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) |
374 | INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) |
375 | INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) |
376 | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) |
377 | INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) |
378 | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) |
379 | INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects" , false, |
380 | false) |
381 | |
382 | FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); } |
383 | |
384 | PreservedAnalyses SelectOptimizeImpl::run(Function &F, |
385 | FunctionAnalysisManager &FAM) { |
386 | TSI = TM->getSubtargetImpl(F); |
387 | TLI = TSI->getTargetLowering(); |
388 | |
389 | // If none of the select types are supported then skip this pass. |
390 | // This is an optimization pass. Legality issues will be handled by |
391 | // instruction selection. |
392 | if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) && |
393 | !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) && |
394 | !TLI->isSelectSupported(TargetLowering::VectorMaskSelect)) |
395 | return PreservedAnalyses::all(); |
396 | |
397 | TTI = &FAM.getResult<TargetIRAnalysis>(IR&: F); |
398 | if (!TTI->enableSelectOptimize()) |
399 | return PreservedAnalyses::all(); |
400 | |
401 | PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F) |
402 | .getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
403 | assert(PSI && "This pass requires module analysis pass `profile-summary`!" ); |
404 | BFI = &FAM.getResult<BlockFrequencyAnalysis>(IR&: F); |
405 | |
406 | // When optimizing for size, selects are preferable over branches. |
407 | if (F.hasOptSize() || llvm::shouldOptimizeForSize(F: &F, PSI, BFI)) |
408 | return PreservedAnalyses::all(); |
409 | |
410 | LI = &FAM.getResult<LoopAnalysis>(IR&: F); |
411 | ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
412 | TSchedModel.init(TSInfo: TSI); |
413 | |
414 | bool Changed = optimizeSelects(F); |
415 | return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); |
416 | } |
417 | |
418 | bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) { |
419 | TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); |
420 | TSI = TM->getSubtargetImpl(F); |
421 | TLI = TSI->getTargetLowering(); |
422 | |
423 | // If none of the select types are supported then skip this pass. |
424 | // This is an optimization pass. Legality issues will be handled by |
425 | // instruction selection. |
426 | if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) && |
427 | !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) && |
428 | !TLI->isSelectSupported(TargetLowering::VectorMaskSelect)) |
429 | return false; |
430 | |
431 | TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
432 | |
433 | if (!TTI->enableSelectOptimize()) |
434 | return false; |
435 | |
436 | LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
437 | BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); |
438 | PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
439 | ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
440 | TSchedModel.init(TSInfo: TSI); |
441 | |
442 | // When optimizing for size, selects are preferable over branches. |
443 | if (F.hasOptSize() || llvm::shouldOptimizeForSize(F: &F, PSI, BFI)) |
444 | return false; |
445 | |
446 | return optimizeSelects(F); |
447 | } |
448 | |
449 | bool SelectOptimizeImpl::optimizeSelects(Function &F) { |
  // Determine which select groups are profitable to convert to branches.
451 | SelectGroups ProfSIGroups; |
452 | // Base heuristics apply only to non-loops and outer loops. |
453 | optimizeSelectsBase(F, ProfSIGroups); |
454 | // Separate heuristics for inner-most loops. |
455 | optimizeSelectsInnerLoops(F, ProfSIGroups); |
456 | |
457 | // Convert to branches the select groups that were deemed |
458 | // profitable-to-convert. |
459 | convertProfitableSIGroups(ProfSIGroups); |
460 | |
461 | // Code modified if at least one select group was converted. |
462 | return !ProfSIGroups.empty(); |
463 | } |
464 | |
465 | void SelectOptimizeImpl::optimizeSelectsBase(Function &F, |
466 | SelectGroups &ProfSIGroups) { |
467 | // Collect all the select groups. |
468 | SelectGroups SIGroups; |
469 | for (BasicBlock &BB : F) { |
470 | // Base heuristics apply only to non-loops and outer loops. |
471 | Loop *L = LI->getLoopFor(BB: &BB); |
472 | if (L && L->isInnermost()) |
473 | continue; |
474 | collectSelectGroups(BB, SIGroups); |
475 | } |
476 | |
  // Determine which select groups are profitable to convert to branches.
478 | findProfitableSIGroupsBase(SIGroups, ProfSIGroups); |
479 | } |
480 | |
481 | void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F, |
482 | SelectGroups &ProfSIGroups) { |
483 | SmallVector<Loop *, 4> Loops(LI->begin(), LI->end()); |
484 | // Need to check size on each iteration as we accumulate child loops. |
485 | for (unsigned long i = 0; i < Loops.size(); ++i) |
486 | for (Loop *ChildL : Loops[i]->getSubLoops()) |
487 | Loops.push_back(Elt: ChildL); |
488 | |
489 | for (Loop *L : Loops) { |
490 | if (!L->isInnermost()) |
491 | continue; |
492 | |
493 | SelectGroups SIGroups; |
494 | for (BasicBlock *BB : L->getBlocks()) |
495 | collectSelectGroups(BB&: *BB, SIGroups); |
496 | |
497 | findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups); |
498 | } |
499 | } |
500 | |
501 | /// If \p isTrue is true, return the true value of \p SI, otherwise return |
502 | /// false value of \p SI. If the true/false value of \p SI is defined by any |
503 | /// select instructions in \p Selects, look through the defining select |
504 | /// instruction until the true/false value is not defined in \p Selects. |
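/// For example, in the group
///   %s1 = select i1 %c, i32 %a, i32 %b
///   %s2 = select i1 %c, i32 %s1, i32 %d
/// requesting the true value of %s2 looks through %s1 and yields %a.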
505 | static Value * |
506 | getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue, |
507 | const SmallPtrSet<const Instruction *, 2> &Selects, |
508 | IRBuilder<> &IB) { |
509 | Value *V = nullptr; |
510 | for (SelectInst *DefSI = dyn_cast<SelectInst>(Val: SI.getI()); |
511 | DefSI != nullptr && Selects.count(Ptr: DefSI); |
512 | DefSI = dyn_cast<SelectInst>(Val: V)) { |
513 | assert(DefSI->getCondition() == SI.getCondition() && |
514 | "The condition of DefSI does not match with SI" ); |
515 | V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue()); |
516 | } |
517 | |
518 | if (isa<BinaryOperator>(Val: SI.getI())) { |
519 | assert(SI.getI()->getOpcode() == Instruction::Or && |
520 | "Only currently handling Or instructions." ); |
521 | V = SI.getFalseValue(); |
522 | if (isTrue) |
523 | V = IB.CreateOr(LHS: V, RHS: ConstantInt::get(Ty: V->getType(), V: 1)); |
524 | } |
525 | |
526 | assert(V && "Failed to get select true/false value" ); |
527 | return V; |
528 | } |
529 | |
530 | void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { |
531 | for (SelectGroup &ASI : ProfSIGroups) { |
532 | // The code transformation here is a modified version of the sinking |
533 | // transformation in CodeGenPrepare::optimizeSelectInst with a more |
534 | // aggressive strategy of which instructions to sink. |
535 | // |
536 | // TODO: eliminate the redundancy of logic transforming selects to branches |
537 | // by removing CodeGenPrepare::optimizeSelectInst and optimizing here |
538 | // selects for all cases (with and without profile information). |
539 | |
540 | // Transform a sequence like this: |
541 | // start: |
542 | // %cmp = cmp uge i32 %a, %b |
543 | // %sel = select i1 %cmp, i32 %c, i32 %d |
544 | // |
545 | // Into: |
546 | // start: |
547 | // %cmp = cmp uge i32 %a, %b |
548 | // %cmp.frozen = freeze %cmp |
549 | // br i1 %cmp.frozen, label %select.true, label %select.false |
550 | // select.true: |
551 | // br label %select.end |
552 | // select.false: |
553 | // br label %select.end |
554 | // select.end: |
555 | // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] |
556 | // |
557 | // %cmp should be frozen, otherwise it may introduce undefined behavior. |
558 | // In addition, we may sink instructions that produce %c or %d into the |
559 | // destination(s) of the new branch. |
    // If the true or false blocks do not contain a sunk instruction, that
    // block and its branch may be optimized away. In that case, one side of
    // the first branch will point directly to select.end, and the
    // corresponding PHI predecessor block will be the start block.
564 | |
565 | // Find all the instructions that can be soundly sunk to the true/false |
566 | // blocks. These are instructions that are computed solely for producing the |
567 | // operands of the select instructions in the group and can be sunk without |
568 | // breaking the semantics of the LLVM IR (e.g., cannot sink instructions |
569 | // with side effects). |
570 | SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices; |
571 | typedef std::stack<Instruction *>::size_type StackSizeType; |
572 | StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0; |
573 | for (SelectLike SI : ASI) { |
574 | // For each select, compute the sinkable dependence chains of the true and |
575 | // false operands. |
576 | if (auto *TI = dyn_cast_or_null<Instruction>(Val: SI.getTrueValue())) { |
577 | std::stack<Instruction *> TrueSlice; |
578 | getExclBackwardsSlice(I: TI, Slice&: TrueSlice, SI: SI.getI(), ForSinking: true); |
579 | maxTrueSliceLen = std::max(a: maxTrueSliceLen, b: TrueSlice.size()); |
580 | TrueSlices.push_back(Elt: TrueSlice); |
581 | } |
582 | if (auto *FI = dyn_cast_or_null<Instruction>(Val: SI.getFalseValue())) { |
583 | if (isa<SelectInst>(Val: SI.getI()) || !FI->hasOneUse()) { |
584 | std::stack<Instruction *> FalseSlice; |
585 | getExclBackwardsSlice(I: FI, Slice&: FalseSlice, SI: SI.getI(), ForSinking: true); |
586 | maxFalseSliceLen = std::max(a: maxFalseSliceLen, b: FalseSlice.size()); |
587 | FalseSlices.push_back(Elt: FalseSlice); |
588 | } |
589 | } |
590 | } |
591 | // In the case of multiple select instructions in the same group, the order |
592 | // of non-dependent instructions (instructions of different dependence |
593 | // slices) in the true/false blocks appears to affect performance. |
594 | // Interleaving the slices seems to experimentally be the optimal approach. |
    // This interleaved scheduling allows for more ILP (with the natural
    // downside of slightly increased register pressure) compared to simply
    // emitting one whole chain after another. One would expect the ordering
    // not to matter, since the backend scheduler should take care of it, but
    // in practice the scheduler fails to deliver optimal ILP with a naive
    // ordering here.
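    // For example, two sinkable dependence slices {a1, a2} and {b1, b2} are
    // emitted in the order a1, b1, a2, b2 rather than a1, a2, b1, b2.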
601 | SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved; |
602 | for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) { |
603 | for (auto &S : TrueSlices) { |
604 | if (!S.empty()) { |
605 | TrueSlicesInterleaved.push_back(Elt: S.top()); |
606 | S.pop(); |
607 | } |
608 | } |
609 | } |
610 | for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) { |
611 | for (auto &S : FalseSlices) { |
612 | if (!S.empty()) { |
613 | FalseSlicesInterleaved.push_back(Elt: S.top()); |
614 | S.pop(); |
615 | } |
616 | } |
617 | } |
618 | |
619 | // We split the block containing the select(s) into two blocks. |
620 | SelectLike SI = ASI.front(); |
621 | SelectLike LastSI = ASI.back(); |
622 | BasicBlock *StartBlock = SI.getI()->getParent(); |
623 | BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI())); |
624 | // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With |
625 | // RemoveDIs turned on, SplitPt would instead point to the next |
626 | // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs, |
627 | // tell splitBasicBlock that we want to include any DbgVariableRecords |
628 | // attached to SplitPt in the splice. |
629 | SplitPt.setHeadBit(true); |
630 | BasicBlock *EndBlock = StartBlock->splitBasicBlock(I: SplitPt, BBName: "select.end" ); |
631 | BFI->setBlockFreq(BB: EndBlock, Freq: BFI->getBlockFreq(BB: StartBlock)); |
632 | // Delete the unconditional branch that was just created by the split. |
633 | StartBlock->getTerminator()->eraseFromParent(); |
634 | |
635 | // Move any debug/pseudo instructions that were in-between the select |
636 | // group to the newly-created end block. |
637 | SmallVector<Instruction *, 2> DebugPseudoINS; |
638 | auto DIt = SI.getI()->getIterator(); |
639 | while (&*DIt != LastSI.getI()) { |
640 | if (DIt->isDebugOrPseudoInst()) |
641 | DebugPseudoINS.push_back(Elt: &*DIt); |
642 | DIt++; |
643 | } |
644 | for (auto *DI : DebugPseudoINS) { |
645 | DI->moveBeforePreserving(MovePos: &*EndBlock->getFirstInsertionPt()); |
646 | } |
647 | |
648 | // Duplicate implementation for DbgRecords, the non-instruction debug-info |
649 | // format. Helper lambda for moving DbgRecords to the end block. |
650 | auto TransferDbgRecords = [&](Instruction &I) { |
651 | for (auto &DbgRecord : |
652 | llvm::make_early_inc_range(Range: I.getDbgRecordRange())) { |
653 | DbgRecord.removeFromParent(); |
654 | EndBlock->insertDbgRecordBefore(DR: &DbgRecord, |
655 | Here: EndBlock->getFirstInsertionPt()); |
656 | } |
657 | }; |
658 | |
659 | // Iterate over all instructions in between SI and LastSI, not including |
660 | // SI itself. These are all the variable assignments that happen "in the |
661 | // middle" of the select group. |
662 | auto R = make_range(x: std::next(x: SI.getI()->getIterator()), |
663 | y: std::next(x: LastSI.getI()->getIterator())); |
664 | llvm::for_each(Range&: R, F: TransferDbgRecords); |
665 | |
666 | // These are the new basic blocks for the conditional branch. |
667 | // At least one will become an actual new basic block. |
668 | BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr; |
669 | BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr; |
670 | if (!TrueSlicesInterleaved.empty()) { |
671 | TrueBlock = BasicBlock::Create(Context&: EndBlock->getContext(), Name: "select.true.sink" , |
672 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
673 | TrueBranch = BranchInst::Create(IfTrue: EndBlock, InsertAtEnd: TrueBlock); |
674 | TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc()); |
675 | for (Instruction *TrueInst : TrueSlicesInterleaved) |
676 | TrueInst->moveBefore(MovePos: TrueBranch); |
677 | } |
678 | if (!FalseSlicesInterleaved.empty()) { |
679 | FalseBlock = |
680 | BasicBlock::Create(Context&: EndBlock->getContext(), Name: "select.false.sink" , |
681 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
682 | FalseBranch = BranchInst::Create(IfTrue: EndBlock, InsertAtEnd: FalseBlock); |
683 | FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc()); |
684 | for (Instruction *FalseInst : FalseSlicesInterleaved) |
685 | FalseInst->moveBefore(MovePos: FalseBranch); |
686 | } |
687 | // If there was nothing to sink, then arbitrarily choose the 'false' side |
688 | // for a new input value to the PHI. |
689 | if (TrueBlock == FalseBlock) { |
690 | assert(TrueBlock == nullptr && |
691 | "Unexpected basic block transform while optimizing select" ); |
692 | |
693 | FalseBlock = BasicBlock::Create(Context&: StartBlock->getContext(), Name: "select.false" , |
694 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
695 | auto *FalseBranch = BranchInst::Create(IfTrue: EndBlock, InsertAtEnd: FalseBlock); |
696 | FalseBranch->setDebugLoc(SI.getI()->getDebugLoc()); |
697 | } |
698 | |
699 | // Insert the real conditional branch based on the original condition. |
700 | // If we did not create a new block for one of the 'true' or 'false' paths |
701 | // of the condition, it means that side of the branch goes to the end block |
702 | // directly and the path originates from the start block from the point of |
703 | // view of the new PHI. |
704 | BasicBlock *TT, *FT; |
705 | if (TrueBlock == nullptr) { |
706 | TT = EndBlock; |
707 | FT = FalseBlock; |
708 | TrueBlock = StartBlock; |
709 | } else if (FalseBlock == nullptr) { |
710 | TT = TrueBlock; |
711 | FT = EndBlock; |
712 | FalseBlock = StartBlock; |
713 | } else { |
714 | TT = TrueBlock; |
715 | FT = FalseBlock; |
716 | } |
717 | IRBuilder<> IB(SI.getI()); |
718 | auto *CondFr = IB.CreateFreeze(V: SI.getCondition(), |
719 | Name: SI.getCondition()->getName() + ".frozen" ); |
720 | |
721 | SmallPtrSet<const Instruction *, 2> INS; |
722 | for (auto SI : ASI) |
723 | INS.insert(Ptr: SI.getI()); |
724 | |
    // Use a reverse iterator because a later select may use the value of an
    // earlier select, and we need to propagate the value through the earlier
    // select to get the PHI operand.
728 | for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) { |
729 | SelectLike SI = *It; |
730 | // The select itself is replaced with a PHI Node. |
731 | PHINode *PN = PHINode::Create(Ty: SI.getType(), NumReservedValues: 2, NameStr: "" ); |
732 | PN->insertBefore(InsertPos: EndBlock->begin()); |
733 | PN->takeName(V: SI.getI()); |
734 | PN->addIncoming(V: getTrueOrFalseValue(SI, isTrue: true, Selects: INS, IB), BB: TrueBlock); |
735 | PN->addIncoming(V: getTrueOrFalseValue(SI, isTrue: false, Selects: INS, IB), BB: FalseBlock); |
736 | PN->setDebugLoc(SI.getI()->getDebugLoc()); |
737 | SI.getI()->replaceAllUsesWith(V: PN); |
738 | INS.erase(Ptr: SI.getI()); |
739 | ++NumSelectsConverted; |
740 | } |
741 | IB.CreateCondBr(Cond: CondFr, True: TT, False: FT, MDSrc: SI.getI()); |
742 | |
    // Remove the old select instructions, now that they are no longer used.
744 | for (auto SI : ASI) |
745 | SI.getI()->eraseFromParent(); |
746 | } |
747 | } |
748 | |
749 | void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB, |
750 | SelectGroups &SIGroups) { |
751 | BasicBlock::iterator BBIt = BB.begin(); |
752 | while (BBIt != BB.end()) { |
753 | Instruction *I = &*BBIt++; |
754 | if (SelectLike SI = SelectLike::match(I)) { |
755 | if (!TTI->shouldTreatInstructionLikeSelect(I)) |
756 | continue; |
757 | |
758 | SelectGroup SIGroup; |
759 | SIGroup.push_back(Elt: SI); |
760 | while (BBIt != BB.end()) { |
761 | Instruction *NI = &*BBIt; |
762 | // Debug/pseudo instructions should be skipped and not prevent the |
763 | // formation of a select group. |
764 | if (NI->isDebugOrPseudoInst()) { |
765 | ++BBIt; |
766 | continue; |
767 | } |
768 | // We only allow selects in the same group, not other select-like |
769 | // instructions. |
770 | if (!isa<SelectInst>(Val: NI)) |
771 | break; |
772 | |
773 | SelectLike NSI = SelectLike::match(I: NI); |
774 | if (NSI && SI.getCondition() == NSI.getCondition()) { |
775 | SIGroup.push_back(Elt: NSI); |
776 | } else |
777 | break; |
778 | ++BBIt; |
779 | } |
780 | |
781 | // If the select type is not supported, no point optimizing it. |
782 | // Instruction selection will take care of it. |
783 | if (!isSelectKindSupported(SI)) |
784 | continue; |
785 | |
786 | SIGroups.push_back(Elt: SIGroup); |
787 | } |
788 | } |
789 | } |
790 | |
791 | void SelectOptimizeImpl::findProfitableSIGroupsBase( |
792 | SelectGroups &SIGroups, SelectGroups &ProfSIGroups) { |
793 | for (SelectGroup &ASI : SIGroups) { |
794 | ++NumSelectOptAnalyzed; |
795 | if (isConvertToBranchProfitableBase(ASI)) |
796 | ProfSIGroups.push_back(Elt: ASI); |
797 | } |
798 | } |
799 | |
800 | static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE, |
801 | DiagnosticInfoOptimizationBase &Rem) { |
802 | LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n" ); |
803 | ORE->emit(OptDiag&: Rem); |
804 | } |
805 | |
806 | void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops( |
807 | const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) { |
808 | NumSelectOptAnalyzed += SIGroups.size(); |
809 | // For each select group in an inner-most loop, |
  // a branch is preferable to a select/conditional-move if:
811 | // i) conversion to branches for all the select groups of the loop satisfies |
812 | // loop-level heuristics including reducing the loop's critical path by |
813 | // some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and |
814 | // ii) the total cost of the select group is cheaper with a branch compared |
815 | // to its predicated version. The cost is in terms of latency and the cost |
816 | // of a select group is the cost of its most expensive select instruction |
817 | // (assuming infinite resources and thus fully leveraging available ILP). |
818 | |
819 | DenseMap<const Instruction *, CostInfo> InstCostMap; |
  CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
                          {Scaled64::getZero(), Scaled64::getZero()}};
822 | if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) || |
823 | !checkLoopHeuristics(L, LoopDepth: LoopCost)) { |
824 | return; |
825 | } |
826 | |
827 | for (SelectGroup &ASI : SIGroups) { |
828 | // Assuming infinite resources, the cost of a group of instructions is the |
829 | // cost of the most expensive instruction of the group. |
830 | Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero(); |
831 | for (SelectLike SI : ASI) { |
832 | SelectCost = std::max(a: SelectCost, b: InstCostMap[SI.getI()].PredCost); |
833 | BranchCost = std::max(a: BranchCost, b: InstCostMap[SI.getI()].NonPredCost); |
834 | } |
835 | if (BranchCost < SelectCost) { |
836 | OptimizationRemark OR(DEBUG_TYPE, "SelectOpti" , ASI.front().getI()); |
837 | OR << "Profitable to convert to branch (loop analysis). BranchCost=" |
838 | << BranchCost.toString() << ", SelectCost=" << SelectCost.toString() |
839 | << ". " ; |
840 | EmitAndPrintRemark(ORE, Rem&: OR); |
841 | ++NumSelectConvertedLoop; |
842 | ProfSIGroups.push_back(Elt: ASI); |
843 | } else { |
844 | OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti" , |
845 | ASI.front().getI()); |
846 | ORmiss << "Select is more profitable (loop analysis). BranchCost=" |
847 | << BranchCost.toString() |
848 | << ", SelectCost=" << SelectCost.toString() << ". " ; |
849 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
850 | } |
851 | } |
852 | } |
853 | |
854 | bool SelectOptimizeImpl::isConvertToBranchProfitableBase( |
855 | const SelectGroup &ASI) { |
856 | SelectLike SI = ASI.front(); |
857 | LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI.getI() |
858 | << "\n" ); |
859 | OptimizationRemark OR(DEBUG_TYPE, "SelectOpti" , SI.getI()); |
860 | OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti" , SI.getI()); |
861 | |
862 | // Skip cold basic blocks. Better to optimize for size for cold blocks. |
863 | if (PSI->isColdBlock(BB: SI.getI()->getParent(), BFI)) { |
864 | ++NumSelectColdBB; |
865 | ORmiss << "Not converted to branch because of cold basic block. " ; |
866 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
867 | return false; |
868 | } |
869 | |
870 | // If unpredictable, branch form is less profitable. |
871 | if (SI.getI()->getMetadata(KindID: LLVMContext::MD_unpredictable)) { |
872 | ++NumSelectUnPred; |
873 | ORmiss << "Not converted to branch because of unpredictable branch. " ; |
874 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
875 | return false; |
876 | } |
877 | |
878 | // If highly predictable, branch form is more profitable, unless a |
879 | // predictable select is inexpensive in the target architecture. |
880 | if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) { |
881 | ++NumSelectConvertedHighPred; |
882 | OR << "Converted to branch because of highly predictable branch. " ; |
883 | EmitAndPrintRemark(ORE, Rem&: OR); |
884 | return true; |
885 | } |
886 | |
887 | // Look for expensive instructions in the cold operand's (if any) dependence |
888 | // slice of any of the selects in the group. |
889 | if (hasExpensiveColdOperand(ASI)) { |
890 | ++NumSelectConvertedExpColdOperand; |
891 | OR << "Converted to branch because of expensive cold operand." ; |
892 | EmitAndPrintRemark(ORE, Rem&: OR); |
893 | return true; |
894 | } |
895 | |
896 | ORmiss << "Not profitable to convert to branch (base heuristic)." ; |
897 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
898 | return false; |
899 | } |
900 | |
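// Round-to-nearest integer division; e.g. divideNearest(7, 4) == 2 (since
// 7 / 4 = 1.75 rounds up to 2), whereas plain integer division truncates to 1.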
901 | static InstructionCost divideNearest(InstructionCost Numerator, |
902 | uint64_t Denominator) { |
903 | return (Numerator + (Denominator / 2)) / Denominator; |
904 | } |
905 | |
static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI,
                                 uint64_t &TrueVal, uint64_t &FalseVal) {
  if (isa<SelectInst>(SI.getI()))
    return extractBranchWeights(*SI.getI(), TrueVal, FalseVal);
  return false;
}
912 | |
bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) {
  bool ColdOperand = false;
  uint64_t TrueWeight, FalseWeight, TotalWeight;
  if (extractBranchWeights(ASI.front(), TrueWeight, FalseWeight)) {
    uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
    TotalWeight = TrueWeight + FalseWeight;
    // Is there a path with frequency < ColdOperandThreshold% (default: 20%)?
    ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
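    // For example, with the default 20% threshold, weights {TrueWeight = 90,
    // FalseWeight = 10} give 100 * 20 > 100 * 10, so the false path is
    // considered cold.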
921 | } else if (PSI->hasProfileSummary()) { |
922 | OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti" , |
923 | ASI.front().getI()); |
924 | ORmiss << "Profile data available but missing branch-weights metadata for " |
925 | "select instruction. " ; |
926 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
927 | } |
928 | if (!ColdOperand) |
929 | return false; |
930 | // Check if the cold path's dependence slice is expensive for any of the |
931 | // selects of the group. |
932 | for (SelectLike SI : ASI) { |
933 | Instruction *ColdI = nullptr; |
934 | uint64_t HotWeight; |
935 | if (TrueWeight < FalseWeight) { |
936 | ColdI = dyn_cast_or_null<Instruction>(Val: SI.getTrueValue()); |
937 | HotWeight = FalseWeight; |
938 | } else { |
939 | ColdI = dyn_cast_or_null<Instruction>(Val: SI.getFalseValue()); |
940 | HotWeight = TrueWeight; |
941 | } |
942 | if (ColdI) { |
943 | std::stack<Instruction *> ColdSlice; |
944 | getExclBackwardsSlice(I: ColdI, Slice&: ColdSlice, SI: SI.getI()); |
945 | InstructionCost SliceCost = 0; |
946 | while (!ColdSlice.empty()) { |
947 | SliceCost += TTI->getInstructionCost(U: ColdSlice.top(), |
948 | CostKind: TargetTransformInfo::TCK_Latency); |
949 | ColdSlice.pop(); |
950 | } |
      // The colder the cold value operand of the select is, the more expensive
      // it becomes to unconditionally compute that operand with a cmov every
      // time. Thus, the colder the cold operand is, the more its cost counts.
      // Get the nearest integer cost, adjusted for coldness.
955 | InstructionCost AdjSliceCost = |
956 | divideNearest(Numerator: SliceCost * HotWeight, Denominator: TotalWeight); |
957 | if (AdjSliceCost >= |
958 | ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive) |
959 | return true; |
960 | } |
961 | } |
962 | return false; |
963 | } |
964 | |
965 | // Check if it is safe to move LoadI next to the SI. |
966 | // Conservatively assume it is safe only if there is no instruction |
967 | // modifying memory in-between the load and the select instruction. |
968 | static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) { |
969 | // Assume loads from different basic blocks are unsafe to move. |
970 | if (LoadI->getParent() != SI->getParent()) |
971 | return false; |
972 | auto It = LoadI->getIterator(); |
973 | while (&*It != SI) { |
974 | if (It->mayWriteToMemory()) |
975 | return false; |
976 | It++; |
977 | } |
978 | return true; |
979 | } |
980 | |
981 | // For a given source instruction, collect its backwards dependence slice |
982 | // consisting of instructions exclusively computed for the purpose of producing |
// the operands of the source instruction. As an approximation (sufficiently
// accurate in practice), we populate this set with the instructions of the
// backwards dependence slice that have only one use and form a one-use chain
// leading to the source instruction.
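// For example, if the true operand of a select is `%t = add i32 %x, 1`, where
// %t is used only by the select and `%x = load i32, ptr %p` is used only by
// %t, the collected slice for %t is {%t, %x} (modulo the extra safety checks
// applied when ForSinking is set).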
987 | void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I, |
988 | std::stack<Instruction *> &Slice, |
989 | Instruction *SI, |
990 | bool ForSinking) { |
991 | SmallPtrSet<Instruction *, 2> Visited; |
992 | std::queue<Instruction *> Worklist; |
993 | Worklist.push(x: I); |
994 | while (!Worklist.empty()) { |
995 | Instruction *II = Worklist.front(); |
996 | Worklist.pop(); |
997 | |
998 | // Avoid cycles. |
999 | if (!Visited.insert(Ptr: II).second) |
1000 | continue; |
1001 | |
1002 | if (!II->hasOneUse()) |
1003 | continue; |
1004 | |
    // Cannot soundly sink instructions with side-effects.
    // Terminator or phi instructions cannot be sunk.
    // Avoid sinking other select instructions (should be handled separately).
1008 | if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() || |
1009 | isa<SelectInst>(Val: II) || isa<PHINode>(Val: II))) |
1010 | continue; |
1011 | |
    // Avoid sinking loads in order not to skip state-modifying instructions
    // that may alias with the loaded address.
    // Only allow sinking of loads within the same basic block that are
    // conservatively proven to be safe.
1016 | if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(LoadI: II, SI)) |
1017 | continue; |
1018 | |
1019 | // Avoid considering instructions with less frequency than the source |
1020 | // instruction (i.e., avoid colder code regions of the dependence slice). |
1021 | if (BFI->getBlockFreq(BB: II->getParent()) < BFI->getBlockFreq(BB: I->getParent())) |
1022 | continue; |
1023 | |
1024 | // Eligible one-use instruction added to the dependence slice. |
1025 | Slice.push(x: II); |
1026 | |
1027 | // Explore all the operands of the current instruction to expand the slice. |
1028 | for (unsigned k = 0; k < II->getNumOperands(); ++k) |
1029 | if (auto *OpI = dyn_cast<Instruction>(Val: II->getOperand(i: k))) |
1030 | Worklist.push(x: OpI); |
1031 | } |
1032 | } |
1033 | |
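// A select is considered highly predictable when its branch weights are very
// lopsided; e.g. weights {1000, 1} give a probability of roughly 99.9%, above
// the predictable-branch threshold (typically 99%) reported by TTI.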
1034 | bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) { |
1035 | uint64_t TrueWeight, FalseWeight; |
1036 | if (extractBranchWeights(SI, TrueVal&: TrueWeight, FalseVal&: FalseWeight)) { |
1037 | uint64_t Max = std::max(a: TrueWeight, b: FalseWeight); |
1038 | uint64_t Sum = TrueWeight + FalseWeight; |
1039 | if (Sum != 0) { |
1040 | auto Probability = BranchProbability::getBranchProbability(Numerator: Max, Denominator: Sum); |
1041 | if (Probability > TTI->getPredictableBranchThreshold()) |
1042 | return true; |
1043 | } |
1044 | } |
1045 | return false; |
1046 | } |
1047 | |
1048 | bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L, |
1049 | const CostInfo LoopCost[2]) { |
1050 | // Loop-level checks to determine if a non-predicated version (with branches) |
1051 | // of the loop is more profitable than its predicated version. |
1052 | |
1053 | if (DisableLoopLevelHeuristics) |
1054 | return true; |
1055 | |
1056 | OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti" , |
1057 | L->getHeader()->getFirstNonPHI()); |
1058 | |
1059 | if (LoopCost[0].NonPredCost > LoopCost[0].PredCost || |
1060 | LoopCost[1].NonPredCost >= LoopCost[1].PredCost) { |
1061 | ORmissL << "No select conversion in the loop due to no reduction of loop's " |
1062 | "critical path. " ; |
1063 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1064 | return false; |
1065 | } |
1066 | |
1067 | Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost, |
1068 | LoopCost[1].PredCost - LoopCost[1].NonPredCost}; |
1069 | |
  // For converting to branches to be profitable, it needs to reduce the loop's
  // critical path by at least some threshold (an absolute gain of
  // GainCycleThreshold cycles and a relative gain of 12.5% by default).
1073 | if (Gain[1] < Scaled64::get(N: GainCycleThreshold) || |
1074 | Gain[1] * Scaled64::get(N: GainRelativeThreshold) < LoopCost[1].PredCost) { |
1075 | Scaled64 RelativeGain = Scaled64::get(N: 100) * Gain[1] / LoopCost[1].PredCost; |
1076 | ORmissL << "No select conversion in the loop due to small reduction of " |
1077 | "loop's critical path. Gain=" |
1078 | << Gain[1].toString() |
1079 | << ", RelativeGain=" << RelativeGain.toString() << "%. " ; |
1080 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1081 | return false; |
1082 | } |
1083 | |
1084 | // If the loop's critical path involves loop-carried dependences, the gradient |
1085 | // of the gain needs to be at least GainGradientThreshold% (defaults to 25%). |
1086 | // This check ensures that the latency reduction for the loop's critical path |
1087 | // keeps decreasing with sufficient rate beyond the two analyzed loop |
1088 | // iterations. |
1089 | if (Gain[1] > Gain[0]) { |
1090 | Scaled64 GradientGain = Scaled64::get(N: 100) * (Gain[1] - Gain[0]) / |
1091 | (LoopCost[1].PredCost - LoopCost[0].PredCost); |
1092 | if (GradientGain < Scaled64::get(N: GainGradientThreshold)) { |
1093 | ORmissL << "No select conversion in the loop due to small gradient gain. " |
1094 | "GradientGain=" |
1095 | << GradientGain.toString() << "%. " ; |
1096 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1097 | return false; |
1098 | } |
1099 | } |
1100 | // If the gain decreases it is not profitable to convert. |
1101 | else if (Gain[1] < Gain[0]) { |
1102 | ORmissL |
1103 | << "No select conversion in the loop due to negative gradient gain. " ; |
1104 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1105 | return false; |
1106 | } |
1107 | |
1108 | // Non-predicated version of the loop is more profitable than its |
1109 | // predicated version. |
1110 | return true; |
1111 | } |
1112 | |
1113 | // Computes instruction and loop-critical-path costs for both the predicated |
1114 | // and non-predicated version of the given loop. |
1115 | // Returns false if unable to compute these costs due to invalid cost of loop |
1116 | // instruction(s). |
1117 | bool SelectOptimizeImpl::computeLoopCosts( |
1118 | const Loop *L, const SelectGroups &SIGroups, |
1119 | DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) { |
1120 | LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop " |
1121 | << L->getHeader()->getName() << "\n" ); |
1122 | const auto &SImap = getSImap(SIGroups); |
1123 | // Compute instruction and loop-critical-path costs across two iterations for |
1124 | // both predicated and non-predicated version. |
1125 | const unsigned Iterations = 2; |
1126 | for (unsigned Iter = 0; Iter < Iterations; ++Iter) { |
1127 | // Cost of the loop's critical path. |
1128 | CostInfo &MaxCost = LoopCost[Iter]; |
1129 | for (BasicBlock *BB : L->getBlocks()) { |
1130 | for (const Instruction &I : *BB) { |
1131 | if (I.isDebugOrPseudoInst()) |
1132 | continue; |
1133 | // Compute the predicated and non-predicated cost of the instruction. |
1134 | Scaled64 IPredCost = Scaled64::getZero(), |
1135 | INonPredCost = Scaled64::getZero(); |
1136 | |
1137 | // Assume infinite resources that allow to fully exploit the available |
1138 | // instruction-level parallelism. |
1139 | // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost) |
1140 | for (const Use &U : I.operands()) { |
1141 | auto UI = dyn_cast<Instruction>(Val: U.get()); |
1142 | if (!UI) |
1143 | continue; |
1144 | if (InstCostMap.count(Val: UI)) { |
1145 | IPredCost = std::max(a: IPredCost, b: InstCostMap[UI].PredCost); |
1146 | INonPredCost = std::max(a: INonPredCost, b: InstCostMap[UI].NonPredCost); |
1147 | } |
1148 | } |
1149 | auto ILatency = computeInstCost(I: &I); |
1150 | if (!ILatency) { |
1151 | OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti" , &I); |
1152 | ORmissL << "Invalid instruction cost preventing analysis and " |
1153 | "optimization of the inner-most loop containing this " |
1154 | "instruction. " ; |
1155 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1156 | return false; |
1157 | } |
1158 | IPredCost += Scaled64::get(N: *ILatency); |
1159 | INonPredCost += Scaled64::get(N: *ILatency); |
1160 | |
1161 | // For a select that can be converted to branch, |
1162 | // compute its cost as a branch (non-predicated cost). |
1163 | // |
1164 | // BranchCost = PredictedPathCost + MispredictCost |
1165 | // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb |
1166 | // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate |
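        // For example, with an even 50/50 weight split, TrueOpCost = 4,
        // FalseOpCost = 2, a 14-cycle misprediction penalty, a cheap condition
        // and the default 25% misprediction rate, this would give
        // BranchCost = (4 + 2) / 2 + 14 * 0.25 = 6.5 cycles.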
1167 | if (SImap.contains(Val: &I)) { |
1168 | auto SI = SImap.at(Val: &I); |
1169 | Scaled64 TrueOpCost = SI.getTrueOpCost(InstCostMap, TTI); |
1170 | Scaled64 FalseOpCost = SI.getFalseOpCost(InstCostMap, TTI); |
1171 | Scaled64 PredictedPathCost = |
1172 | getPredictedPathCost(TrueCost: TrueOpCost, FalseCost: FalseOpCost, SI); |
1173 | |
1174 | Scaled64 CondCost = Scaled64::getZero(); |
1175 | if (auto *CI = dyn_cast<Instruction>(Val: SI.getCondition())) |
1176 | if (InstCostMap.count(Val: CI)) |
1177 | CondCost = InstCostMap[CI].NonPredCost; |
1178 | Scaled64 MispredictCost = getMispredictionCost(SI, CondCost); |
1179 | |
1180 | INonPredCost = PredictedPathCost + MispredictCost; |
1181 | } |
1182 | LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/" |
1183 | << INonPredCost << " for " << I << "\n" ); |
1184 | |
        InstCostMap[&I] = {IPredCost, INonPredCost};
1186 | MaxCost.PredCost = std::max(a: MaxCost.PredCost, b: IPredCost); |
1187 | MaxCost.NonPredCost = std::max(a: MaxCost.NonPredCost, b: INonPredCost); |
1188 | } |
1189 | } |
1190 | LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1 |
1191 | << " MaxCost = " << MaxCost.PredCost << " " |
1192 | << MaxCost.NonPredCost << "\n" ); |
1193 | } |
1194 | return true; |
1195 | } |
1196 | |
1197 | SmallDenseMap<const Instruction *, SelectOptimizeImpl::SelectLike, 2> |
1198 | SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) { |
1199 | SmallDenseMap<const Instruction *, SelectLike, 2> SImap; |
1200 | for (const SelectGroup &ASI : SIGroups) |
1201 | for (SelectLike SI : ASI) |
1202 | SImap.try_emplace(Key: SI.getI(), Args&: SI); |
1203 | return SImap; |
1204 | } |
1205 | |
1206 | std::optional<uint64_t> |
1207 | SelectOptimizeImpl::computeInstCost(const Instruction *I) { |
1208 | InstructionCost ICost = |
1209 | TTI->getInstructionCost(U: I, CostKind: TargetTransformInfo::TCK_Latency); |
1210 | if (auto OC = ICost.getValue()) |
1211 | return std::optional<uint64_t>(*OC); |
1212 | return std::nullopt; |
1213 | } |
1214 | |
1215 | ScaledNumber<uint64_t> |
1216 | SelectOptimizeImpl::getMispredictionCost(const SelectLike SI, |
1217 | const Scaled64 CondCost) { |
1218 | uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty; |
1219 | |
1220 | // Account for the default misprediction rate when using a branch |
1221 | // (conservatively set to 25% by default). |
1222 | uint64_t MispredictRate = MispredictDefaultRate; |
1223 | // If the select condition is obviously predictable, then the misprediction |
1224 | // rate is zero. |
1225 | if (isSelectHighlyPredictable(SI)) |
1226 | MispredictRate = 0; |
1227 | |
1228 | // CondCost is included to account for cases where the computation of the |
1229 | // condition is part of a long dependence chain (potentially loop-carried) |
1230 | // that would delay detection of a misprediction and increase its cost. |
1231 | Scaled64 MispredictCost = |
1232 | std::max(a: Scaled64::get(N: MispredictPenalty), b: CondCost) * |
1233 | Scaled64::get(N: MispredictRate); |
1234 | MispredictCost /= Scaled64::get(N: 100); |
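  // For example, a 14-cycle misprediction penalty with the default 25% rate
  // and an inexpensive condition yields 14 * 25 / 100 = 3.5 cycles.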
1235 | |
1236 | return MispredictCost; |
1237 | } |
1238 | |
1239 | // Returns the cost of a branch when the prediction is correct. |
1240 | // TrueCost * TrueProbability + FalseCost * FalseProbability. |
1241 | ScaledNumber<uint64_t> |
1242 | SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, |
1243 | const SelectLike SI) { |
1244 | Scaled64 PredPathCost; |
1245 | uint64_t TrueWeight, FalseWeight; |
1246 | if (extractBranchWeights(SI, TrueVal&: TrueWeight, FalseVal&: FalseWeight)) { |
1247 | uint64_t SumWeight = TrueWeight + FalseWeight; |
1248 | if (SumWeight != 0) { |
1249 | PredPathCost = TrueCost * Scaled64::get(N: TrueWeight) + |
1250 | FalseCost * Scaled64::get(N: FalseWeight); |
1251 | PredPathCost /= Scaled64::get(N: SumWeight); |
1252 | return PredPathCost; |
1253 | } |
1254 | } |
  // Without branch-weight metadata, we assume a 75%/25% split between the two
  // paths and pick the larger resulting cost.
1257 | PredPathCost = std::max(a: TrueCost * Scaled64::get(N: 3) + FalseCost, |
1258 | b: FalseCost * Scaled64::get(N: 3) + TrueCost); |
1259 | PredPathCost /= Scaled64::get(N: 4); |
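  // For example, TrueCost = 4 and FalseCost = 2 give
  // max(4 * 3 + 2, 2 * 3 + 4) / 4 = 14 / 4 = 3.5.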
1260 | return PredPathCost; |
1261 | } |
1262 | |
1263 | bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) { |
1264 | bool VectorCond = !SI.getCondition()->getType()->isIntegerTy(Bitwidth: 1); |
1265 | if (VectorCond) |
1266 | return false; |
1267 | TargetLowering::SelectSupportKind SelectKind; |
1268 | if (SI.getType()->isVectorTy()) |
1269 | SelectKind = TargetLowering::ScalarCondVectorVal; |
1270 | else |
1271 | SelectKind = TargetLowering::ScalarValSelect; |
1272 | return TLI->isSelectSupported(SelectKind); |
1273 | } |
1274 | |