1 | //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass converts selects to conditional jumps when profitable. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/CodeGen/SelectOptimize.h" |
14 | #include "llvm/ADT/SmallVector.h" |
15 | #include "llvm/ADT/Statistic.h" |
16 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
17 | #include "llvm/Analysis/BranchProbabilityInfo.h" |
18 | #include "llvm/Analysis/LoopInfo.h" |
19 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
20 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
21 | #include "llvm/Analysis/TargetTransformInfo.h" |
22 | #include "llvm/CodeGen/Passes.h" |
23 | #include "llvm/CodeGen/TargetLowering.h" |
24 | #include "llvm/CodeGen/TargetPassConfig.h" |
25 | #include "llvm/CodeGen/TargetSchedule.h" |
26 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
27 | #include "llvm/IR/BasicBlock.h" |
28 | #include "llvm/IR/Dominators.h" |
29 | #include "llvm/IR/Function.h" |
30 | #include "llvm/IR/IRBuilder.h" |
31 | #include "llvm/IR/Instruction.h" |
32 | #include "llvm/IR/PatternMatch.h" |
33 | #include "llvm/IR/ProfDataUtils.h" |
34 | #include "llvm/InitializePasses.h" |
35 | #include "llvm/Pass.h" |
36 | #include "llvm/Support/ScaledNumber.h" |
37 | #include "llvm/Target/TargetMachine.h" |
38 | #include "llvm/Transforms/Utils/SizeOpts.h" |
39 | #include <algorithm> |
40 | #include <memory> |
41 | #include <queue> |
42 | #include <stack> |
43 | |
44 | using namespace llvm; |
45 | using namespace llvm::PatternMatch; |
46 | |
47 | #define DEBUG_TYPE "select-optimize" |
48 | |
49 | STATISTIC(NumSelectOptAnalyzed, |
50 | "Number of select groups considered for conversion to branch" ); |
51 | STATISTIC(NumSelectConvertedExpColdOperand, |
52 | "Number of select groups converted due to expensive cold operand" ); |
53 | STATISTIC(NumSelectConvertedHighPred, |
54 | "Number of select groups converted due to high-predictability" ); |
55 | STATISTIC(NumSelectUnPred, |
56 | "Number of select groups not converted due to unpredictability" ); |
57 | STATISTIC(NumSelectColdBB, |
58 | "Number of select groups not converted due to cold basic block" ); |
59 | STATISTIC(NumSelectConvertedLoop, |
60 | "Number of select groups converted due to loop-level analysis" ); |
61 | STATISTIC(NumSelectsConverted, "Number of selects converted" ); |
62 | |
static cl::opt<unsigned> ColdOperandThreshold(
    "cold-operand-threshold",
    cl::desc("Maximum frequency of path for an operand to be considered cold."),
    cl::init(20), cl::Hidden);

static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
    "cold-operand-max-cost-multiplier",
    cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
             "slice of a cold operand to be considered inexpensive."),
    cl::init(1), cl::Hidden);

static cl::opt<unsigned>
    GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
                          cl::desc("Gradient gain threshold (%)."),
                          cl::init(25), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<unsigned> GainRelativeThreshold(
    "select-opti-loop-relative-gain-threshold",
    cl::desc(
        "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
    cl::init(8), cl::Hidden);

static cl::opt<unsigned> MispredictDefaultRate(
    "mispredict-default-rate", cl::Hidden, cl::init(25),
    cl::desc("Default mispredict rate (initialized to 25%)."));

static cl::opt<bool>
    DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
                               cl::init(false),
                               cl::desc("Disable loop-level heuristics."));
98 | |
99 | namespace { |
100 | |
101 | class SelectOptimizeImpl { |
102 | const TargetMachine *TM = nullptr; |
103 | const TargetSubtargetInfo *TSI = nullptr; |
104 | const TargetLowering *TLI = nullptr; |
105 | const TargetTransformInfo *TTI = nullptr; |
106 | const LoopInfo *LI = nullptr; |
107 | BlockFrequencyInfo *BFI; |
108 | ProfileSummaryInfo *PSI = nullptr; |
109 | OptimizationRemarkEmitter *ORE = nullptr; |
110 | TargetSchedModel TSchedModel; |
111 | |
112 | public: |
113 | SelectOptimizeImpl() = default; |
114 | SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){}; |
115 | PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); |
116 | bool runOnFunction(Function &F, Pass &P); |
117 | |
118 | using Scaled64 = ScaledNumber<uint64_t>; |
119 | |
120 | struct CostInfo { |
121 | /// Predicated cost (with selects as conditional moves). |
122 | Scaled64 PredCost; |
123 | /// Non-predicated cost (with selects converted to branches). |
124 | Scaled64 NonPredCost; |
125 | }; |
126 | |
127 | /// SelectLike is an abstraction over SelectInst and other operations that can |
128 | /// act like selects. For example Or(Zext(icmp), X) can be treated like |
129 | /// select(icmp, X|1, X). |
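  /// For illustration, an IR sequence such as
  ///   %c = icmp eq i32 %a, 0
  ///   %z = zext i1 %c to i32
  ///   %r = or i32 %z, %x
  /// acts like a select on %c between the values %x|1 and %x, so it can be
  /// analyzed with the same machinery as a real select instruction.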
130 | class SelectLike { |
131 | SelectLike(Instruction *I) : I(I) {} |
132 | |
133 | Instruction *I; |
134 | |
135 | public: |
136 | /// Match a select or select-like instruction, returning a SelectLike. |
137 | static SelectLike match(Instruction *I) { |
      // Select instructions are what we are usually looking for.
      if (isa<SelectInst>(I))
        return SelectLike(I);

      // An Or(zext(i1 X), Y) can also be treated like a select, with
      // condition X and values Y|1 and Y.
      Value *X;
      if (PatternMatch::match(
              I, m_c_Or(m_OneUse(m_ZExt(m_Value(X))), m_Value())) &&
          X->getType()->isIntegerTy(1))
        return SelectLike(I);
149 | |
150 | return SelectLike(nullptr); |
151 | } |
152 | |
153 | bool isValid() { return I; } |
154 | operator bool() { return isValid(); } |
155 | |
156 | Instruction *getI() { return I; } |
157 | const Instruction *getI() const { return I; } |
158 | |
159 | Type *getType() const { return I->getType(); } |
160 | |
161 | /// Return the condition for the SelectLike instruction. For example the |
162 | /// condition of a select or c in `or(zext(c), x)` |
163 | Value *getCondition() const { |
164 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
165 | return Sel->getCondition(); |
166 | // Or(zext) case |
167 | if (auto *BO = dyn_cast<BinaryOperator>(Val: I)) { |
168 | Value *X; |
169 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 0), |
170 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
171 | return X; |
172 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 1), |
173 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
174 | return X; |
175 | } |
176 | |
177 | llvm_unreachable("Unhandled case in getCondition" ); |
178 | } |
179 | |
180 | /// Return the true value for the SelectLike instruction. Note this may not |
181 | /// exist for all SelectLike instructions. For example, for `or(zext(c), x)` |
182 | /// the true value would be `or(x,1)`. As this value does not exist, nullptr |
183 | /// is returned. |
184 | Value *getTrueValue() const { |
185 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
186 | return Sel->getTrueValue(); |
      // Or(zext) case - the true value would be the non-zext operand or'd
      // with 1; that value does not exist yet, so return nullptr.
189 | if (isa<BinaryOperator>(Val: I)) |
190 | return nullptr; |
191 | |
192 | llvm_unreachable("Unhandled case in getTrueValue" ); |
193 | } |
194 | |
195 | /// Return the false value for the SelectLike instruction. For example the |
196 | /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is |
197 | /// `select(c, x|1, x)`) |
198 | Value *getFalseValue() const { |
199 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
200 | return Sel->getFalseValue(); |
201 | // Or(zext) case - return the operand which is not the zext. |
202 | if (auto *BO = dyn_cast<BinaryOperator>(Val: I)) { |
203 | Value *X; |
204 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 0), |
205 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
206 | return BO->getOperand(i_nocapture: 1); |
207 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 1), |
208 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
209 | return BO->getOperand(i_nocapture: 0); |
210 | } |
211 | |
212 | llvm_unreachable("Unhandled case in getFalseValue" ); |
213 | } |
214 | |
215 | /// Return the NonPredCost cost of the true op, given the costs in |
216 | /// InstCostMap. This may need to be generated for select-like instructions. |
217 | Scaled64 getTrueOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap, |
218 | const TargetTransformInfo *TTI) { |
219 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
220 | if (auto *I = dyn_cast<Instruction>(Val: Sel->getTrueValue())) |
221 | return InstCostMap.contains(Val: I) ? InstCostMap[I].NonPredCost |
222 | : Scaled64::getZero(); |
223 | |
224 | // Or case - add the cost of an extra Or to the cost of the False case. |
225 | if (isa<BinaryOperator>(Val: I)) |
226 | if (auto I = dyn_cast<Instruction>(Val: getFalseValue())) |
227 | if (InstCostMap.contains(Val: I)) { |
            InstructionCost OrCost = TTI->getArithmeticInstrCost(
                Instruction::Or, I->getType(),
                TargetTransformInfo::TCK_Latency,
                {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
                {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2});
            return InstCostMap[I].NonPredCost +
                   Scaled64::get(*OrCost.getValue());
235 | } |
236 | |
237 | return Scaled64::getZero(); |
238 | } |
239 | |
240 | /// Return the NonPredCost cost of the false op, given the costs in |
241 | /// InstCostMap. This may need to be generated for select-like instructions. |
242 | Scaled64 |
243 | getFalseOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap, |
244 | const TargetTransformInfo *TTI) { |
245 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
246 | if (auto *I = dyn_cast<Instruction>(Val: Sel->getFalseValue())) |
247 | return InstCostMap.contains(Val: I) ? InstCostMap[I].NonPredCost |
248 | : Scaled64::getZero(); |
249 | |
250 | // Or case - return the cost of the false case |
251 | if (isa<BinaryOperator>(Val: I)) |
252 | if (auto I = dyn_cast<Instruction>(Val: getFalseValue())) |
253 | if (InstCostMap.contains(Val: I)) |
254 | return InstCostMap[I].NonPredCost; |
255 | |
256 | return Scaled64::getZero(); |
257 | } |
258 | }; |
259 | |
260 | private: |
261 | // Select groups consist of consecutive select instructions with the same |
262 | // condition. |
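  // For example, two consecutive selects sharing the condition %cond, such as
  //   %x = select i1 %cond, i32 %a, i32 %b
  //   %y = select i1 %cond, i64 %c, i64 %d
  // form a single group and are converted (or kept) together.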
263 | using SelectGroup = SmallVector<SelectLike, 2>; |
264 | using SelectGroups = SmallVector<SelectGroup, 2>; |
265 | |
266 | // Converts select instructions of a function to conditional jumps when deemed |
267 | // profitable. Returns true if at least one select was converted. |
268 | bool optimizeSelects(Function &F); |
269 | |
  // Heuristics for determining which select instructions can be profitably
  // converted to branches. Separate heuristics for selects in inner-most loops
  // and the rest of the code regions (base heuristics for non-inner-most loop
  // regions).
274 | void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups); |
275 | void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups); |
276 | |
277 | // Converts to branches the select groups that were deemed |
278 | // profitable-to-convert. |
279 | void convertProfitableSIGroups(SelectGroups &ProfSIGroups); |
280 | |
281 | // Splits selects of a given basic block into select groups. |
282 | void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups); |
283 | |
  // Determines which select groups are profitable to convert to branches
  // (base and inner-most-loop heuristics).
286 | void findProfitableSIGroupsBase(SelectGroups &SIGroups, |
287 | SelectGroups &ProfSIGroups); |
288 | void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups, |
289 | SelectGroups &ProfSIGroups); |
290 | |
291 | // Determines if a select group should be converted to a branch (base |
292 | // heuristics). |
293 | bool isConvertToBranchProfitableBase(const SelectGroup &ASI); |
294 | |
295 | // Returns true if there are expensive instructions in the cold value |
296 | // operand's (if any) dependence slice of any of the selects of the given |
297 | // group. |
298 | bool hasExpensiveColdOperand(const SelectGroup &ASI); |
299 | |
300 | // For a given source instruction, collect its backwards dependence slice |
301 | // consisting of instructions exclusively computed for producing the operands |
302 | // of the source instruction. |
303 | void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice, |
304 | Instruction *SI, bool ForSinking = false); |
305 | |
306 | // Returns true if the condition of the select is highly predictable. |
307 | bool isSelectHighlyPredictable(const SelectLike SI); |
308 | |
309 | // Loop-level checks to determine if a non-predicated version (with branches) |
310 | // of the given loop is more profitable than its predicated version. |
311 | bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]); |
312 | |
313 | // Computes instruction and loop-critical-path costs for both the predicated |
314 | // and non-predicated version of the given loop. |
315 | bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups, |
316 | DenseMap<const Instruction *, CostInfo> &InstCostMap, |
317 | CostInfo *LoopCost); |
318 | |
319 | // Returns a set of all the select instructions in the given select groups. |
320 | SmallDenseMap<const Instruction *, SelectLike, 2> |
321 | getSImap(const SelectGroups &SIGroups); |
322 | |
323 | // Returns the latency cost of a given instruction. |
324 | std::optional<uint64_t> computeInstCost(const Instruction *I); |
325 | |
326 | // Returns the misprediction cost of a given select when converted to branch. |
327 | Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost); |
328 | |
329 | // Returns the cost of a branch when the prediction is correct. |
330 | Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, |
331 | const SelectLike SI); |
332 | |
333 | // Returns true if the target architecture supports lowering a given select. |
334 | bool isSelectKindSupported(const SelectLike SI); |
335 | }; |
336 | |
337 | class SelectOptimize : public FunctionPass { |
338 | SelectOptimizeImpl Impl; |
339 | |
340 | public: |
341 | static char ID; |
342 | |
343 | SelectOptimize() : FunctionPass(ID) { |
344 | initializeSelectOptimizePass(*PassRegistry::getPassRegistry()); |
345 | } |
346 | |
347 | bool runOnFunction(Function &F) override { |
348 | return Impl.runOnFunction(F, P&: *this); |
349 | } |
350 | |
351 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
352 | AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
353 | AU.addRequired<TargetPassConfig>(); |
354 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
355 | AU.addRequired<LoopInfoWrapperPass>(); |
356 | AU.addRequired<BlockFrequencyInfoWrapperPass>(); |
357 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
358 | } |
359 | }; |
360 | |
361 | } // namespace |
362 | |
363 | PreservedAnalyses SelectOptimizePass::run(Function &F, |
364 | FunctionAnalysisManager &FAM) { |
365 | SelectOptimizeImpl Impl(TM); |
366 | return Impl.run(F, FAM); |
367 | } |
368 | |
369 | char SelectOptimize::ID = 0; |
370 | |
371 | INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects" , false, |
372 | false) |
373 | INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) |
374 | INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) |
375 | INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) |
376 | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) |
377 | INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) |
378 | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) |
379 | INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects" , false, |
380 | false) |
381 | |
382 | FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); } |
383 | |
384 | PreservedAnalyses SelectOptimizeImpl::run(Function &F, |
385 | FunctionAnalysisManager &FAM) { |
386 | TSI = TM->getSubtargetImpl(F); |
387 | TLI = TSI->getTargetLowering(); |
388 | |
389 | // If none of the select types are supported then skip this pass. |
390 | // This is an optimization pass. Legality issues will be handled by |
391 | // instruction selection. |
392 | if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) && |
393 | !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) && |
394 | !TLI->isSelectSupported(TargetLowering::VectorMaskSelect)) |
395 | return PreservedAnalyses::all(); |
396 | |
397 | TTI = &FAM.getResult<TargetIRAnalysis>(IR&: F); |
398 | if (!TTI->enableSelectOptimize()) |
399 | return PreservedAnalyses::all(); |
400 | |
401 | PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F) |
402 | .getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
403 | assert(PSI && "This pass requires module analysis pass `profile-summary`!" ); |
404 | BFI = &FAM.getResult<BlockFrequencyAnalysis>(IR&: F); |
405 | |
406 | // When optimizing for size, selects are preferable over branches. |
407 | if (F.hasOptSize() || llvm::shouldOptimizeForSize(F: &F, PSI, BFI)) |
408 | return PreservedAnalyses::all(); |
409 | |
410 | LI = &FAM.getResult<LoopAnalysis>(IR&: F); |
411 | ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
412 | TSchedModel.init(TSInfo: TSI); |
413 | |
414 | bool Changed = optimizeSelects(F); |
415 | return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); |
416 | } |
417 | |
418 | bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) { |
419 | TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); |
420 | TSI = TM->getSubtargetImpl(F); |
421 | TLI = TSI->getTargetLowering(); |
422 | |
423 | // If none of the select types are supported then skip this pass. |
424 | // This is an optimization pass. Legality issues will be handled by |
425 | // instruction selection. |
426 | if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) && |
427 | !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) && |
428 | !TLI->isSelectSupported(TargetLowering::VectorMaskSelect)) |
429 | return false; |
430 | |
431 | TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
432 | |
433 | if (!TTI->enableSelectOptimize()) |
434 | return false; |
435 | |
436 | LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
437 | BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); |
438 | PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
439 | ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
440 | TSchedModel.init(TSInfo: TSI); |
441 | |
442 | // When optimizing for size, selects are preferable over branches. |
443 | if (F.hasOptSize() || llvm::shouldOptimizeForSize(F: &F, PSI, BFI)) |
444 | return false; |
445 | |
446 | return optimizeSelects(F); |
447 | } |
448 | |
449 | bool SelectOptimizeImpl::optimizeSelects(Function &F) { |
  // Determine which select groups are profitable to convert to branches.
451 | SelectGroups ProfSIGroups; |
452 | // Base heuristics apply only to non-loops and outer loops. |
453 | optimizeSelectsBase(F, ProfSIGroups); |
454 | // Separate heuristics for inner-most loops. |
455 | optimizeSelectsInnerLoops(F, ProfSIGroups); |
456 | |
457 | // Convert to branches the select groups that were deemed |
458 | // profitable-to-convert. |
459 | convertProfitableSIGroups(ProfSIGroups); |
460 | |
461 | // Code modified if at least one select group was converted. |
462 | return !ProfSIGroups.empty(); |
463 | } |
464 | |
465 | void SelectOptimizeImpl::optimizeSelectsBase(Function &F, |
466 | SelectGroups &ProfSIGroups) { |
467 | // Collect all the select groups. |
468 | SelectGroups SIGroups; |
469 | for (BasicBlock &BB : F) { |
470 | // Base heuristics apply only to non-loops and outer loops. |
471 | Loop *L = LI->getLoopFor(BB: &BB); |
472 | if (L && L->isInnermost()) |
473 | continue; |
474 | collectSelectGroups(BB, SIGroups); |
475 | } |
476 | |
  // Determine which select groups are profitable to convert to branches.
478 | findProfitableSIGroupsBase(SIGroups, ProfSIGroups); |
479 | } |
480 | |
481 | void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F, |
482 | SelectGroups &ProfSIGroups) { |
483 | SmallVector<Loop *, 4> Loops(LI->begin(), LI->end()); |
484 | // Need to check size on each iteration as we accumulate child loops. |
485 | for (unsigned long i = 0; i < Loops.size(); ++i) |
486 | for (Loop *ChildL : Loops[i]->getSubLoops()) |
487 | Loops.push_back(Elt: ChildL); |
488 | |
489 | for (Loop *L : Loops) { |
490 | if (!L->isInnermost()) |
491 | continue; |
492 | |
493 | SelectGroups SIGroups; |
494 | for (BasicBlock *BB : L->getBlocks()) |
495 | collectSelectGroups(BB&: *BB, SIGroups); |
496 | |
497 | findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups); |
498 | } |
499 | } |
500 | |
501 | /// If \p isTrue is true, return the true value of \p SI, otherwise return |
502 | /// false value of \p SI. If the true/false value of \p SI is defined by any |
503 | /// select instructions in \p Selects, look through the defining select |
504 | /// instruction until the true/false value is not defined in \p Selects. |
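/// For example, in the group
///   %s1 = select i1 %c, i32 %a, i32 %b
///   %s2 = select i1 %c, i32 %s1, i32 %d
/// requesting the true value of %s2 looks through %s1 and yields %a.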
505 | static Value * |
506 | getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue, |
507 | const SmallPtrSet<const Instruction *, 2> &Selects, |
508 | IRBuilder<> &IB) { |
509 | Value *V = nullptr; |
510 | for (SelectInst *DefSI = dyn_cast<SelectInst>(Val: SI.getI()); |
511 | DefSI != nullptr && Selects.count(Ptr: DefSI); |
512 | DefSI = dyn_cast<SelectInst>(Val: V)) { |
513 | assert(DefSI->getCondition() == SI.getCondition() && |
514 | "The condition of DefSI does not match with SI" ); |
515 | V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue()); |
516 | } |
517 | |
518 | if (isa<BinaryOperator>(Val: SI.getI())) { |
519 | assert(SI.getI()->getOpcode() == Instruction::Or && |
520 | "Only currently handling Or instructions." ); |
521 | V = SI.getFalseValue(); |
522 | if (isTrue) |
523 | V = IB.CreateOr(LHS: V, RHS: ConstantInt::get(Ty: V->getType(), V: 1)); |
524 | } |
525 | |
526 | assert(V && "Failed to get select true/false value" ); |
527 | return V; |
528 | } |
529 | |
530 | void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { |
531 | for (SelectGroup &ASI : ProfSIGroups) { |
532 | // The code transformation here is a modified version of the sinking |
533 | // transformation in CodeGenPrepare::optimizeSelectInst with a more |
534 | // aggressive strategy of which instructions to sink. |
535 | // |
536 | // TODO: eliminate the redundancy of logic transforming selects to branches |
537 | // by removing CodeGenPrepare::optimizeSelectInst and optimizing here |
538 | // selects for all cases (with and without profile information). |
539 | |
540 | // Transform a sequence like this: |
541 | // start: |
542 | // %cmp = cmp uge i32 %a, %b |
543 | // %sel = select i1 %cmp, i32 %c, i32 %d |
544 | // |
545 | // Into: |
546 | // start: |
547 | // %cmp = cmp uge i32 %a, %b |
548 | // %cmp.frozen = freeze %cmp |
549 | // br i1 %cmp.frozen, label %select.true, label %select.false |
550 | // select.true: |
551 | // br label %select.end |
552 | // select.false: |
553 | // br label %select.end |
554 | // select.end: |
555 | // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] |
556 | // |
557 | // %cmp should be frozen, otherwise it may introduce undefined behavior. |
558 | // In addition, we may sink instructions that produce %c or %d into the |
559 | // destination(s) of the new branch. |
    // If the true or false blocks do not contain a sunk instruction, that
    // block and its branch may be optimized away. In that case, one side of
    // the first branch will point directly to select.end, and the
    // corresponding PHI predecessor block will be the start block.
564 | |
565 | // Find all the instructions that can be soundly sunk to the true/false |
566 | // blocks. These are instructions that are computed solely for producing the |
567 | // operands of the select instructions in the group and can be sunk without |
568 | // breaking the semantics of the LLVM IR (e.g., cannot sink instructions |
569 | // with side effects). |
570 | SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices; |
571 | typedef std::stack<Instruction *>::size_type StackSizeType; |
572 | StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0; |
573 | for (SelectLike SI : ASI) { |
574 | // For each select, compute the sinkable dependence chains of the true and |
575 | // false operands. |
576 | if (auto *TI = dyn_cast_or_null<Instruction>(Val: SI.getTrueValue())) { |
577 | std::stack<Instruction *> TrueSlice; |
578 | getExclBackwardsSlice(I: TI, Slice&: TrueSlice, SI: SI.getI(), ForSinking: true); |
579 | maxTrueSliceLen = std::max(a: maxTrueSliceLen, b: TrueSlice.size()); |
580 | TrueSlices.push_back(Elt: TrueSlice); |
581 | } |
582 | if (auto *FI = dyn_cast_or_null<Instruction>(Val: SI.getFalseValue())) { |
583 | if (isa<SelectInst>(Val: SI.getI()) || !FI->hasOneUse()) { |
584 | std::stack<Instruction *> FalseSlice; |
585 | getExclBackwardsSlice(I: FI, Slice&: FalseSlice, SI: SI.getI(), ForSinking: true); |
586 | maxFalseSliceLen = std::max(a: maxFalseSliceLen, b: FalseSlice.size()); |
587 | FalseSlices.push_back(Elt: FalseSlice); |
588 | } |
589 | } |
590 | } |
591 | // In the case of multiple select instructions in the same group, the order |
592 | // of non-dependent instructions (instructions of different dependence |
593 | // slices) in the true/false blocks appears to affect performance. |
594 | // Interleaving the slices seems to experimentally be the optimal approach. |
    // This interleaved scheduling allows for more ILP (with the natural
    // downside of slightly increased register pressure) compared to simply
    // emitting one whole chain after another. One would expect the ordering
    // not to matter, since the backend scheduler should take care of it, but
    // in practice the scheduler fails to deliver optimal ILP with a naive
    // ordering here.
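    // For example, two sinkable dependence slices {a1, a2} and {b1, b2} are
    // emitted in the order a1, b1, a2, b2 rather than a1, a2, b1, b2.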
601 | SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved; |
602 | for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) { |
603 | for (auto &S : TrueSlices) { |
604 | if (!S.empty()) { |
605 | TrueSlicesInterleaved.push_back(Elt: S.top()); |
606 | S.pop(); |
607 | } |
608 | } |
609 | } |
610 | for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) { |
611 | for (auto &S : FalseSlices) { |
612 | if (!S.empty()) { |
613 | FalseSlicesInterleaved.push_back(Elt: S.top()); |
614 | S.pop(); |
615 | } |
616 | } |
617 | } |
618 | |
619 | // We split the block containing the select(s) into two blocks. |
620 | SelectLike SI = ASI.front(); |
621 | SelectLike LastSI = ASI.back(); |
622 | BasicBlock *StartBlock = SI.getI()->getParent(); |
623 | BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI())); |
624 | // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With |
625 | // RemoveDIs turned on, SplitPt would instead point to the next |
626 | // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs, |
627 | // tell splitBasicBlock that we want to include any DbgVariableRecords |
628 | // attached to SplitPt in the splice. |
629 | SplitPt.setHeadBit(true); |
630 | BasicBlock *EndBlock = StartBlock->splitBasicBlock(I: SplitPt, BBName: "select.end" ); |
631 | BFI->setBlockFreq(BB: EndBlock, Freq: BFI->getBlockFreq(BB: StartBlock)); |
632 | // Delete the unconditional branch that was just created by the split. |
633 | StartBlock->getTerminator()->eraseFromParent(); |
634 | |
635 | // Move any debug/pseudo instructions that were in-between the select |
636 | // group to the newly-created end block. |
637 | SmallVector<Instruction *, 2> DebugPseudoINS; |
638 | auto DIt = SI.getI()->getIterator(); |
639 | while (&*DIt != LastSI.getI()) { |
640 | if (DIt->isDebugOrPseudoInst()) |
641 | DebugPseudoINS.push_back(Elt: &*DIt); |
642 | DIt++; |
643 | } |
644 | for (auto *DI : DebugPseudoINS) { |
645 | DI->moveBeforePreserving(MovePos: &*EndBlock->getFirstInsertionPt()); |
646 | } |
647 | |
648 | // Duplicate implementation for DbgRecords, the non-instruction debug-info |
649 | // format. Helper lambda for moving DbgRecords to the end block. |
650 | auto TransferDbgRecords = [&](Instruction &I) { |
651 | for (auto &DbgRecord : |
652 | llvm::make_early_inc_range(Range: I.getDbgRecordRange())) { |
653 | DbgRecord.removeFromParent(); |
654 | EndBlock->insertDbgRecordBefore(DR: &DbgRecord, |
655 | Here: EndBlock->getFirstInsertionPt()); |
656 | } |
657 | }; |
658 | |
659 | // Iterate over all instructions in between SI and LastSI, not including |
660 | // SI itself. These are all the variable assignments that happen "in the |
661 | // middle" of the select group. |
662 | auto R = make_range(x: std::next(x: SI.getI()->getIterator()), |
663 | y: std::next(x: LastSI.getI()->getIterator())); |
664 | llvm::for_each(Range&: R, F: TransferDbgRecords); |
665 | |
666 | // These are the new basic blocks for the conditional branch. |
667 | // At least one will become an actual new basic block. |
668 | BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr; |
669 | BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr; |
670 | if (!TrueSlicesInterleaved.empty()) { |
671 | TrueBlock = BasicBlock::Create(Context&: EndBlock->getContext(), Name: "select.true.sink" , |
672 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
673 | TrueBranch = BranchInst::Create(IfTrue: EndBlock, InsertAtEnd: TrueBlock); |
674 | TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc()); |
675 | for (Instruction *TrueInst : TrueSlicesInterleaved) |
676 | TrueInst->moveBefore(MovePos: TrueBranch); |
677 | } |
678 | if (!FalseSlicesInterleaved.empty()) { |
679 | FalseBlock = |
680 | BasicBlock::Create(Context&: EndBlock->getContext(), Name: "select.false.sink" , |
681 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
682 | FalseBranch = BranchInst::Create(IfTrue: EndBlock, InsertAtEnd: FalseBlock); |
683 | FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc()); |
684 | for (Instruction *FalseInst : FalseSlicesInterleaved) |
685 | FalseInst->moveBefore(MovePos: FalseBranch); |
686 | } |
687 | // If there was nothing to sink, then arbitrarily choose the 'false' side |
688 | // for a new input value to the PHI. |
689 | if (TrueBlock == FalseBlock) { |
690 | assert(TrueBlock == nullptr && |
691 | "Unexpected basic block transform while optimizing select" ); |
692 | |
693 | FalseBlock = BasicBlock::Create(Context&: StartBlock->getContext(), Name: "select.false" , |
694 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
695 | auto *FalseBranch = BranchInst::Create(IfTrue: EndBlock, InsertAtEnd: FalseBlock); |
696 | FalseBranch->setDebugLoc(SI.getI()->getDebugLoc()); |
697 | } |
698 | |
699 | // Insert the real conditional branch based on the original condition. |
700 | // If we did not create a new block for one of the 'true' or 'false' paths |
701 | // of the condition, it means that side of the branch goes to the end block |
702 | // directly and the path originates from the start block from the point of |
703 | // view of the new PHI. |
704 | BasicBlock *TT, *FT; |
705 | if (TrueBlock == nullptr) { |
706 | TT = EndBlock; |
707 | FT = FalseBlock; |
708 | TrueBlock = StartBlock; |
709 | } else if (FalseBlock == nullptr) { |
710 | TT = TrueBlock; |
711 | FT = EndBlock; |
712 | FalseBlock = StartBlock; |
713 | } else { |
714 | TT = TrueBlock; |
715 | FT = FalseBlock; |
716 | } |
717 | IRBuilder<> IB(SI.getI()); |
718 | auto *CondFr = IB.CreateFreeze(V: SI.getCondition(), |
719 | Name: SI.getCondition()->getName() + ".frozen" ); |
720 | |
721 | SmallPtrSet<const Instruction *, 2> INS; |
722 | for (auto SI : ASI) |
723 | INS.insert(Ptr: SI.getI()); |
724 | |
    // Use a reverse iterator because a later select may use the value of an
    // earlier select, and we need to propagate the value through the earlier
    // select to get the PHI operand.
728 | for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) { |
729 | SelectLike SI = *It; |
730 | // The select itself is replaced with a PHI Node. |
731 | PHINode *PN = PHINode::Create(Ty: SI.getType(), NumReservedValues: 2, NameStr: "" ); |
732 | PN->insertBefore(InsertPos: EndBlock->begin()); |
733 | PN->takeName(V: SI.getI()); |
734 | PN->addIncoming(V: getTrueOrFalseValue(SI, isTrue: true, Selects: INS, IB), BB: TrueBlock); |
735 | PN->addIncoming(V: getTrueOrFalseValue(SI, isTrue: false, Selects: INS, IB), BB: FalseBlock); |
736 | PN->setDebugLoc(SI.getI()->getDebugLoc()); |
737 | SI.getI()->replaceAllUsesWith(V: PN); |
738 | INS.erase(Ptr: SI.getI()); |
739 | ++NumSelectsConverted; |
740 | } |
741 | IB.CreateCondBr(Cond: CondFr, True: TT, False: FT, MDSrc: SI.getI()); |
742 | |
    // Remove the old select instructions, now that they are no longer used.
744 | for (auto SI : ASI) |
745 | SI.getI()->eraseFromParent(); |
746 | } |
747 | } |
748 | |
749 | void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB, |
750 | SelectGroups &SIGroups) { |
751 | BasicBlock::iterator BBIt = BB.begin(); |
752 | while (BBIt != BB.end()) { |
753 | Instruction *I = &*BBIt++; |
754 | if (SelectLike SI = SelectLike::match(I)) { |
755 | if (!TTI->shouldTreatInstructionLikeSelect(I)) |
756 | continue; |
757 | |
758 | SelectGroup SIGroup; |
759 | SIGroup.push_back(Elt: SI); |
760 | while (BBIt != BB.end()) { |
761 | Instruction *NI = &*BBIt; |
762 | // Debug/pseudo instructions should be skipped and not prevent the |
763 | // formation of a select group. |
764 | if (NI->isDebugOrPseudoInst()) { |
765 | ++BBIt; |
766 | continue; |
767 | } |
768 | // We only allow selects in the same group, not other select-like |
769 | // instructions. |
770 | if (!isa<SelectInst>(Val: NI)) |
771 | break; |
772 | |
773 | SelectLike NSI = SelectLike::match(I: NI); |
774 | if (NSI && SI.getCondition() == NSI.getCondition()) { |
775 | SIGroup.push_back(Elt: NSI); |
776 | } else |
777 | break; |
778 | ++BBIt; |
779 | } |
780 | |
781 | // If the select type is not supported, no point optimizing it. |
782 | // Instruction selection will take care of it. |
783 | if (!isSelectKindSupported(SI)) |
784 | continue; |
785 | |
786 | SIGroups.push_back(Elt: SIGroup); |
787 | } |
788 | } |
789 | } |
790 | |
791 | void SelectOptimizeImpl::findProfitableSIGroupsBase( |
792 | SelectGroups &SIGroups, SelectGroups &ProfSIGroups) { |
793 | for (SelectGroup &ASI : SIGroups) { |
794 | ++NumSelectOptAnalyzed; |
795 | if (isConvertToBranchProfitableBase(ASI)) |
796 | ProfSIGroups.push_back(Elt: ASI); |
797 | } |
798 | } |
799 | |
800 | static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE, |
801 | DiagnosticInfoOptimizationBase &Rem) { |
802 | LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n" ); |
803 | ORE->emit(OptDiag&: Rem); |
804 | } |
805 | |
806 | void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops( |
807 | const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) { |
808 | NumSelectOptAnalyzed += SIGroups.size(); |
809 | // For each select group in an inner-most loop, |
  // a branch is preferable to a select/conditional-move if:
811 | // i) conversion to branches for all the select groups of the loop satisfies |
812 | // loop-level heuristics including reducing the loop's critical path by |
813 | // some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and |
814 | // ii) the total cost of the select group is cheaper with a branch compared |
815 | // to its predicated version. The cost is in terms of latency and the cost |
816 | // of a select group is the cost of its most expensive select instruction |
817 | // (assuming infinite resources and thus fully leveraging available ILP). |
818 | |
819 | DenseMap<const Instruction *, CostInfo> InstCostMap; |
  CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
                          {Scaled64::getZero(), Scaled64::getZero()}};
822 | if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) || |
823 | !checkLoopHeuristics(L, LoopDepth: LoopCost)) { |
824 | return; |
825 | } |
826 | |
827 | for (SelectGroup &ASI : SIGroups) { |
828 | // Assuming infinite resources, the cost of a group of instructions is the |
829 | // cost of the most expensive instruction of the group. |
830 | Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero(); |
831 | for (SelectLike SI : ASI) { |
832 | SelectCost = std::max(a: SelectCost, b: InstCostMap[SI.getI()].PredCost); |
833 | BranchCost = std::max(a: BranchCost, b: InstCostMap[SI.getI()].NonPredCost); |
834 | } |
835 | if (BranchCost < SelectCost) { |
836 | OptimizationRemark OR(DEBUG_TYPE, "SelectOpti" , ASI.front().getI()); |
837 | OR << "Profitable to convert to branch (loop analysis). BranchCost=" |
838 | << BranchCost.toString() << ", SelectCost=" << SelectCost.toString() |
839 | << ". " ; |
840 | EmitAndPrintRemark(ORE, Rem&: OR); |
841 | ++NumSelectConvertedLoop; |
842 | ProfSIGroups.push_back(Elt: ASI); |
843 | } else { |
844 | OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti" , |
845 | ASI.front().getI()); |
846 | ORmiss << "Select is more profitable (loop analysis). BranchCost=" |
847 | << BranchCost.toString() |
848 | << ", SelectCost=" << SelectCost.toString() << ". " ; |
849 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
850 | } |
851 | } |
852 | } |
853 | |
854 | bool SelectOptimizeImpl::isConvertToBranchProfitableBase( |
855 | const SelectGroup &ASI) { |
856 | SelectLike SI = ASI.front(); |
857 | LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI.getI() |
858 | << "\n" ); |
859 | OptimizationRemark OR(DEBUG_TYPE, "SelectOpti" , SI.getI()); |
860 | OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti" , SI.getI()); |
861 | |
862 | // Skip cold basic blocks. Better to optimize for size for cold blocks. |
863 | if (PSI->isColdBlock(BB: SI.getI()->getParent(), BFI)) { |
864 | ++NumSelectColdBB; |
865 | ORmiss << "Not converted to branch because of cold basic block. " ; |
866 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
867 | return false; |
868 | } |
869 | |
870 | // If unpredictable, branch form is less profitable. |
871 | if (SI.getI()->getMetadata(KindID: LLVMContext::MD_unpredictable)) { |
872 | ++NumSelectUnPred; |
873 | ORmiss << "Not converted to branch because of unpredictable branch. " ; |
874 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
875 | return false; |
876 | } |
877 | |
878 | // If highly predictable, branch form is more profitable, unless a |
879 | // predictable select is inexpensive in the target architecture. |
880 | if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) { |
881 | ++NumSelectConvertedHighPred; |
882 | OR << "Converted to branch because of highly predictable branch. " ; |
883 | EmitAndPrintRemark(ORE, Rem&: OR); |
884 | return true; |
885 | } |
886 | |
887 | // Look for expensive instructions in the cold operand's (if any) dependence |
888 | // slice of any of the selects in the group. |
889 | if (hasExpensiveColdOperand(ASI)) { |
890 | ++NumSelectConvertedExpColdOperand; |
891 | OR << "Converted to branch because of expensive cold operand." ; |
892 | EmitAndPrintRemark(ORE, Rem&: OR); |
893 | return true; |
894 | } |
895 | |
896 | ORmiss << "Not profitable to convert to branch (base heuristic)." ; |
897 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
898 | return false; |
899 | } |
900 | |
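// Round-to-nearest integer division; e.g. divideNearest(7, 4) == 2 (since
// 7 / 4 = 1.75 rounds up to 2), whereas plain integer division truncates to 1.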
901 | static InstructionCost divideNearest(InstructionCost Numerator, |
902 | uint64_t Denominator) { |
903 | return (Numerator + (Denominator / 2)) / Denominator; |
904 | } |
905 | |
static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI,
                                 uint64_t &TrueVal, uint64_t &FalseVal) {
  if (isa<SelectInst>(SI.getI()))
    return extractBranchWeights(*SI.getI(), TrueVal, FalseVal);
  return false;
}
912 | |
bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) {
  bool ColdOperand = false;
  uint64_t TrueWeight, FalseWeight, TotalWeight;
  if (extractBranchWeights(ASI.front(), TrueWeight, FalseWeight)) {
    uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
    TotalWeight = TrueWeight + FalseWeight;
    // Is there a path with frequency < ColdOperandThreshold% (default: 20%)?
    ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
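    // For example, with the default 20% threshold, weights {TrueWeight = 90,
    // FalseWeight = 10} give 100 * 20 > 100 * 10, so the false path is
    // considered cold.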
921 | } else if (PSI->hasProfileSummary()) { |
922 | OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti" , |
923 | ASI.front().getI()); |
924 | ORmiss << "Profile data available but missing branch-weights metadata for " |
925 | "select instruction. " ; |
926 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
927 | } |
928 | if (!ColdOperand) |
929 | return false; |
930 | // Check if the cold path's dependence slice is expensive for any of the |
931 | // selects of the group. |
932 | for (SelectLike SI : ASI) { |
933 | Instruction *ColdI = nullptr; |
934 | uint64_t HotWeight; |
935 | if (TrueWeight < FalseWeight) { |
936 | ColdI = dyn_cast_or_null<Instruction>(Val: SI.getTrueValue()); |
937 | HotWeight = FalseWeight; |
938 | } else { |
939 | ColdI = dyn_cast_or_null<Instruction>(Val: SI.getFalseValue()); |
940 | HotWeight = TrueWeight; |
941 | } |
942 | if (ColdI) { |
943 | std::stack<Instruction *> ColdSlice; |
944 | getExclBackwardsSlice(I: ColdI, Slice&: ColdSlice, SI: SI.getI()); |
945 | InstructionCost SliceCost = 0; |
946 | while (!ColdSlice.empty()) { |
947 | SliceCost += TTI->getInstructionCost(U: ColdSlice.top(), |
948 | CostKind: TargetTransformInfo::TCK_Latency); |
949 | ColdSlice.pop(); |
950 | } |
      // The colder the cold value operand of the select is, the more expensive
      // it becomes to unconditionally compute that operand with a cmov every
      // time. Thus, the colder the cold operand is, the more its cost counts.
      // Get the nearest integer cost, adjusted for coldness.
955 | InstructionCost AdjSliceCost = |
956 | divideNearest(Numerator: SliceCost * HotWeight, Denominator: TotalWeight); |
957 | if (AdjSliceCost >= |
958 | ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive) |
959 | return true; |
960 | } |
961 | } |
962 | return false; |
963 | } |
964 | |
965 | // Check if it is safe to move LoadI next to the SI. |
966 | // Conservatively assume it is safe only if there is no instruction |
967 | // modifying memory in-between the load and the select instruction. |
968 | static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) { |
969 | // Assume loads from different basic blocks are unsafe to move. |
970 | if (LoadI->getParent() != SI->getParent()) |
971 | return false; |
972 | auto It = LoadI->getIterator(); |
973 | while (&*It != SI) { |
974 | if (It->mayWriteToMemory()) |
975 | return false; |
976 | It++; |
977 | } |
978 | return true; |
979 | } |
980 | |
981 | // For a given source instruction, collect its backwards dependence slice |
982 | // consisting of instructions exclusively computed for the purpose of producing |
// the operands of the source instruction. As an approximation (sufficiently
// accurate in practice), we populate this set with the instructions of the
// backwards dependence slice that have only one use and form a one-use chain
// leading to the source instruction.
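// For example, if the true operand of a select is `%t = add i32 %x, 1`, where
// %t is used only by the select and `%x = load i32, ptr %p` is used only by
// %t, the collected slice for %t is {%t, %x} (modulo the extra safety checks
// applied when ForSinking is set).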
987 | void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I, |
988 | std::stack<Instruction *> &Slice, |
989 | Instruction *SI, |
990 | bool ForSinking) { |
991 | SmallPtrSet<Instruction *, 2> Visited; |
992 | std::queue<Instruction *> Worklist; |
993 | Worklist.push(x: I); |
994 | while (!Worklist.empty()) { |
995 | Instruction *II = Worklist.front(); |
996 | Worklist.pop(); |
997 | |
998 | // Avoid cycles. |
999 | if (!Visited.insert(Ptr: II).second) |
1000 | continue; |
1001 | |
1002 | if (!II->hasOneUse()) |
1003 | continue; |
1004 | |
    // Cannot soundly sink instructions with side-effects.
    // Terminator or phi instructions cannot be sunk.
    // Avoid sinking other select instructions (should be handled separately).
1008 | if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() || |
1009 | isa<SelectInst>(Val: II) || isa<PHINode>(Val: II))) |
1010 | continue; |
1011 | |
    // Avoid sinking loads in order not to skip state-modifying instructions
    // that may alias with the loaded address.
    // Only allow sinking of loads within the same basic block that are
    // conservatively proven to be safe.
1016 | if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(LoadI: II, SI)) |
1017 | continue; |
1018 | |
1019 | // Avoid considering instructions with less frequency than the source |
1020 | // instruction (i.e., avoid colder code regions of the dependence slice). |
1021 | if (BFI->getBlockFreq(BB: II->getParent()) < BFI->getBlockFreq(BB: I->getParent())) |
1022 | continue; |
1023 | |
1024 | // Eligible one-use instruction added to the dependence slice. |
1025 | Slice.push(x: II); |
1026 | |
1027 | // Explore all the operands of the current instruction to expand the slice. |
1028 | for (unsigned k = 0; k < II->getNumOperands(); ++k) |
1029 | if (auto *OpI = dyn_cast<Instruction>(Val: II->getOperand(i: k))) |
1030 | Worklist.push(x: OpI); |
1031 | } |
1032 | } |
1033 | |
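// A select is considered highly predictable when its branch weights are very
// lopsided; e.g. weights {1000, 1} give a probability of roughly 99.9%, above
// the predictable-branch threshold (typically 99%) reported by TTI.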
1034 | bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) { |
1035 | uint64_t TrueWeight, FalseWeight; |
1036 | if (extractBranchWeights(SI, TrueVal&: TrueWeight, FalseVal&: FalseWeight)) { |
1037 | uint64_t Max = std::max(a: TrueWeight, b: FalseWeight); |
1038 | uint64_t Sum = TrueWeight + FalseWeight; |
1039 | if (Sum != 0) { |
1040 | auto Probability = BranchProbability::getBranchProbability(Numerator: Max, Denominator: Sum); |
1041 | if (Probability > TTI->getPredictableBranchThreshold()) |
1042 | return true; |
1043 | } |
1044 | } |
1045 | return false; |
1046 | } |
1047 | |
1048 | bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L, |
1049 | const CostInfo LoopCost[2]) { |
1050 | // Loop-level checks to determine if a non-predicated version (with branches) |
1051 | // of the loop is more profitable than its predicated version. |
1052 | |
1053 | if (DisableLoopLevelHeuristics) |
1054 | return true; |
1055 | |
1056 | OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti" , |
1057 | L->getHeader()->getFirstNonPHI()); |
1058 | |
1059 | if (LoopCost[0].NonPredCost > LoopCost[0].PredCost || |
1060 | LoopCost[1].NonPredCost >= LoopCost[1].PredCost) { |
1061 | ORmissL << "No select conversion in the loop due to no reduction of loop's " |
1062 | "critical path. " ; |
1063 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1064 | return false; |
1065 | } |
1066 | |
1067 | Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost, |
1068 | LoopCost[1].PredCost - LoopCost[1].NonPredCost}; |
1069 | |
  // For converting to branches to be profitable, it needs to reduce the loop's
  // critical path by at least some threshold (an absolute gain of
  // GainCycleThreshold cycles and a relative gain of 12.5% by default).
1073 | if (Gain[1] < Scaled64::get(N: GainCycleThreshold) || |
1074 | Gain[1] * Scaled64::get(N: GainRelativeThreshold) < LoopCost[1].PredCost) { |
1075 | Scaled64 RelativeGain = Scaled64::get(N: 100) * Gain[1] / LoopCost[1].PredCost; |
1076 | ORmissL << "No select conversion in the loop due to small reduction of " |
1077 | "loop's critical path. Gain=" |
1078 | << Gain[1].toString() |
1079 | << ", RelativeGain=" << RelativeGain.toString() << "%. " ; |
1080 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1081 | return false; |
1082 | } |
1083 | |
1084 | // If the loop's critical path involves loop-carried dependences, the gradient |
1085 | // of the gain needs to be at least GainGradientThreshold% (defaults to 25%). |
1086 | // This check ensures that the latency reduction for the loop's critical path |
1087 | // keeps decreasing with sufficient rate beyond the two analyzed loop |
1088 | // iterations. |
1089 | if (Gain[1] > Gain[0]) { |
1090 | Scaled64 GradientGain = Scaled64::get(N: 100) * (Gain[1] - Gain[0]) / |
1091 | (LoopCost[1].PredCost - LoopCost[0].PredCost); |
1092 | if (GradientGain < Scaled64::get(N: GainGradientThreshold)) { |
1093 | ORmissL << "No select conversion in the loop due to small gradient gain. " |
1094 | "GradientGain=" |
1095 | << GradientGain.toString() << "%. " ; |
1096 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1097 | return false; |
1098 | } |
1099 | } |
1100 | // If the gain decreases it is not profitable to convert. |
1101 | else if (Gain[1] < Gain[0]) { |
1102 | ORmissL |
1103 | << "No select conversion in the loop due to negative gradient gain. " ; |
1104 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1105 | return false; |
1106 | } |
1107 | |
1108 | // Non-predicated version of the loop is more profitable than its |
1109 | // predicated version. |
1110 | return true; |
1111 | } |
1112 | |
1113 | // Computes instruction and loop-critical-path costs for both the predicated |
1114 | // and non-predicated version of the given loop. |
1115 | // Returns false if unable to compute these costs due to invalid cost of loop |
1116 | // instruction(s). |
1117 | bool SelectOptimizeImpl::computeLoopCosts( |
1118 | const Loop *L, const SelectGroups &SIGroups, |
1119 | DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) { |
1120 | LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop " |
1121 | << L->getHeader()->getName() << "\n" ); |
1122 | const auto &SImap = getSImap(SIGroups); |
1123 | // Compute instruction and loop-critical-path costs across two iterations for |
1124 | // both predicated and non-predicated version. |
1125 | const unsigned Iterations = 2; |
1126 | for (unsigned Iter = 0; Iter < Iterations; ++Iter) { |
1127 | // Cost of the loop's critical path. |
1128 | CostInfo &MaxCost = LoopCost[Iter]; |
1129 | for (BasicBlock *BB : L->getBlocks()) { |
1130 | for (const Instruction &I : *BB) { |
1131 | if (I.isDebugOrPseudoInst()) |
1132 | continue; |
1133 | // Compute the predicated and non-predicated cost of the instruction. |
1134 | Scaled64 IPredCost = Scaled64::getZero(), |
1135 | INonPredCost = Scaled64::getZero(); |
1136 | |
1137 | // Assume infinite resources that allow to fully exploit the available |
1138 | // instruction-level parallelism. |
1139 | // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost) |
1140 | for (const Use &U : I.operands()) { |
1141 | auto UI = dyn_cast<Instruction>(Val: U.get()); |
1142 | if (!UI) |
1143 | continue; |
1144 | if (InstCostMap.count(Val: UI)) { |
1145 | IPredCost = std::max(a: IPredCost, b: InstCostMap[UI].PredCost); |
1146 | INonPredCost = std::max(a: INonPredCost, b: InstCostMap[UI].NonPredCost); |
1147 | } |
1148 | } |
1149 | auto ILatency = computeInstCost(I: &I); |
1150 | if (!ILatency) { |
1151 | OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti" , &I); |
1152 | ORmissL << "Invalid instruction cost preventing analysis and " |
1153 | "optimization of the inner-most loop containing this " |
1154 | "instruction. " ; |
1155 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1156 | return false; |
1157 | } |
1158 | IPredCost += Scaled64::get(N: *ILatency); |
1159 | INonPredCost += Scaled64::get(N: *ILatency); |
1160 | |
1161 | // For a select that can be converted to branch, |
1162 | // compute its cost as a branch (non-predicated cost). |
1163 | // |
1164 | // BranchCost = PredictedPathCost + MispredictCost |
1165 | // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb |
1166 | // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate |
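        // For example, with an even 50/50 weight split, TrueOpCost = 4,
        // FalseOpCost = 2, a 14-cycle misprediction penalty, a cheap condition
        // and the default 25% misprediction rate, this would give
        // BranchCost = (4 + 2) / 2 + 14 * 0.25 = 6.5 cycles.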
1167 | if (SImap.contains(Val: &I)) { |
1168 | auto SI = SImap.at(Val: &I); |
1169 | Scaled64 TrueOpCost = SI.getTrueOpCost(InstCostMap, TTI); |
1170 | Scaled64 FalseOpCost = SI.getFalseOpCost(InstCostMap, TTI); |
1171 | Scaled64 PredictedPathCost = |
1172 | getPredictedPathCost(TrueCost: TrueOpCost, FalseCost: FalseOpCost, SI); |
1173 | |
1174 | Scaled64 CondCost = Scaled64::getZero(); |
1175 | if (auto *CI = dyn_cast<Instruction>(Val: SI.getCondition())) |
1176 | if (InstCostMap.count(Val: CI)) |
1177 | CondCost = InstCostMap[CI].NonPredCost; |
1178 | Scaled64 MispredictCost = getMispredictionCost(SI, CondCost); |
1179 | |
1180 | INonPredCost = PredictedPathCost + MispredictCost; |
1181 | } |
1182 | LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/" |
1183 | << INonPredCost << " for " << I << "\n" ); |
1184 | |
        InstCostMap[&I] = {IPredCost, INonPredCost};
1186 | MaxCost.PredCost = std::max(a: MaxCost.PredCost, b: IPredCost); |
1187 | MaxCost.NonPredCost = std::max(a: MaxCost.NonPredCost, b: INonPredCost); |
1188 | } |
1189 | } |
1190 | LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1 |
1191 | << " MaxCost = " << MaxCost.PredCost << " " |
1192 | << MaxCost.NonPredCost << "\n" ); |
1193 | } |
1194 | return true; |
1195 | } |
1196 | |
1197 | SmallDenseMap<const Instruction *, SelectOptimizeImpl::SelectLike, 2> |
1198 | SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) { |
1199 | SmallDenseMap<const Instruction *, SelectLike, 2> SImap; |
1200 | for (const SelectGroup &ASI : SIGroups) |
1201 | for (SelectLike SI : ASI) |
1202 | SImap.try_emplace(Key: SI.getI(), Args&: SI); |
1203 | return SImap; |
1204 | } |
1205 | |
1206 | std::optional<uint64_t> |
1207 | SelectOptimizeImpl::computeInstCost(const Instruction *I) { |
1208 | InstructionCost ICost = |
1209 | TTI->getInstructionCost(U: I, CostKind: TargetTransformInfo::TCK_Latency); |
1210 | if (auto OC = ICost.getValue()) |
1211 | return std::optional<uint64_t>(*OC); |
1212 | return std::nullopt; |
1213 | } |
1214 | |
1215 | ScaledNumber<uint64_t> |
1216 | SelectOptimizeImpl::getMispredictionCost(const SelectLike SI, |
1217 | const Scaled64 CondCost) { |
1218 | uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty; |
1219 | |
1220 | // Account for the default misprediction rate when using a branch |
1221 | // (conservatively set to 25% by default). |
1222 | uint64_t MispredictRate = MispredictDefaultRate; |
1223 | // If the select condition is obviously predictable, then the misprediction |
1224 | // rate is zero. |
1225 | if (isSelectHighlyPredictable(SI)) |
1226 | MispredictRate = 0; |
1227 | |
1228 | // CondCost is included to account for cases where the computation of the |
1229 | // condition is part of a long dependence chain (potentially loop-carried) |
1230 | // that would delay detection of a misprediction and increase its cost. |
1231 | Scaled64 MispredictCost = |
1232 | std::max(a: Scaled64::get(N: MispredictPenalty), b: CondCost) * |
1233 | Scaled64::get(N: MispredictRate); |
1234 | MispredictCost /= Scaled64::get(N: 100); |
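  // For example, a 14-cycle misprediction penalty with the default 25% rate
  // and an inexpensive condition yields 14 * 25 / 100 = 3.5 cycles.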
1235 | |
1236 | return MispredictCost; |
1237 | } |
1238 | |
1239 | // Returns the cost of a branch when the prediction is correct. |
1240 | // TrueCost * TrueProbability + FalseCost * FalseProbability. |
1241 | ScaledNumber<uint64_t> |
1242 | SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, |
1243 | const SelectLike SI) { |
1244 | Scaled64 PredPathCost; |
1245 | uint64_t TrueWeight, FalseWeight; |
1246 | if (extractBranchWeights(SI, TrueVal&: TrueWeight, FalseVal&: FalseWeight)) { |
1247 | uint64_t SumWeight = TrueWeight + FalseWeight; |
1248 | if (SumWeight != 0) { |
1249 | PredPathCost = TrueCost * Scaled64::get(N: TrueWeight) + |
1250 | FalseCost * Scaled64::get(N: FalseWeight); |
1251 | PredPathCost /= Scaled64::get(N: SumWeight); |
1252 | return PredPathCost; |
1253 | } |
1254 | } |
  // Without branch-weight metadata, we assume a 75%/25% split between the two
  // paths and pick the larger resulting cost.
1257 | PredPathCost = std::max(a: TrueCost * Scaled64::get(N: 3) + FalseCost, |
1258 | b: FalseCost * Scaled64::get(N: 3) + TrueCost); |
1259 | PredPathCost /= Scaled64::get(N: 4); |
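  // For example, TrueCost = 4 and FalseCost = 2 give
  // max(4 * 3 + 2, 2 * 3 + 4) / 4 = 14 / 4 = 3.5.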
1260 | return PredPathCost; |
1261 | } |
1262 | |
1263 | bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) { |
1264 | bool VectorCond = !SI.getCondition()->getType()->isIntegerTy(Bitwidth: 1); |
1265 | if (VectorCond) |
1266 | return false; |
1267 | TargetLowering::SelectSupportKind SelectKind; |
1268 | if (SI.getType()->isVectorTy()) |
1269 | SelectKind = TargetLowering::ScalarCondVectorVal; |
1270 | else |
1271 | SelectKind = TargetLowering::ScalarValSelect; |
1272 | return TLI->isSelectSupported(SelectKind); |
1273 | } |
1274 | |