X86CmovConversion.cpp source code [llvm/lib/Target/X86/X86CmovConversion.cpp]

1	//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// This file implements a pass that converts X86 cmov instructions into
11	/// branches when profitable. This pass is conservative. It transforms if and
12	/// only if it can guarantee a gain with high confidence.
13	///
14	/// Thus, the optimization applies under the following conditions:
15	/// 1. Consider as candidates only CMOVs in innermost loops (assume that
16	/// most hotspots are represented by these loops).
17	/// 2. Given a group of CMOV instructions that are using the same EFLAGS def
18	/// instruction:
19	/// a. Consider them as candidates only if all have the same code condition
20	/// or the opposite one to prevent generating more than one conditional
21	/// jump per EFLAGS def instruction.
22	/// b. Consider them as candidates only if all are profitable to be
23	/// converted (assume that one bad conversion may cause a degradation).
24	/// 3. Apply conversion only for loops that are found profitable and only for
25	/// CMOV candidates that were found profitable.
26	/// a. A loop is considered profitable only if conversion will reduce its
27	/// depth cost by some threshold.
28	/// b. CMOV is considered profitable if the cost of its condition is higher
29	/// than the average cost of its true-value and false-value by 25% of
30	/// branch-misprediction-penalty. This assures no degradation even with
31	/// 25% branch misprediction.
32	///
33	/// Note: This pass is assumed to run on SSA machine code.
34	//
35	//===----------------------------------------------------------------------===//
36	//
37	// External interfaces:
38	// FunctionPass llvm::createX86CmovConverterPass();*
39	// bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
40	//
41	//===----------------------------------------------------------------------===//
42
43	#include "X86.h"
44	#include "X86InstrInfo.h"
45	#include "llvm/ADT/ArrayRef.h"
46	#include "llvm/ADT/DenseMap.h"
47	#include "llvm/ADT/STLExtras.h"
48	#include "llvm/ADT/SmallPtrSet.h"
49	#include "llvm/ADT/SmallVector.h"
50	#include "llvm/ADT/Statistic.h"
51	#include "llvm/CodeGen/MachineBasicBlock.h"
52	#include "llvm/CodeGen/MachineFunction.h"
53	#include "llvm/CodeGen/MachineFunctionPass.h"
54	#include "llvm/CodeGen/MachineInstr.h"
55	#include "llvm/CodeGen/MachineInstrBuilder.h"
56	#include "llvm/CodeGen/MachineLoopInfo.h"
57	#include "llvm/CodeGen/MachineOperand.h"
58	#include "llvm/CodeGen/MachineRegisterInfo.h"
59	#include "llvm/CodeGen/TargetInstrInfo.h"
60	#include "llvm/CodeGen/TargetRegisterInfo.h"
61	#include "llvm/CodeGen/TargetSchedule.h"
62	#include "llvm/CodeGen/TargetSubtargetInfo.h"
63	#include "llvm/IR/DebugLoc.h"
64	#include "llvm/InitializePasses.h"
65	#include "llvm/MC/MCSchedule.h"
66	#include "llvm/Pass.h"
67	#include "llvm/Support/CommandLine.h"
68	#include "llvm/Support/Debug.h"
69	#include "llvm/Support/raw_ostream.h"
70	#include "llvm/Target/CGPassBuilderOption.h"
71	#include <algorithm>
72	#include <cassert>
73	#include <iterator>
74	#include <utility>
75
76	using namespace llvm;
77
78	#define DEBUG_TYPE "x86-cmov-conversion"
79
80	STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
81	STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
82	STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
83	STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");
84
85	// This internal switch can be used to turn off the cmov/branch optimization.
86	static cl::opt<bool>
87	EnableCmovConverter("x86-cmov-converter",
88	cl::desc ("Enable the X86 cmov-to-branch optimization."),
89	cl::init(Val: true), cl::Hidden);
90
91	static cl::opt<unsigned>
92	GainCycleThreshold("x86-cmov-converter-threshold",
93	cl::desc ("Minimum gain per loop (in cycles) threshold."),
94	cl::init(Val: `4`), cl::Hidden);
95
96	static cl::opt<bool> ForceMemOperand(
97	"x86-cmov-converter-force-mem-operand",
98	cl::desc ("Convert cmovs to branches whenever they have memory operands."),
99	cl::init(Val: true), cl::Hidden);
100
101	static cl::opt<bool> ForceAll(
102	"x86-cmov-converter-force-all",
103	cl::desc ("Convert all cmovs to branches."),
104	cl::init(Val: false), cl::Hidden);
105
106	namespace {
107
108	/// Converts X86 cmov instructions into branches when profitable.
109	class X86CmovConverterPass : public MachineFunctionPass {
110	public:
111	X86CmovConverterPass() : MachineFunctionPass (ID) { }
112
113	StringRef getPassName() const override { return "X86 cmov Conversion"; }
114	bool runOnMachineFunction(MachineFunction &MF) override;
115	void getAnalysisUsage(AnalysisUsage &AU) const override;
116
117	/// Pass identification, replacement for typeid.
118	static char ID;
119
120	private:
121	MachineRegisterInfo MRI = nullptr*;
122	const TargetInstrInfo TII = nullptr*;
123	const TargetRegisterInfo TRI = nullptr*;
124	MachineLoopInfo MLI = nullptr*;
125	TargetSchedModel TSchedModel;
126
127	/// List of consecutive CMOV instructions.
128	using CmovGroup = SmallVector<MachineInstr *, `2`>;
129	using CmovGroups = SmallVector<CmovGroup, `2`>;
130
131	/// Collect all CMOV-group-candidates in \p CurrLoop and update \p
132	/// CmovInstGroups accordingly.
133	///
134	/// \param Blocks List of blocks to process.
135	/// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
136	/// \returns true iff it found any CMOV-group-candidate.
137	bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
138	CmovGroups &CmovInstGroups,
139	bool IncludeLoads = false);
140
141	/// Check if it is profitable to transform each CMOV-group-candidates into
142	/// branch. Remove all groups that are not profitable from \p CmovInstGroups.
143	///
144	/// \param Blocks List of blocks to process.
145	/// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
146	/// \returns true iff any CMOV-group-candidate remain.
147	bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
148	CmovGroups &CmovInstGroups);
149
150	/// Convert the given list of consecutive CMOV instructions into a branch.
151	///
152	/// \param Group Consecutive CMOV instructions to be converted into branch.
153	void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr > &Group) const*;
154	};
155
156	} // end anonymous namespace
157
158	char X86CmovConverterPass::ID = `0`;
159
160	void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
161	MachineFunctionPass::getAnalysisUsage(AU);
162	AU.addRequired<MachineLoopInfo>();
163	}
164
165	bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
166	if (skipFunction(F: MF.getFunction()))
167	return false;
168	if (!EnableCmovConverter)
169	return false;
170
171	// If the SelectOptimize pass is enabled, cmovs have already been optimized.
172	if (!getCGPassBuilderOption().DisableSelectOptimize)
173	return false;
174
175	LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
176	<< "**********\n");
177
178	bool Changed = false;
179	MLI = &getAnalysis<MachineLoopInfo>();
180	const TargetSubtargetInfo &STI = MF.getSubtarget();
181	MRI = &MF.getRegInfo();
182	TII = STI.getInstrInfo();
183	TRI = STI.getRegisterInfo();
184	TSchedModel.init(TSInfo: &STI);
185
186	// Before we handle the more subtle cases of register-register CMOVs inside
187	// of potentially hot loops, we want to quickly remove all CMOVs (ForceAll) or
188	// the ones with a memory operand (ForceMemOperand option). The latter CMOV
189	// will risk a stall waiting for the load to complete that speculative
190	// execution behind a branch is better suited to handle on modern x86 chips.
191	if (ForceMemOperand \|\| ForceAll) {
192	CmovGroups AllCmovGroups;
193	SmallVector<MachineBasicBlock *, `4`> Blocks;
194	for (auto &MBB : MF)
195	Blocks.push_back(Elt: &MBB);
196	if (collectCmovCandidates(Blocks, CmovInstGroups&: AllCmovGroups, /IncludeLoads/ true)) {
197	for (auto &Group : AllCmovGroups) {
198	// Skip any group that doesn't do at least one memory operand cmov.
199	if (ForceMemOperand && !ForceAll &&
200	llvm::none_of(Range&: Group, P: [&](MachineInstr I) { return* I->mayLoad(); }))
201	continue;
202
203	// For CMOV groups which we can rewrite and which contain a memory load,
204	// always rewrite them. On x86, a CMOV will dramatically amplify any
205	// memory latency by blocking speculative execution.
206	Changed = true;
207	convertCmovInstsToBranches(Group);
208	}
209	}
210	// Early return as ForceAll converts all CmovGroups.
211	if (ForceAll)
212	return Changed;
213	}
214
215	//===--------------------------------------------------------------------===//
216	// Register-operand Conversion Algorithm
217	// ---------
218	// For each innermost loop
219	// collectCmovCandidates() {
220	// Find all CMOV-group-candidates.
221	// }
222	//
223	// checkForProfitableCmovCandidates() {
224	// Calculate both loop-depth and optimized-loop-depth.*
225	// Use these depth to check for loop transformation profitability.*
226	// Check for CMOV-group-candidate transformation profitability.*
227	// }
228	//
229	// For each profitable CMOV-group-candidate
230	// convertCmovInstsToBranches() {
231	// Create FalseBB, SinkBB, Conditional branch to SinkBB.*
232	// Replace each CMOV instruction with a PHI instruction in SinkBB.*
233	// }
234	//
235	// Note: For more details, see each function description.
236	//===--------------------------------------------------------------------===//
237
238	// Build up the loops in pre-order.
239	SmallVector<MachineLoop *, `4`> Loops(MLI->begin(), MLI->end());
240	// Note that we need to check size on each iteration as we accumulate child
241	// loops.
242	for (int i = `0`; i < (int)Loops.size(); ++i)
243	for (MachineLoop *Child : Loops [i]->getSubLoops())
244	Loops.push_back(Elt: Child);
245
246	for (MachineLoop *CurrLoop : Loops) {
247	// Optimize only innermost loops.
248	if (!CurrLoop->getSubLoops().empty())
249	continue;
250
251	// List of consecutive CMOV instructions to be processed.
252	CmovGroups CmovInstGroups;
253
254	if (!collectCmovCandidates(Blocks: CurrLoop->getBlocks(), CmovInstGroups))
255	continue;
256
257	if (!checkForProfitableCmovCandidates(Blocks: CurrLoop->getBlocks(),
258	CmovInstGroups))
259	continue;
260
261	Changed = true;
262	for (auto &Group : CmovInstGroups)
263	convertCmovInstsToBranches(Group);
264	}
265
266	return Changed;
267	}
268
269	bool X86CmovConverterPass::collectCmovCandidates(
270	ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
271	bool IncludeLoads) {
272	//===--------------------------------------------------------------------===//
273	// Collect all CMOV-group-candidates and add them into CmovInstGroups.
274	//
275	// CMOV-group:
276	// CMOV instructions, in same MBB, that uses same EFLAGS def instruction.
277	//
278	// CMOV-group-candidate:
279	// CMOV-group where all the CMOV instructions are
280	// 1. consecutive.
281	// 2. have same condition code or opposite one.
282	// 3. have only operand registers (X86::CMOVrr).
283	//===--------------------------------------------------------------------===//
284	// List of possible improvement (TODO's):
285	// --------------------------------------
286	// TODO: Add support for X86::CMOVrm instructions.
287	// TODO: Add support for X86::SETcc instructions.
288	// TODO: Add support for CMOV-groups with non consecutive CMOV instructions.
289	//===--------------------------------------------------------------------===//
290
291	// Current processed CMOV-Group.
292	CmovGroup Group;
293	for (auto *MBB : Blocks) {
294	Group.clear();
295	// Condition code of first CMOV instruction current processed range and its
296	// opposite condition code.
297	X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
298	MemOpCC = X86::COND_INVALID;
299	// Indicator of a non CMOVrr instruction in the current processed range.
300	bool FoundNonCMOVInst = false;
301	// Indicator for current processed CMOV-group if it should be skipped.
302	bool SkipGroup = false;
303
304	for (auto &I : *MBB) {
305	// Skip debug instructions.
306	if (I.isDebugInstr())
307	continue;
308
309	X86::CondCode CC = X86::getCondFromCMov(MI: I);
310	// Check if we found a X86::CMOVrr instruction. If it is marked as
311	// unpredictable, skip it and do not convert it to branch.
312	if (CC != X86::COND_INVALID &&
313	!I.getFlag(Flag: MachineInstr::MIFlag::Unpredictable) &&
314	(IncludeLoads \|\| !I.mayLoad())) {
315	if (Group.empty()) {
316	// We found first CMOV in the range, reset flags.
317	FirstCC = CC;
318	FirstOppCC = X86::GetOppositeBranchCondition(CC);
319	// Clear out the prior group's memory operand CC.
320	MemOpCC = X86::COND_INVALID;
321	FoundNonCMOVInst = false;
322	SkipGroup = false;
323	}
324	Group.push_back(Elt: &I);
325	// Check if it is a non-consecutive CMOV instruction or it has different
326	// condition code than FirstCC or FirstOppCC.
327	if (FoundNonCMOVInst \|\| (CC != FirstCC && CC != FirstOppCC))
328	// Mark the SKipGroup indicator to skip current processed CMOV-Group.
329	SkipGroup = true;
330	if (I.mayLoad()) {
331	if (MemOpCC == X86::COND_INVALID)
332	// The first memory operand CMOV.
333	MemOpCC = CC;
334	else if (CC != MemOpCC)
335	// Can't handle mixed conditions with memory operands.
336	SkipGroup = true;
337	}
338	// Check if we were relying on zero-extending behavior of the CMOV.
339	if (!SkipGroup &&
340	llvm::any_of(
341	Range: MRI->use_nodbg_instructions(Reg: I.defs().begin()->getReg()),
342	P: [&](MachineInstr &UseI) {
343	return UseI.getOpcode() == X86::SUBREG_TO_REG;
344	}))
345	// FIXME: We should model the cost of using an explicit MOV to handle
346	// the zero-extension rather than just refusing to handle this.
347	SkipGroup = true;
348	continue;
349	}
350	// If Group is empty, keep looking for first CMOV in the range.
351	if (Group.empty())
352	continue;
353
354	// We found a non X86::CMOVrr instruction.
355	FoundNonCMOVInst = true;
356	// Check if this instruction define EFLAGS, to determine end of processed
357	// range, as there would be no more instructions using current EFLAGS def.
358	if (I.definesRegister(X86::Reg: EFLAGS, /TRI=/nullptr)) {
359	// Check if current processed CMOV-group should not be skipped and add
360	// it as a CMOV-group-candidate.
361	if (!SkipGroup)
362	CmovInstGroups.push_back(Elt: Group);
363	else
364	++NumOfSkippedCmovGroups;
365	Group.clear();
366	}
367	}
368	// End of basic block is considered end of range, check if current processed
369	// CMOV-group should not be skipped and add it as a CMOV-group-candidate.
370	if (Group.empty())
371	continue;
372	if (!SkipGroup)
373	CmovInstGroups.push_back(Elt: Group);
374	else
375	++NumOfSkippedCmovGroups;
376	}
377
378	NumOfCmovGroupCandidate += CmovInstGroups.size();
379	return !CmovInstGroups.empty();
380	}
381
382	/// \returns Depth of CMOV instruction as if it was converted into branch.
383	/// \param TrueOpDepth depth cost of CMOV true value operand.
384	/// \param FalseOpDepth depth cost of CMOV false value operand.
385	static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
386	// The depth of the result after branch conversion is
387	// TrueOpDepth TrueOpProbability + FalseOpDepth * FalseOpProbability.*
388	// As we have no info about branch weight, we assume 75% for one and 25% for
389	// the other, and pick the result with the largest resulting depth.
390	return std::max(
391	a: divideCeil(Numerator: TrueOpDepth * `3` + FalseOpDepth, Denominator: `4`),
392	b: divideCeil(Numerator: FalseOpDepth * `3` + TrueOpDepth, Denominator: `4`));
393	}
394
395	bool X86CmovConverterPass::checkForProfitableCmovCandidates(
396	ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
397	struct DepthInfo {
398	/// Depth of original loop.
399	unsigned Depth;
400	/// Depth of optimized loop.
401	unsigned OptDepth;
402	};
403	/// Number of loop iterations to calculate depth for ?!
404	static const unsigned LoopIterations = `2`;
405	DenseMap<MachineInstr *, DepthInfo> DepthMap;
406	DepthInfo LoopDepth[LoopIterations] = {{.Depth: `0`, .OptDepth: `0`}, {.Depth: `0`, .OptDepth: `0`}};
407	enum { PhyRegType = `0`, VirRegType = `1`, RegTypeNum = `2` };
408	/// For each register type maps the register to its last def instruction.
409	DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
410	/// Maps register operand to its def instruction, which can be nullptr if it
411	/// is unknown (e.g., operand is defined outside the loop).
412	DenseMap<MachineOperand , MachineInstr > OperandToDefMap;
413
414	// Set depth of unknown instruction (i.e., nullptr) to zero.
415	DepthMap [nullptr] = {.Depth: `0`, .OptDepth: `0`};
416
417	SmallPtrSet<MachineInstr *, `4`> CmovInstructions;
418	for (auto &Group : CmovInstGroups)
419	CmovInstructions.insert(I: Group.begin(), E: Group.end());
420
421	//===--------------------------------------------------------------------===//
422	// Step 1: Calculate instruction depth and loop depth.
423	// Optimized-Loop:
424	// loop with CMOV-group-candidates converted into branches.
425	//
426	// Instruction-Depth:
427	// instruction latency + max operand depth.
428	// For CMOV instruction in optimized loop the depth is calculated as:*
429	// CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-depth)
430	// TODO: Find a better way to estimate the latency of the branch instruction
431	// rather than using the CMOV latency.
432	//
433	// Loop-Depth:
434	// max instruction depth of all instructions in the loop.
435	// Note: instruction with max depth represents the critical-path in the loop.
436	//
437	// Loop-Depth[i]:
438	// Loop-Depth calculated for first `i` iterations.
439	// Note: it is enough to calculate depth for up to two iterations.
440	//
441	// Depth-Diff[i]:
442	// Number of cycles saved in first 'i` iterations by optimizing the loop.
443	//===--------------------------------------------------------------------===//
444	for (DepthInfo &MaxDepth : LoopDepth) {
445	for (auto *MBB : Blocks) {
446	// Clear physical registers Def map.
447	RegDefMaps[PhyRegType].clear();
448	for (MachineInstr &MI : *MBB) {
449	// Skip debug instructions.
450	if (MI.isDebugInstr())
451	continue;
452	unsigned MIDepth = `0`;
453	unsigned MIDepthOpt = `0`;
454	bool IsCMOV = CmovInstructions.count(Ptr: &MI);
455	for (auto &MO : MI.uses()) {
456	// Checks for "isUse()" as "uses()" returns also implicit definitions.
457	if (!MO.isReg() \|\| !MO.isUse())
458	continue;
459	Register Reg = MO.getReg();
460	auto &RDM = RegDefMaps[Reg.isVirtual()];
461	if (MachineInstr *DefMI = RDM.lookup(Val: Reg)) {
462	OperandToDefMap [&MO] = DefMI;
463	DepthInfo Info = DepthMap.lookup(Val: DefMI);
464	MIDepth = std::max(a: MIDepth, b: Info.Depth);
465	if (!IsCMOV)
466	MIDepthOpt = std::max(a: MIDepthOpt, b: Info.OptDepth);
467	}
468	}
469
470	if (IsCMOV)
471	MIDepthOpt = getDepthOfOptCmov(
472	TrueOpDepth: DepthMap [OperandToDefMap.lookup(Val: &MI.getOperand(i: `1`))].OptDepth,
473	FalseOpDepth: DepthMap [OperandToDefMap.lookup(Val: &MI.getOperand(i: `2`))].OptDepth);
474
475	// Iterates over all operands to handle implicit definitions as well.
476	for (auto &MO : MI.operands()) {
477	if (!MO.isReg() \|\| !MO.isDef())
478	continue;
479	Register Reg = MO.getReg();
480	RegDefMaps[Reg.isVirtual()][Reg] = &MI;
481	}
482
483	unsigned Latency = TSchedModel.computeInstrLatency(MI: &MI);
484	DepthMap [&MI] = {.Depth: MIDepth += Latency, .OptDepth: MIDepthOpt += Latency};
485	MaxDepth.Depth = std::max(a: MaxDepth.Depth, b: MIDepth);
486	MaxDepth.OptDepth = std::max(a: MaxDepth.OptDepth, b: MIDepthOpt);
487	}
488	}
489	}
490
491	unsigned Diff[LoopIterations] = {LoopDepth[`0`].Depth - LoopDepth[`0`].OptDepth,
492	LoopDepth[`1`].Depth - LoopDepth[`1`].OptDepth};
493
494	//===--------------------------------------------------------------------===//
495	// Step 2: Check if Loop worth to be optimized.
496	// Worth-Optimize-Loop:
497	// case 1: Diff[1] == Diff[0]
498	// Critical-path is iteration independent - there is no dependency
499	// of critical-path instructions on critical-path instructions of
500	// previous iteration.
501	// Thus, it is enough to check gain percent of 1st iteration -
502	// To be conservative, the optimized loop need to have a depth of
503	// 12.5% cycles less than original loop, per iteration.
504	//
505	// case 2: Diff[1] > Diff[0]
506	// Critical-path is iteration dependent - there is dependency of
507	// critical-path instructions on critical-path instructions of
508	// previous iteration.
509	// Thus, check the gain percent of the 2nd iteration (similar to the
510	// previous case), but it is also required to check the gradient of
511	// the gain - the change in Depth-Diff compared to the change in
512	// Loop-Depth between 1st and 2nd iterations.
513	// To be conservative, the gradient need to be at least 50%.
514	//
515	// In addition, In order not to optimize loops with very small gain, the
516	// gain (in cycles) after 2nd iteration should not be less than a given
517	// threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
518	//
519	// If loop is not worth optimizing, remove all CMOV-group-candidates.
520	//===--------------------------------------------------------------------===//
521	if (Diff[`1`] < GainCycleThreshold)
522	return false;
523
524	bool WorthOptLoop = false;
525	if (Diff[`1`] == Diff[`0`])
526	WorthOptLoop = Diff[`0`] * `8` >= LoopDepth[`0`].Depth;
527	else if (Diff[`1`] > Diff[`0`])
528	WorthOptLoop =
529	(Diff[`1`] - Diff[`0`]) * `2` >= (LoopDepth[`1`].Depth - LoopDepth[`0`].Depth) &&
530	(Diff[`1`] * `8` >= LoopDepth[`1`].Depth);
531
532	if (!WorthOptLoop)
533	return false;
534
535	++NumOfLoopCandidate;
536
537	//===--------------------------------------------------------------------===//
538	// Step 3: Check for each CMOV-group-candidate if it worth to be optimized.
539	// Worth-Optimize-Group:
540	// Iff it is worth to optimize all CMOV instructions in the group.
541	//
542	// Worth-Optimize-CMOV:
543	// Predicted branch is faster than CMOV by the difference between depth of
544	// condition operand and depth of taken (predicted) value operand.
545	// To be conservative, the gain of such CMOV transformation should cover at
546	// at least 25% of branch-misprediction-penalty.
547	//===--------------------------------------------------------------------===//
548	unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
549	CmovGroups TempGroups;
550	std::swap(LHS&: TempGroups, RHS&: CmovInstGroups);
551	for (auto &Group : TempGroups) {
552	bool WorthOpGroup = true;
553	for (auto *MI : Group) {
554	// Avoid CMOV instruction which value is used as a pointer to load from.
555	// This is another conservative check to avoid converting CMOV instruction
556	// used with tree-search like algorithm, where the branch is unpredicted.
557	auto UIs = MRI->use_instructions(Reg: MI->defs().begin()->getReg());
558	if (!UIs.empty() && ++UIs.begin() == UIs.end()) {
559	unsigned Op = UIs.begin()->getOpcode();
560	if (Op == X86::MOV64rm \|\| Op == X86::MOV32rm) {
561	WorthOpGroup = false;
562	break;
563	}
564	}
565
566	unsigned CondCost =
567	DepthMap [OperandToDefMap.lookup(Val: &MI->getOperand(i: `4`))].Depth;
568	unsigned ValCost = getDepthOfOptCmov(
569	TrueOpDepth: DepthMap [OperandToDefMap.lookup(Val: &MI->getOperand(i: `1`))].Depth,
570	FalseOpDepth: DepthMap [OperandToDefMap.lookup(Val: &MI->getOperand(i: `2`))].Depth);
571	if (ValCost > CondCost \|\| (CondCost - ValCost) * `4` < MispredictPenalty) {
572	WorthOpGroup = false;
573	break;
574	}
575	}
576
577	if (WorthOpGroup)
578	CmovInstGroups.push_back(Elt: Group);
579	}
580
581	return !CmovInstGroups.empty();
582	}
583
584	static bool checkEFLAGSLive(MachineInstr *MI) {
585	if (MI->killsRegister(X86::Reg: EFLAGS, /TRI=/nullptr))
586	return false;
587
588	// The EFLAGS operand of MI might be missing a kill marker.
589	// Figure out whether EFLAGS operand should LIVE after MI instruction.
590	MachineBasicBlock *BB = MI->getParent();
591	MachineBasicBlock::iterator ItrMI = MI;
592
593	// Scan forward through BB for a use/def of EFLAGS.
594	for (auto I = std::next(x: ItrMI), E = BB->end(); I != E; ++I) {
595	if (I ->readsRegister(X86::Reg: EFLAGS, /TRI=/nullptr))
596	return true;
597	if (I ->definesRegister(X86::Reg: EFLAGS, /TRI=/nullptr))
598	return false;
599	}
600
601	// We hit the end of the block, check whether EFLAGS is live into a successor.
602	for (MachineBasicBlock *Succ : BB->successors())
603	if (Succ->isLiveIn(X86::Reg: EFLAGS))
604	return true;
605
606	return false;
607	}
608
609	/// Given /p First CMOV instruction and /p Last CMOV instruction representing a
610	/// group of CMOV instructions, which may contain debug instructions in between,
611	/// move all debug instructions to after the last CMOV instruction, making the
612	/// CMOV group consecutive.
613	static void packCmovGroup(MachineInstr First, MachineInstr Last) {
614	assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
615	"Last instruction in a CMOV group must be a CMOV instruction");
616
617	SmallVector<MachineInstr *, `2`> DBGInstructions;
618	for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I ++) {
619	if (I ->isDebugInstr())
620	DBGInstructions.push_back(Elt: &*I);
621	}
622
623	// Splice the debug instruction after the cmov group.
624	MachineBasicBlock *MBB = First->getParent();
625	for (auto *MI : DBGInstructions)
626	MBB->insertAfter(I: Last, MI: MI->removeFromParent());
627	}
628
629	void X86CmovConverterPass::convertCmovInstsToBranches(
630	SmallVectorImpl<MachineInstr > &Group) const* {
631	assert(!Group.empty() && "No CMOV instructions to convert");
632	++NumOfOptimizedCmovGroups;
633
634	// If the CMOV group is not packed, e.g., there are debug instructions between
635	// first CMOV and last CMOV, then pack the group and make the CMOV instruction
636	// consecutive by moving the debug instructions to after the last CMOV.
637	packCmovGroup(First: Group.front(), Last: Group.back());
638
639	// To convert a CMOVcc instruction, we actually have to insert the diamond
640	// control-flow pattern. The incoming instruction knows the destination vreg
641	// to set, the condition code register to branch on, the true/false values to
642	// select between, and a branch opcode to use.
643
644	// Before
645	// -----
646	// MBB:
647	// cond = cmp ...
648	// v1 = CMOVge t1, f1, cond
649	// v2 = CMOVlt t2, f2, cond
650	// v3 = CMOVge v1, f3, cond
651	//
652	// After
653	// -----
654	// MBB:
655	// cond = cmp ...
656	// jge %SinkMBB
657	//
658	// FalseMBB:
659	// jmp %SinkMBB
660	//
661	// SinkMBB:
662	// %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
663	// %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
664	// ; true-value with false-value
665	// %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
666	// ; previous Phi instruction result
667
668	MachineInstr &MI = *Group.front();
669	MachineInstr *LastCMOV = Group.back();
670	DebugLoc DL = MI.getDebugLoc();
671
672	X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
673	X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
674	// Potentially swap the condition codes so that any memory operand to a CMOV
675	// is in the false* position instead of the true position. We can invert*
676	// any non-memory operand CMOV instructions to cope with this and we ensure
677	// memory operand CMOVs are only included with a single condition code.
678	if (llvm::any_of(Range&: Group, P: [&](MachineInstr *I) {
679	return I->mayLoad() && X86::getCondFromCMov(MI: *I) == CC;
680	}))
681	std::swap(a&: CC, b&: OppCC);
682
683	MachineBasicBlock *MBB = MI.getParent();
684	MachineFunction::iterator It = ++MBB->getIterator();
685	MachineFunction *F = MBB->getParent();
686	const BasicBlock *BB = MBB->getBasicBlock();
687
688	MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
689	MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
690	F->insert(MBBI: It, MBB: FalseMBB);
691	F->insert(MBBI: It, MBB: SinkMBB);
692
693	// If the EFLAGS register isn't dead in the terminator, then claim that it's
694	// live into the sink and copy blocks.
695	if (checkEFLAGSLive(MI: LastCMOV)) {
696	FalseMBB->addLiveIn(X86::EFLAGS);
697	SinkMBB->addLiveIn(X86::EFLAGS);
698	}
699
700	// Transfer the remainder of BB and its successor edges to SinkMBB.
701	SinkMBB->splice(Where: SinkMBB->begin(), Other: MBB,
702	From: std::next(x: MachineBasicBlock::iterator (LastCMOV)), To: MBB->end());
703	SinkMBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
704
705	// Add the false and sink blocks as its successors.
706	MBB->addSuccessor(Succ: FalseMBB);
707	MBB->addSuccessor(Succ: SinkMBB);
708
709	// Create the conditional branch instruction.
710	BuildMI(MBB, DL, TII->get(X86::Opcode: JCC_1)).addMBB(SinkMBB).addImm(CC);
711
712	// Add the sink block to the false block successors.
713	FalseMBB->addSuccessor(Succ: SinkMBB);
714
715	MachineInstrBuilder MIB;
716	MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator (MI);
717	MachineBasicBlock::iterator MIItEnd =
718	std::next(x: MachineBasicBlock::iterator (LastCMOV));
719	MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
720	MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
721
722	// First we need to insert an explicit load on the false path for any memory
723	// operand. We also need to potentially do register rewriting here, but it is
724	// simpler as the memory operands are always on the false path so we can
725	// simply take that input, whatever it is.
726	DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
727	for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
728	auto &MI = *MIIt ++;
729	// Skip any CMOVs in this group which don't load from memory.
730	if (!MI.mayLoad()) {
731	// Remember the false-side register input.
732	Register FalseReg =
733	MI.getOperand(i: X86::getCondFromCMov(MI) == CC ? `1` : `2`).getReg();
734	// Walk back through any intermediate cmovs referenced.
735	while (true) {
736	auto FRIt = FalseBBRegRewriteTable.find(Val: FalseReg);
737	if (FRIt == FalseBBRegRewriteTable.end())
738	break;
739	FalseReg = FRIt ->second;
740	}
741	FalseBBRegRewriteTable [MI.getOperand(i: `0`).getReg()] = FalseReg;
742	continue;
743	}
744
745	// The condition must be the opposite* of the one we've decided to branch*
746	// on as the branch will go around* the load and the load should happen*
747	// when the CMOV condition is false.
748	assert(X86::getCondFromCMov(MI) == OppCC &&
749	"Can only handle memory-operand cmov instructions with a condition "
750	"opposite to the selected branch direction.");
751
752	// The goal is to rewrite the cmov from:
753	//
754	// MBB:
755	// %A = CMOVcc %B (tied), (mem)
756	//
757	// to
758	//
759	// MBB:
760	// %A = CMOVcc %B (tied), %C
761	// FalseMBB:
762	// %C = MOV (mem)
763	//
764	// Which will allow the next loop to rewrite the CMOV in terms of a PHI:
765	//
766	// MBB:
767	// JMP!cc SinkMBB
768	// FalseMBB:
769	// %C = MOV (mem)
770	// SinkMBB:
771	// %A = PHI [ %C, FalseMBB ], [ %B, MBB]
772
773	// Get a fresh register to use as the destination of the MOV.
774	const TargetRegisterClass *RC = MRI->getRegClass(Reg: MI.getOperand(i: `0`).getReg());
775	Register TmpReg = MRI->createVirtualRegister(RegClass: RC);
776
777	// Retain debug instr number when unfolded.
778	unsigned OldDebugInstrNum = MI.peekDebugInstrNum();
779	SmallVector<MachineInstr *, `4`> NewMIs;
780	bool Unfolded = TII->unfoldMemoryOperand(MF&: *MBB->getParent(), MI, Reg: TmpReg,
781	/UnfoldLoad/ true,
782	/UnfoldStore/ false, NewMIs);
783	(void)Unfolded;
784	assert(Unfolded && "Should never fail to unfold a loading cmov!");
785
786	// Move the new CMOV to just before the old one and reset any impacted
787	// iterator.
788	auto *NewCMOV = NewMIs.pop_back_val();
789	assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
790	"Last new instruction isn't the expected CMOV!");
791	LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
792	MBB->insert(I: MachineBasicBlock::iterator (MI), MI: NewCMOV);
793	if (&*MIItBegin == &MI)
794	MIItBegin = MachineBasicBlock::iterator (NewCMOV);
795
796	if (OldDebugInstrNum)
797	NewCMOV->setDebugInstrNum(OldDebugInstrNum);
798
799	// Sink whatever instructions were needed to produce the unfolded operand
800	// into the false block.
801	for (auto *NewMI : NewMIs) {
802	LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
803	FalseMBB->insert(I: FalseInsertionPoint, MI: NewMI);
804	// Re-map any operands that are from other cmovs to the inputs for this block.
805	for (auto &MOp : NewMI->uses()) {
806	if (!MOp.isReg())
807	continue;
808	auto It = FalseBBRegRewriteTable.find(Val: MOp.getReg());
809	if (It == FalseBBRegRewriteTable.end())
810	continue;
811
812	MOp.setReg(It ->second);
813	// This might have been a kill when it referenced the cmov result, but
814	// it won't necessarily be once rewritten.
815	// FIXME: We could potentially improve this by tracking whether the
816	// operand to the cmov was also a kill, and then skipping the PHI node
817	// construction below.
818	MOp.setIsKill(false);
819	}
820	}
821	MBB->erase(I: &MI);
822
823	// Add this PHI to the rewrite table.
824	FalseBBRegRewriteTable [NewCMOV->getOperand(i: `0`).getReg()] = TmpReg;
825	}
826
827	// As we are creating the PHIs, we have to be careful if there is more than
828	// one. Later CMOVs may reference the results of earlier CMOVs, but later
829	// PHIs have to reference the individual true/false inputs from earlier PHIs.
830	// That also means that PHI construction must work forward from earlier to
831	// later, and that the code must maintain a mapping from earlier PHI's
832	// destination registers, and the registers that went into the PHI.
833	DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
834
835	for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
836	Register DestReg = MIIt ->getOperand(i: `0`).getReg();
837	Register Op1Reg = MIIt ->getOperand(i: `1`).getReg();
838	Register Op2Reg = MIIt ->getOperand(i: `2`).getReg();
839
840	// If this CMOV we are processing is the opposite condition from the jump we
841	// generated, then we have to swap the operands for the PHI that is going to
842	// be generated.
843	if (X86::getCondFromCMov(MI: *MIIt) == OppCC)
844	std::swap(a&: Op1Reg, b&: Op2Reg);
845
846	auto Op1Itr = RegRewriteTable.find(Val: Op1Reg);
847	if (Op1Itr != RegRewriteTable.end())
848	Op1Reg = Op1Itr ->second.first;
849
850	auto Op2Itr = RegRewriteTable.find(Val: Op2Reg);
851	if (Op2Itr != RegRewriteTable.end())
852	Op2Reg = Op2Itr ->second.second;
853
854	// SinkMBB:
855	// %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
856	// ...
857	MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::Opcode: PHI), DestReg)
858	.addReg(Op1Reg)
859	.addMBB(FalseMBB)
860	.addReg(Op2Reg)
861	.addMBB(MBB);
862	(void)MIB;
863	LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt ->dump());
864	LLVM_DEBUG(dbgs() << "\tTo: "; MIB ->dump());
865
866	// debug-info: we can just copy the instr-ref number from one instruction
867	// to the other, seeing how it's a one-for-one substitution.
868	if (unsigned InstrNum = MIIt ->peekDebugInstrNum())
869	MIB ->setDebugInstrNum(InstrNum);
870
871	// Add this PHI to the rewrite table.
872	RegRewriteTable [DestReg] = std::make_pair(x&: Op1Reg, y&: Op2Reg);
873	}
874
875	// Reset the NoPHIs property if a PHI was inserted to prevent a conflict with
876	// the MachineVerifier during testing.
877	if (MIItBegin != MIItEnd)
878	F->getProperties().reset(P: MachineFunctionProperties::Property::NoPHIs);
879
880	// Now remove the CMOV(s).
881	MBB->erase(I: MIItBegin, E: MIItEnd);
882
883	// Add new basic blocks to MachineLoopInfo.
884	if (MachineLoop *L = MLI->getLoopFor(BB: MBB)) {
885	L->addBasicBlockToLoop(NewBB: FalseMBB, LI&: MLI->getBase());
886	L->addBasicBlockToLoop(NewBB: SinkMBB, LI&: MLI->getBase());
887	}
888	}
889
890	INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
891	false, false)
892	INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
893	INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
894	false, false)
895
896	FunctionPass *llvm::createX86CmovConverterPass() {
897	return new X86CmovConverterPass ();
898	}
899

source code of llvm/lib/Target/X86/X86CmovConversion.cpp