SplitFunctions.cpp source code [bolt/lib/Passes/SplitFunctions.cpp]

1	//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements the SplitFunctions pass.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "bolt/Passes/SplitFunctions.h"
14	#include "bolt/Core/BinaryBasicBlock.h"
15	#include "bolt/Core/BinaryFunction.h"
16	#include "bolt/Core/FunctionLayout.h"
17	#include "bolt/Core/ParallelUtilities.h"
18	#include "bolt/Utils/CommandLineOpts.h"
19	#include "llvm/ADT/STLExtras.h"
20	#include "llvm/ADT/SmallVector.h"
21	#include "llvm/ADT/iterator_range.h"
22	#include "llvm/Support/CommandLine.h"
23	#include "llvm/Support/FormatVariadic.h"
24	#include <algorithm>
25	#include <iterator>
26	#include <memory>
27	#include <numeric>
28	#include <random>
29	#include <vector>
30
31	#define DEBUG_TYPE "bolt-opts"
32
33	using namespace llvm;
34	using namespace bolt;
35
36	namespace {
37	class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
38	public:
39	explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
40	: cl::parser<bool>(O) {}
41
42	bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
43	if (Arg == "2" \|\| Arg == "3") {
44	Value = true;
45	errs() << formatv(Fmt: "BOLT-WARNING: specifying non-boolean value \"{0}\" "
46	"for option -{1} is deprecated\n",
47	Vals&: Arg, Vals&: ArgName);
48	return false;
49	}
50	return cl::parser<bool>::parse(O, ArgName, Arg, Val&: Value);
51	}
52	};
53	} // namespace
54
55	namespace opts {
56
57	extern cl::OptionCategory BoltOptCategory;
58
59	extern cl::opt<bool> SplitEH;
60	extern cl::opt<unsigned> ExecutionCountThreshold;
61	extern cl::opt<uint32_t> RandomSeed;
62
63	static cl::opt<bool> AggressiveSplitting(
64	"split-all-cold", cl::desc ("outline as many cold basic blocks as possible"),
65	cl::cat (BoltOptCategory));
66
67	static cl::opt<unsigned> SplitAlignThreshold(
68	"split-align-threshold",
69	cl::desc ("when deciding to split a function, apply this alignment "
70	"while doing the size comparison (see -split-threshold). "
71	"Default value: 2."),
72	cl::init(Val: `2`),
73
74	cl::Hidden, cl::cat (BoltOptCategory));
75
76	static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
77	SplitFunctions("split-functions",
78	cl::desc ("split functions into fragments"),
79	cl::cat (BoltOptCategory));
80
81	static cl::opt<unsigned> SplitThreshold(
82	"split-threshold",
83	cl::desc ("split function only if its main size is reduced by more than "
84	"given amount of bytes. Default value: 0, i.e. split iff the "
85	"size is reduced. Note that on some architectures the size can "
86	"increase after splitting."),
87	cl::init(Val: `0`), cl::Hidden, cl::cat (BoltOptCategory));
88
89	static cl::opt<SplitFunctionsStrategy> SplitStrategy(
90	"split-strategy", cl::init(Val: SplitFunctionsStrategy::Profile2),
91	cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
92	"split each function into a hot and cold fragment "
93	"using profiling information")),
94	cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
95	"split each function into a hot, warm, and cold "
96	"fragment using profiling information")),
97	cl::values(clEnumValN(
98	SplitFunctionsStrategy::Random2, "random2",
99	"split each function into a hot and cold fragment at a randomly chosen "
100	"split point (ignoring any available profiling information)")),
101	cl::values(clEnumValN(
102	SplitFunctionsStrategy::RandomN, "randomN",
103	"split each function into N fragments at a randomly chosen split "
104	"points (ignoring any available profiling information)")),
105	cl::values(clEnumValN(
106	SplitFunctionsStrategy::All, "all",
107	"split all basic blocks of each function into fragments such that each "
108	"fragment contains exactly a single basic block")),
109	cl::desc ("strategy used to partition blocks into fragments"),
110	cl::cat (BoltOptCategory));
111
112	static cl::opt<double> CallScale(
113	"call-scale",
114	cl::desc ("Call score scale coefficient (when --split-strategy=cdsplit)"),
115	cl::init(Val: `0.95`), cl::ReallyHidden, cl::cat (BoltOptCategory));
116
117	static cl::opt<double>
118	CallPower("call-power",
119	cl::desc ("Call score power (when --split-strategy=cdsplit)"),
120	cl::init(Val: `0.05`), cl::ReallyHidden, cl::cat (BoltOptCategory));
121
122	static cl::opt<double>
123	JumpPower("jump-power",
124	cl::desc ("Jump score power (when --split-strategy=cdsplit)"),
125	cl::init(Val: `0.15`), cl::ReallyHidden, cl::cat (BoltOptCategory));
126	} // namespace opts
127
128	namespace {
129	bool hasFullProfile(const BinaryFunction &BF) {
130	return llvm::all_of(Range: BF.blocks(), P: [](const BinaryBasicBlock &BB) {
131	return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
132	});
133	}
134
135	bool allBlocksCold(const BinaryFunction &BF) {
136	return llvm::all_of(Range: BF.blocks(), P: [](const BinaryBasicBlock &BB) {
137	return BB.getExecutionCount() == `0`;
138	});
139	}
140
141	struct SplitProfile2 final : public SplitStrategy {
142	bool canSplit(const BinaryFunction &BF) override {
143	return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
144	}
145
146	bool compactFragments() override { return true; }
147
148	void fragment(const BlockIt Start, const BlockIt End) override {
149	for (BinaryBasicBlock *const BB : llvm::make_range(x: Start, y: End)) {
150	if (BB->getExecutionCount() == `0`)
151	BB->setFragmentNum(FragmentNum::cold());
152	}
153	}
154	};
155
156	struct SplitCacheDirected final : public SplitStrategy {
157	BinaryContext &BC;
158	using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
159
160	bool canSplit(const BinaryFunction &BF) override {
161	return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
162	}
163
164	explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
165	initializeAuxiliaryVariables();
166	buildCallGraph();
167	}
168
169	// When some functions are hot-warm split and others are hot-warm-cold split,
170	// we do not want to change the fragment numbers of the blocks in the hot-warm
171	// split functions.
172	bool compactFragments() override { return false; }
173
174	void fragment(const BlockIt Start, const BlockIt End) override {
175	BasicBlockOrder BlockOrder(Start, End);
176	BinaryFunction &BF = *BlockOrder.front()->getFunction();
177	// No need to re-split small functions.
178	if (BlockOrder.size() <= `2`)
179	return;
180
181	size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
182	assert(BestSplitIndex < BlockOrder.size());
183
184	// Assign fragments based on the computed best split index.
185	// All basic blocks with index up to the best split index become hot.
186	// All remaining blocks are warm / cold depending on if count is
187	// greater than zero or not.
188	for (size_t Index = `0`; Index < BlockOrder.size(); Index++) {
189	BinaryBasicBlock *BB = BlockOrder [Index];
190	if (Index <= BestSplitIndex)
191	BB->setFragmentNum(FragmentNum::main());
192	else
193	BB->setFragmentNum(BB->getKnownExecutionCount() > `0`
194	? FragmentNum::warm()
195	: FragmentNum::cold());
196	}
197	}
198
199	private:
200	struct CallInfo {
201	size_t Length;
202	size_t Count;
203	};
204
205	struct SplitScore {
206	size_t SplitIndex = size_t(-`1`);
207	size_t HotSizeReduction = `0`;
208	double LocalScore = `0`;
209	double CoverCallScore = `0`;
210
211	double sum() const { return LocalScore + CoverCallScore; }
212	};
213
214	// Auxiliary variables used by the algorithm.
215	size_t TotalNumBlocks{`0`};
216	size_t OrigHotSectionSize{`0`};
217	DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
218	DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
219	DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;
220
221	// Call graph.
222	std::vector<SmallVector<const BinaryBasicBlock *, `0`>> Callers;
223	std::vector<SmallVector<const BinaryBasicBlock *, `0`>> Callees;
224
225	bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
226	// Only a subset of the functions in the binary will be considered
227	// for initializing auxiliary variables and building call graph.
228	return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
229	}
230
231	void initializeAuxiliaryVariables() {
232	for (BinaryFunction *BF : BC.getSortedFunctions()) {
233	if (!shouldConsiderForCallGraph(BF: *BF))
234	continue;
235
236	// Calculate the size of each BB after hot-cold splitting.
237	// This populates BinaryBasicBlock::OutputAddressRange which
238	// can be used to compute the size of each BB.
239	BC.calculateEmittedSize(BF&: BF, /FixBranches=/*true);
240
241	for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
242	// Unique global index.
243	GlobalIndices [BB] = TotalNumBlocks;
244	TotalNumBlocks++;
245
246	// Block size after hot-cold splitting.
247	BBSizes [BB] = BB->getOutputSize();
248
249	// Hot block offset after hot-cold splitting.
250	BBOffsets [BB] = OrigHotSectionSize;
251	if (!BB->isSplit())
252	OrigHotSectionSize += BBSizes [BB];
253	}
254	}
255	}
256
257	void buildCallGraph() {
258	Callers.resize(new_size: TotalNumBlocks);
259	Callees.resize(new_size: TotalNumBlocks);
260	for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
261	if (!shouldConsiderForCallGraph(BF: *SrcFunction))
262	continue;
263
264	for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
265	// Skip blocks that are not executed
266	if (SrcBB.getKnownExecutionCount() == `0`)
267	continue;
268
269	// Find call instructions and extract target symbols from each one
270	for (const MCInst &Inst : SrcBB) {
271	if (!BC.MIB ->isCall(Inst))
272	continue;
273
274	// Call info
275	const MCSymbol *DstSym = BC.MIB ->getTargetSymbol(Inst);
276	// Ignore calls w/o information
277	if (!DstSym)
278	continue;
279
280	const BinaryFunction *DstFunction = BC.getFunctionForSymbol(Symbol: DstSym);
281	// Ignore calls that do not have a valid target, but do not ignore
282	// recursive calls, because caller block could be moved to warm.
283	if (!DstFunction \|\| DstFunction->getLayout().block_empty())
284	continue;
285
286	const BinaryBasicBlock *DstBB = &(DstFunction->front());
287
288	// Record the call only if DstBB is also in functions to consider for
289	// call graph.
290	if (GlobalIndices.contains(Val: DstBB)) {
291	Callers [GlobalIndices [DstBB]].push_back(Elt: &SrcBB);
292	Callees [GlobalIndices [&SrcBB]].push_back(Elt: DstBB);
293	}
294	}
295	}
296	}
297	}
298
299	/// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
300	/// start and end addresses for hot and warm basic blocks, assuming hot-warm
301	/// splitting happens at \p SplitIndex. Also return estimated end addresses
302	/// of the hot fragment before and after splitting.
303	/// The estimations take into account the potential addition of branch
304	/// instructions due to split fall through branches as well as the need to
305	/// use longer branch instructions for split (un)conditional branches.
306	std::pair<size_t, size_t>
307	estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
308	const size_t SplitIndex) {
309	assert(SplitIndex < BlockOrder.size() && "Invalid split index");
310
311	// Update function layout assuming hot-warm splitting at SplitIndex.
312	for (size_t Index = `0`; Index < BlockOrder.size(); Index++) {
313	BinaryBasicBlock *BB = BlockOrder [Index];
314	if (BB->getFragmentNum() == FragmentNum::cold())
315	break;
316	BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
317	: FragmentNum::warm());
318	}
319	BinaryFunction *BF = BlockOrder [`0`]->getFunction();
320	BF->getLayout().update(NewLayout: BlockOrder);
321	// Populate BB.OutputAddressRange under the updated layout.
322	BC.calculateEmittedSize(BF&: *BF);
323
324	// Populate BB.OutputAddressRange with estimated new start and end addresses
325	// and compute the old end address of the hot section and the new end
326	// address of the hot section.
327	size_t OldHotEndAddr{`0`};
328	size_t NewHotEndAddr{`0`};
329	size_t CurrentAddr = BBOffsets [BlockOrder [`0`]];
330	for (BinaryBasicBlock *BB : BlockOrder) {
331	// We only care about new addresses of blocks in hot/warm.
332	if (BB->getFragmentNum() == FragmentNum::cold())
333	break;
334	const size_t NewSize = BB->getOutputSize();
335	BB->setOutputStartAddress(CurrentAddr);
336	CurrentAddr += NewSize;
337	BB->setOutputEndAddress(CurrentAddr);
338	if (BB->getLayoutIndex() == SplitIndex) {
339	NewHotEndAddr = CurrentAddr;
340	// Approximate the start address of the warm fragment of the current
341	// function using the original hot section size.
342	CurrentAddr = OrigHotSectionSize;
343	}
344	OldHotEndAddr = BBOffsets [BB] + BBSizes [BB];
345	}
346	return std::make_pair(x&: OldHotEndAddr, y&: NewHotEndAddr);
347	}
348
349	/// Get a collection of "shortenable" calls, that is, calls of type X->Y
350	/// when the function order is [... X ... BF ... Y ...].
351	/// If the hot fragment size of BF is reduced, then such calls are guaranteed
352	/// to get shorter by the reduced hot fragment size.
353	std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
354	// Record the length and the count of the calls that can be shortened
355	std::vector<CallInfo> CoverCalls;
356	if (opts::CallScale == `0`)
357	return CoverCalls;
358
359	const BinaryFunction *ThisBF = &BF;
360	const BinaryBasicBlock *ThisBB = &(ThisBF->front());
361	const size_t ThisGI = GlobalIndices [ThisBB];
362
363	for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
364	if (!shouldConsiderForCallGraph(BF: *DstBF))
365	continue;
366
367	const BinaryBasicBlock *DstBB = &(DstBF->front());
368	if (DstBB->getKnownExecutionCount() == `0`)
369	continue;
370
371	const size_t DstGI = GlobalIndices [DstBB];
372	for (const BinaryBasicBlock *SrcBB : Callers [DstGI]) {
373	const BinaryFunction *SrcBF = SrcBB->getFunction();
374	if (ThisBF == SrcBF)
375	continue;
376
377	const size_t CallCount = SrcBB->getKnownExecutionCount();
378
379	const size_t SrcGI = GlobalIndices [SrcBB];
380
381	const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) \|\|
382	(DstGI <= ThisGI && ThisGI < SrcGI);
383	if (!IsCoverCall)
384	continue;
385
386	const size_t SrcBBEndAddr = BBOffsets [SrcBB] + BBSizes [SrcBB];
387	const size_t DstBBStartAddr = BBOffsets [DstBB];
388	const size_t CallLength =
389	AbsoluteDifference(X: SrcBBEndAddr, Y: DstBBStartAddr);
390	const CallInfo CI{.Length: CallLength, .Count: CallCount};
391	CoverCalls.emplace_back(args: CI);
392	}
393	}
394	return CoverCalls;
395	}
396
397	/// Compute the edge score of a call edge.
398	double computeCallScore(uint64_t CallCount, size_t CallLength) {
399	// Increase call lengths by 1 to avoid raising 0 to a negative power.
400	return opts::CallScale * static_cast<double>(CallCount) /
401	std::pow(x: static_cast<double>(CallLength + `1`), y: opts::CallPower);
402	}
403
404	/// Compute the edge score of a jump (branch) edge.
405	double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
406	// Increase jump lengths by 1 to avoid raising 0 to a negative power.
407	return static_cast<double>(JumpCount) /
408	std::pow(x: static_cast<double>(JumpLength + `1`), y: opts::JumpPower);
409	}
410
411	/// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
412	/// Increament Score.LocalScore in place by the sum.
413	void computeJumpScore(const BasicBlockOrder &BlockOrder,
414	const size_t SplitIndex, SplitScore &Score) {
415
416	for (const BinaryBasicBlock *SrcBB : BlockOrder) {
417	if (SrcBB->getKnownExecutionCount() == `0`)
418	continue;
419
420	const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
421
422	for (const auto Pair : zip(t: SrcBB->successors(), u: SrcBB->branch_info())) {
423	const BinaryBasicBlock *DstBB = std::get<`0`>(t: Pair);
424	const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<`1`>(t: Pair);
425	const size_t JumpCount = Branch.Count;
426
427	if (JumpCount == `0`)
428	continue;
429
430	const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
431	const size_t NewJumpLength =
432	AbsoluteDifference(X: SrcBBEndAddr, Y: DstBBStartAddr);
433	Score.LocalScore += computeJumpScore(JumpCount, JumpLength: NewJumpLength);
434	}
435	}
436	}
437
438	/// Compute sum of scores over calls originated in the current function
439	/// given \p SplitIndex. Increament Score.LocalScore in place by the sum.
440	void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
441	const size_t SplitIndex, SplitScore &Score) {
442	if (opts::CallScale == `0`)
443	return;
444
445	// Global index of the last block in the current function.
446	// This is later used to determine whether a call originated in the current
447	// function is to a function that comes after the current function.
448	const size_t LastGlobalIndex = GlobalIndices [BlockOrder.back()];
449
450	// The length of calls originated in the input function can increase /
451	// decrease depending on the splitting decision.
452	for (const BinaryBasicBlock *SrcBB : BlockOrder) {
453	const size_t CallCount = SrcBB->getKnownExecutionCount();
454	// If SrcBB does not call any functions, skip it.
455	if (CallCount == `0`)
456	continue;
457
458	// Obtain an estimate on the end address of the src basic block
459	// after splitting at SplitIndex.
460	const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
461
462	for (const BinaryBasicBlock *DstBB : Callees [GlobalIndices [SrcBB]]) {
463	// Obtain an estimate on the start address of the dst basic block
464	// after splitting at SplitIndex. If DstBB is in a function before
465	// the current function, then its start address remains unchanged.
466	size_t DstBBStartAddr = BBOffsets [DstBB];
467	// If DstBB is in a function after the current function, then its
468	// start address should be adjusted based on the reduction in hot size.
469	if (GlobalIndices [DstBB] > LastGlobalIndex) {
470	assert(DstBBStartAddr >= Score.HotSizeReduction);
471	DstBBStartAddr -= Score.HotSizeReduction;
472	}
473	const size_t NewCallLength =
474	AbsoluteDifference(X: SrcBBEndAddr, Y: DstBBStartAddr);
475	Score.LocalScore += computeCallScore(CallCount, CallLength: NewCallLength);
476	}
477	}
478	}
479
480	/// Compute sum of splitting scores for cover calls of the input function.
481	/// Increament Score.CoverCallScore in place by the sum.
482	void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
483	const size_t SplitIndex,
484	const std::vector<CallInfo> &CoverCalls,
485	SplitScore &Score) {
486	if (opts::CallScale == `0`)
487	return;
488
489	for (const CallInfo CI : CoverCalls) {
490	assert(CI.Length >= Score.HotSizeReduction &&
491	"Length of cover calls must exceed reduced size of hot fragment.");
492	// Compute the new length of the call, which is shorter than the original
493	// one by the size of the splitted fragment minus the total size increase.
494	const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
495	Score.CoverCallScore += computeCallScore(CallCount: CI.Count, CallLength: NewCallLength);
496	}
497	}
498
499	/// Compute the split score of splitting a function at a given index.
500	/// The split score consists of local score and cover score. This function
501	/// returns \p Score of SplitScore type. It contains the local score and
502	/// cover score of the current splitting index. For easier book keeping and
503	/// comparison, it also stores the split index and the resulting reduction
504	/// in hot fragment size.
505	SplitScore computeSplitScore(const BinaryFunction &BF,
506	const BasicBlockOrder &BlockOrder,
507	const size_t SplitIndex,
508	const std::vector<CallInfo> &CoverCalls) {
509	// Populate BinaryBasicBlock::OutputAddressRange with estimated
510	// new start and end addresses after hot-warm splitting at SplitIndex.
511	size_t OldHotEnd;
512	size_t NewHotEnd;
513	std::tie(args&: OldHotEnd, args&: NewHotEnd) =
514	estimatePostSplitBBAddress(BlockOrder, SplitIndex);
515
516	SplitScore Score;
517	Score.SplitIndex = SplitIndex;
518
519	// It's not worth splitting if OldHotEnd < NewHotEnd.
520	if (OldHotEnd < NewHotEnd)
521	return Score;
522
523	// Hot fragment size reduction due to splitting.
524	Score.HotSizeReduction = OldHotEnd - NewHotEnd;
525
526	// First part of LocalScore is the sum over call edges originated in the
527	// input function. These edges can get shorter or longer depending on
528	// SplitIndex. Score.LocalScore is increamented in place.
529	computeLocalCallScore(BlockOrder, SplitIndex, Score);
530
531	// Second part of LocalScore is the sum over jump edges with src basic block
532	// and dst basic block in the current function. Score.LocalScore is
533	// increamented in place.
534	computeJumpScore(BlockOrder, SplitIndex, Score);
535
536	// Compute CoverCallScore and store in Score in place.
537	computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
538	return Score;
539	}
540
541	/// Find the most likely successor of a basic block when it has one or two
542	/// successors. Return nullptr otherwise.
543	const BinaryBasicBlock getMostLikelySuccessor(const* BinaryBasicBlock *BB) {
544	if (BB->succ_size() == `1`)
545	return BB->getSuccessor();
546	if (BB->succ_size() == `2`) {
547	uint64_t TakenCount = BB->getTakenBranchInfo().Count;
548	assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
549	uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
550	assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
551	if (TakenCount > NonTakenCount)
552	return BB->getConditionalSuccessor(Condition: true);
553	else if (TakenCount < NonTakenCount)
554	return BB->getConditionalSuccessor(Condition: false);
555	}
556	return nullptr;
557	}
558
559	/// Find the best index for splitting. The returned value is the index of the
560	/// last hot basic block. Hence, "no splitting" is equivalent to returning the
561	/// value which is one less than the size of the function.
562	size_t findSplitIndex(const BinaryFunction &BF,
563	const BasicBlockOrder &BlockOrder) {
564	assert(BlockOrder.size() > `2`);
565	// Find all function calls that can be shortened if we move blocks of the
566	// current function to warm/cold
567	const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
568
569	// Find the existing hot-cold splitting index.
570	size_t HotColdIndex = `0`;
571	while (HotColdIndex + `1` < BlockOrder.size()) {
572	if (BlockOrder [HotColdIndex + `1`]->getFragmentNum() == FragmentNum::cold())
573	break;
574	HotColdIndex++;
575	}
576	assert(HotColdIndex + `1` == BlockOrder.size() \|\|
577	(BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
578	BlockOrder[HotColdIndex + `1`]->getFragmentNum() ==
579	FragmentNum::cold()));
580
581	// Try all possible split indices up to HotColdIndex (blocks that have
582	// Index <= SplitIndex are in hot) and find the one maximizing the
583	// splitting score.
584	SplitScore BestScore;
585	for (size_t Index = `0`; Index <= HotColdIndex; Index++) {
586	const BinaryBasicBlock *LastHotBB = BlockOrder [Index];
587	assert(LastHotBB->getFragmentNum() != FragmentNum::cold());
588
589	// Do not break jump to the most likely successor.
590	if (Index + `1` < BlockOrder.size() &&
591	BlockOrder [Index + `1`] == getMostLikelySuccessor(BB: LastHotBB))
592	continue;
593
594	const SplitScore Score =
595	computeSplitScore(BF, BlockOrder, SplitIndex: Index, CoverCalls);
596	if (Score.sum() > BestScore.sum())
597	BestScore = Score;
598	}
599
600	// If we don't find a good splitting point, fallback to the original one.
601	if (BestScore.SplitIndex == size_t(-`1`))
602	return HotColdIndex;
603
604	return BestScore.SplitIndex;
605	}
606	};
607
608	struct SplitRandom2 final : public SplitStrategy {
609	std::minstd_rand0 Gen;
610
611	SplitRandom2() : Gen (opts::RandomSeed.getValue()) {}
612
613	bool canSplit(const BinaryFunction &BF) override { return true; }
614
615	bool compactFragments() override { return true; }
616
617	void fragment(const BlockIt Start, const BlockIt End) override {
618	using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
619	const DiffT NumBlocks = End - Start;
620	assert(NumBlocks > `0` && "Cannot fragment empty function");
621
622	// We want to split at least one block
623	const auto LastSplitPoint = std::max<DiffT>(a: NumBlocks - `1`, b: `1`);
624	std::uniform_int_distribution<DiffT> Dist(`1`, LastSplitPoint);
625	const DiffT SplitPoint = Dist (Gen);
626	for (BinaryBasicBlock *BB : llvm::make_range(x: Start + SplitPoint, y: End))
627	BB->setFragmentNum(FragmentNum::cold());
628
629	LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
630	"{1} possible) blocks to split\n",
631	NumBlocks - SplitPoint, End - Start));
632	}
633	};
634
635	struct SplitRandomN final : public SplitStrategy {
636	std::minstd_rand0 Gen;
637
638	SplitRandomN() : Gen (opts::RandomSeed.getValue()) {}
639
640	bool canSplit(const BinaryFunction &BF) override { return true; }
641
642	bool compactFragments() override { return true; }
643
644	void fragment(const BlockIt Start, const BlockIt End) override {
645	using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
646	const DiffT NumBlocks = End - Start;
647	assert(NumBlocks > `0` && "Cannot fragment empty function");
648
649	// With n blocks, there are n-1 places to split them.
650	const DiffT MaximumSplits = NumBlocks - `1`;
651	// We want to generate at least two fragment if possible, but if there is
652	// only one block, no splits are possible.
653	const auto MinimumSplits = std::min<DiffT>(a: MaximumSplits, b: `1`);
654	std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
655	// Choose how many splits to perform
656	const DiffT NumSplits = Dist (Gen);
657
658	// Draw split points from a lottery
659	SmallVector<unsigned, `0`> Lottery(MaximumSplits);
660	// Start lottery at 1, because there is no meaningful splitpoint before the
661	// first block.
662	std::iota(first: Lottery.begin(), last: Lottery.end(), value: `1u`);
663	std::shuffle(first: Lottery.begin(), last: Lottery.end(), g&: Gen);
664	Lottery.resize(N: NumSplits);
665	llvm::sort(C&: Lottery);
666
667	// Add one past the end entry to lottery
668	Lottery.push_back(Elt: NumBlocks);
669
670	unsigned LotteryIndex = `0`;
671	unsigned BBPos = `0`;
672	for (BinaryBasicBlock *const BB : make_range(x: Start, y: End)) {
673	// Check whether to start new fragment
674	if (BBPos >= Lottery [LotteryIndex])
675	++LotteryIndex;
676
677	// Because LotteryIndex is 0 based and cold fragments are 1 based, we can
678	// use the index to assign fragments.
679	BB->setFragmentNum(FragmentNum (LotteryIndex));
680
681	++BBPos;
682	}
683	}
684	};
685
686	struct SplitAll final : public SplitStrategy {
687	bool canSplit(const BinaryFunction &BF) override { return true; }
688
689	bool compactFragments() override {
690	// Keeping empty fragments allows us to test, that empty fragments do not
691	// generate symbols.
692	return false;
693	}
694
695	void fragment(const BlockIt Start, const BlockIt End) override {
696	unsigned Fragment = `0`;
697	for (BinaryBasicBlock *const BB : llvm::make_range(x: Start, y: End))
698	BB->setFragmentNum(FragmentNum (Fragment++));
699	}
700	};
701	} // namespace
702
703	namespace llvm {
704	namespace bolt {
705
706	bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
707	// Apply execution count threshold
708	if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
709	return false;
710
711	return BinaryFunctionPass::shouldOptimize(BF);
712	}
713
714	Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
715	if (!opts::SplitFunctions)
716	return Error::success();
717
718	if (BC.IsLinuxKernel && BC.BOLTReserved.empty()) {
719	BC.errs() << "BOLT-ERROR: split functions require reserved space in the "
720	"Linux kernel binary\n";
721	exit(status: `1`);
722	}
723
724	// If split strategy is not CDSplit, then a second run of the pass is not
725	// needed after function reordering.
726	if (BC.HasFinalizedFunctionOrder &&
727	opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
728	return Error::success();
729
730	std::unique_ptr<SplitStrategy> Strategy;
731	bool ForceSequential = false;
732
733	switch (opts::SplitStrategy) {
734	case SplitFunctionsStrategy::CDSplit:
735	// CDSplit runs two splitting passes: hot-cold splitting (SplitPrfoile2)
736	// before function reordering and hot-warm-cold splitting
737	// (SplitCacheDirected) after function reordering.
738	if (BC.HasFinalizedFunctionOrder)
739	Strategy = std::make_unique<SplitCacheDirected>(args&: BC);
740	else
741	Strategy = std::make_unique<SplitProfile2>();
742	opts::AggressiveSplitting = true;
743	BC.HasWarmSection = true;
744	break;
745	case SplitFunctionsStrategy::Profile2:
746	Strategy = std::make_unique<SplitProfile2>();
747	break;
748	case SplitFunctionsStrategy::Random2:
749	Strategy = std::make_unique<SplitRandom2>();
750	// If we split functions randomly, we need to ensure that across runs with
751	// the same input, we generate random numbers for each function in the same
752	// order.
753	ForceSequential = true;
754	break;
755	case SplitFunctionsStrategy::RandomN:
756	Strategy = std::make_unique<SplitRandomN>();
757	ForceSequential = true;
758	break;
759	case SplitFunctionsStrategy::All:
760	Strategy = std::make_unique<SplitAll>();
761	break;
762	}
763
764	ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
765	return !shouldOptimize(BF);
766	};
767
768	ParallelUtilities::runOnEachFunction(
769	BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
770	WorkFunction: [&](BinaryFunction &BF) { splitFunction(Function&: BF, Strategy&: *Strategy); }, SkipPredicate: SkipFunc,
771	LogName: "SplitFunctions", ForceSequential);
772
773	if (SplitBytesHot + SplitBytesCold > `0`)
774	BC.outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
775	<< " hot bytes from " << SplitBytesCold << " cold bytes "
776	<< format(Fmt: "(%.2lf%% of split functions is hot).\n",
777	Vals: `100.0` * SplitBytesHot /
778	(SplitBytesHot + SplitBytesCold));
779	return Error::success();
780	}
781
782	void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
783	if (BF.empty())
784	return;
785
786	if (!S.canSplit(BF))
787	return;
788
789	FunctionLayout &Layout = BF.getLayout();
790	BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
791	Layout.block_end());
792
793	BinaryContext &BC = BF.getBinaryContext();
794	size_t OriginalHotSize;
795	size_t HotSize;
796	size_t ColdSize;
797	if (BC.isX86()) {
798	std::tie(args&: OriginalHotSize, args&: ColdSize) = BC.calculateEmittedSize(BF);
799	LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
800	<< " pre-split is <0x"
801	<< Twine::utohexstr(OriginalHotSize) << ", 0x"
802	<< Twine::utohexstr(ColdSize) << ">\n");
803	}
804
805	BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
806	Layout.block_end());
807	// Never outline the first basic block.
808	NewLayout.front()->setCanOutline(false);
809	for (BinaryBasicBlock *const BB : NewLayout) {
810	if (!BB->canOutline())
811	continue;
812
813	// Do not split extra entry points in aarch64. They can be referred by
814	// using ADRs and when this happens, these blocks cannot be placed far
815	// away due to the limited range in ADR instruction.
816	if (BC.isAArch64() && BB->isEntryPoint()) {
817	BB->setCanOutline(false);
818	continue;
819	}
820
821	if (BF.hasEHRanges() && !opts::SplitEH) {
822	// We cannot move landing pads (or rather entry points for landing pads).
823	if (BB->isLandingPad()) {
824	BB->setCanOutline(false);
825	continue;
826	}
827	// We cannot move a block that can throw since exception-handling
828	// runtime cannot deal with split functions. However, if we can guarantee
829	// that the block never throws, it is safe to move the block to
830	// decrease the size of the function.
831	for (MCInst &Instr : *BB) {
832	if (BC.MIB ->isInvoke(Inst: Instr)) {
833	BB->setCanOutline(false);
834	break;
835	}
836	}
837	}
838
839	// Outlining blocks with dynamic branches is not supported yet.
840	if (BC.IsLinuxKernel) {
841	if (llvm::any_of(
842	Range&: BB, P: [&](MCInst &Inst) { return* BC.MIB ->isDynamicBranch(Inst); }))
843	BB->setCanOutline(false);
844	}
845	}
846
847	BF.getLayout().updateLayoutIndices();
848	S.fragment(Start: NewLayout.begin(), End: NewLayout.end());
849
850	// Make sure all non-outlineable blocks are in the main-fragment.
851	for (BinaryBasicBlock *const BB : NewLayout) {
852	if (!BB->canOutline())
853	BB->setFragmentNum(FragmentNum::main());
854	}
855
856	if (opts::AggressiveSplitting) {
857	// All blocks with 0 count that we can move go to the end of the function.
858	// Even if they were natural to cluster formation and were seen in-between
859	// hot basic blocks.
860	llvm::stable_sort(Range&: NewLayout, C: [&](const BinaryBasicBlock *const A,
861	const BinaryBasicBlock *const B) {
862	return A->getFragmentNum() < B->getFragmentNum();
863	});
864	} else if (BF.hasEHRanges() && !opts::SplitEH) {
865	// Typically functions with exception handling have landing pads at the end.
866	// We cannot move beginning of landing pads, but we can move 0-count blocks
867	// comprising landing pads to the end and thus facilitate splitting.
868	auto FirstLP = NewLayout.begin();
869	while ((*FirstLP)->isLandingPad())
870	++FirstLP;
871
872	std::stable_sort(first: FirstLP, last: NewLayout.end(),
873	comp: [&](BinaryBasicBlock A, BinaryBasicBlock B) {
874	return A->getFragmentNum() < B->getFragmentNum();
875	});
876	}
877
878	// Make sure that fragments are increasing.
879	FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
880	for (BinaryBasicBlock *const BB : reverse(C&: NewLayout)) {
881	if (BB->getFragmentNum() > CurrentFragment)
882	BB->setFragmentNum(CurrentFragment);
883	CurrentFragment = BB->getFragmentNum();
884	}
885
886	if (S.compactFragments()) {
887	FragmentNum CurrentFragment = FragmentNum::main();
888	FragmentNum NewFragment = FragmentNum::main();
889	for (BinaryBasicBlock *const BB : NewLayout) {
890	if (BB->getFragmentNum() > CurrentFragment) {
891	CurrentFragment = BB->getFragmentNum();
892	NewFragment = FragmentNum (NewFragment.get() + `1`);
893	}
894	BB->setFragmentNum(NewFragment);
895	}
896	}
897
898	const bool LayoutUpdated = BF.getLayout().update(NewLayout);
899
900	// For shared objects, invoke instructions and corresponding landing pads
901	// have to be placed in the same fragment. When we split them, create
902	// trampoline landing pads that will redirect the execution to real LPs.
903	TrampolineSetType Trampolines;
904	if (BF.hasEHRanges() && BF.isSplit()) {
905	// If all landing pads for this fragment are grouped in one (potentially
906	// different) fragment, we can set LPStart to the start of that fragment
907	// and avoid trampoline code.
908	bool NeedsTrampolines = false;
909	for (FunctionFragment &FF : BF.getLayout().fragments()) {
910	// Vector of fragments that contain landing pads for this fragment.
911	SmallVector<FragmentNum, `4`> LandingPadFragments;
912	for (const BinaryBasicBlock *BB : FF)
913	for (const BinaryBasicBlock *LPB : BB->landing_pads())
914	LandingPadFragments.push_back(Elt: LPB->getFragmentNum());
915
916	// Eliminate duplicate entries from the vector.
917	llvm::sort(C&: LandingPadFragments);
918	auto Last = llvm::unique(R&: LandingPadFragments);
919	LandingPadFragments.erase(CS: Last, CE: LandingPadFragments.end());
920
921	if (LandingPadFragments.size() == `0`) {
922	// If the fragment has no landing pads, we can safely set itself as its
923	// landing pad fragment.
924	BF.setLPFragment(F: FF.getFragmentNum(), LPF: FF.getFragmentNum());
925	} else if (LandingPadFragments.size() == `1`) {
926	BF.setLPFragment(F: FF.getFragmentNum(), LPF: LandingPadFragments.front());
927	} else {
928	if (!BC.HasFixedLoadAddress) {
929	NeedsTrampolines = true;
930	break;
931	} else {
932	BF.setLPFragment(F: FF.getFragmentNum(), LPF: std::nullopt);
933	}
934	}
935	}
936
937	// Trampolines guarantee that all landing pads for any given fragment will
938	// be contained in the same fragment.
939	if (NeedsTrampolines) {
940	for (FunctionFragment &FF : BF.getLayout().fragments())
941	BF.setLPFragment(F: FF.getFragmentNum(), LPF: FF.getFragmentNum());
942	Trampolines = createEHTrampolines(Function&: BF);
943	}
944	}
945
946	// Check the new size to see if it's worth splitting the function.
947	if (BC.isX86() && LayoutUpdated) {
948	std::tie(args&: HotSize, args&: ColdSize) = BC.calculateEmittedSize(BF);
949	LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
950	<< " post-split is <0x" << Twine::utohexstr(HotSize)
951	<< ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
952	if (alignTo(Value: OriginalHotSize, Align: opts::SplitAlignThreshold) <=
953	alignTo(Value: HotSize, Align: opts::SplitAlignThreshold) + opts::SplitThreshold) {
954	if (opts::Verbosity >= `2`) {
955	BC.outs() << "BOLT-INFO: Reversing splitting of function "
956	<< formatv(Fmt: "{0}:\n {1:x}, {2:x} -> {3:x}\n", Vals&: BF, Vals&: HotSize,
957	Vals&: ColdSize, Vals&: OriginalHotSize);
958	}
959
960	// Reverse the action of createEHTrampolines(). The trampolines will be
961	// placed immediately before the matching destination resulting in no
962	// extra code.
963	if (PreSplitLayout.size() != BF.size())
964	PreSplitLayout = mergeEHTrampolines(BF, Layout&: PreSplitLayout, Trampolines);
965
966	for (BinaryBasicBlock &BB : BF)
967	BB.setFragmentNum(FragmentNum::main());
968	BF.getLayout().update(NewLayout: PreSplitLayout);
969	} else {
970	SplitBytesHot += HotSize;
971	SplitBytesCold += ColdSize;
972	}
973	}
974
975	// Restore LP fragment for the main fragment if the splitting was undone.
976	if (BF.hasEHRanges() && !BF.isSplit())
977	BF.setLPFragment(F: FragmentNum::main(), LPF: FragmentNum::main());
978
979	// Fix branches if the splitting decision of the pass after function
980	// reordering is different from that of the pass before function reordering.
981	if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
982	BF.fixBranches();
983	}
984
985	SplitFunctions::TrampolineSetType
986	SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
987	const auto &MIB = BF.getBinaryContext().MIB;
988
989	// Map real landing pads to the corresponding trampolines.
990	TrampolineSetType LPTrampolines;
991
992	// Iterate over the copy of basic blocks since we are adding new blocks to the
993	// function which will invalidate its iterators.
994	std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
995	for (BinaryBasicBlock *BB : Blocks) {
996	for (MCInst &Instr : *BB) {
997	const std::optional<MCPlus::MCLandingPad> EHInfo = MIB ->getEHInfo(Inst: Instr);
998	if (!EHInfo \|\| !EHInfo ->first)
999	continue;
1000
1001	const MCSymbol *LPLabel = EHInfo ->first;
1002	BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Label: LPLabel);
1003	if (BB->getFragmentNum() == LPBlock->getFragmentNum())
1004	continue;
1005
1006	const MCSymbol TrampolineLabel = nullptr*;
1007	const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
1008	auto Iter = LPTrampolines.find(Val: Key);
1009	if (Iter != LPTrampolines.end()) {
1010	TrampolineLabel = Iter ->second;
1011	} else {
1012	// Create a trampoline basic block in the same fragment as the thrower.
1013	// Note: there's no need to insert the jump instruction, it will be
1014	// added by fixBranches().
1015	BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
1016	TrampolineBB->setFragmentNum(BB->getFragmentNum());
1017	TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
1018	TrampolineBB->addSuccessor(Succ: LPBlock, Count: TrampolineBB->getExecutionCount());
1019	TrampolineBB->setCFIState(LPBlock->getCFIState());
1020	TrampolineLabel = TrampolineBB->getLabel();
1021	LPTrampolines.insert(KV: std::make_pair(x: Key, y&: TrampolineLabel));
1022	}
1023
1024	// Substitute the landing pad with the trampoline.
1025	MIB ->updateEHInfo(Inst&: Instr,
1026	LP: MCPlus::MCLandingPad (TrampolineLabel, EHInfo ->second));
1027	}
1028	}
1029
1030	if (LPTrampolines.empty())
1031	return LPTrampolines;
1032
1033	// All trampoline blocks were added to the end of the function. Place them at
1034	// the end of corresponding fragments.
1035	BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
1036	BF.getLayout().block_end());
1037	stable_sort(Range&: NewLayout, C: [&](BinaryBasicBlock A, BinaryBasicBlock B) {
1038	return A->getFragmentNum() < B->getFragmentNum();
1039	});
1040	BF.getLayout().update(NewLayout);
1041
1042	// Conservatively introduce branch instructions.
1043	BF.fixBranches();
1044
1045	// Update exception-handling CFG for the function.
1046	BF.recomputeLandingPads();
1047
1048	return LPTrampolines;
1049	}
1050
1051	SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
1052	BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
1053	const SplitFunctions::TrampolineSetType &Trampolines) const {
1054	DenseMap<const MCSymbol , SmallVector<const* MCSymbol *, `0`>>
1055	IncomingTrampolines;
1056	for (const auto &Entry : Trampolines) {
1057	IncomingTrampolines [Entry.getFirst().Target].emplace_back(
1058	Args: Entry.getSecond());
1059	}
1060
1061	BasicBlockOrderType MergedLayout;
1062	for (BinaryBasicBlock *BB : Layout) {
1063	auto Iter = IncomingTrampolines.find(Val: BB->getLabel());
1064	if (Iter != IncomingTrampolines.end()) {
1065	for (const MCSymbol *const Trampoline : Iter ->getSecond()) {
1066	BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Label: Trampoline);
1067	assert(LPBlock && "Could not find matching landing pad block.");
1068	MergedLayout.push_back(Elt: LPBlock);
1069	}
1070	}
1071	MergedLayout.push_back(Elt: BB);
1072	}
1073
1074	return MergedLayout;
1075	}
1076
1077	} // namespace bolt
1078	} // namespace llvm
1079

Provided by KDAB

Learn to use CMake with our Intro Training

Find out more

Definitions

source code of bolt/lib/Passes/SplitFunctions.cpp