//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <memory>
#include <numeric>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2),

    cl::Hidden, cl::cat(BoltOptCategory));

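// Note: through the deprecated parser above, the legacy forms
// "-split-functions=2" and "-split-functions=3" are still accepted; they
// enable splitting and emit a deprecation warning.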
static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split function only if its main size is reduced by more than "
             "the given number of bytes. Default value: 0, i.e. split iff "
             "the size is reduced. Note that on some architectures the size "
             "can increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
                          "split each function into a hot, warm, and cold "
                          "fragment using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly chosen "
        "split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::RandomN, "randomN",
        "split each function into N fragments at randomly chosen split "
        "points (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that each "
        "fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));

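// The -call-scale, -call-power, and -jump-power knobs below feed the CDSplit
// scoring functions further down: a call edge contributes roughly
// CallScale * Count / (Length + 1)^CallPower, and a jump edge contributes
// Count / (Length + 1)^JumpPower to the split score.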
static cl::opt<double> CallScale(
    "call-scale",
    cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
    cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    CallPower("call-power",
              cl::desc("Call score power (when --split-strategy=cdsplit)"),
              cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    JumpPower("jump-power",
              cl::desc("Jump score power (when --split-strategy=cdsplit)"),
              cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
} // namespace opts

namespace {
bool hasFullProfile(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
  });
}

bool allBlocksCold(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  });
}

struct SplitProfile2 final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
      if (BB->getExecutionCount() == 0)
        BB->setFragmentNum(FragmentNum::cold());
    }
  }
};

struct SplitCacheDirected final : public SplitStrategy {
  BinaryContext &BC;
  using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
    initializeAuxiliaryVariables();
    buildCallGraph();
  }

  // When some functions are hot-warm split and others are hot-warm-cold
  // split, we do not want to change the fragment numbers of the blocks in the
  // hot-warm split functions.
  bool compactFragments() override { return false; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    BasicBlockOrder BlockOrder(Start, End);
    BinaryFunction &BF = *BlockOrder.front()->getFunction();
    // No need to re-split small functions.
    if (BlockOrder.size() <= 2)
      return;

    size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
    assert(BestSplitIndex < BlockOrder.size());

    // Assign fragments based on the computed best split index.
    // All basic blocks with index up to the best split index become hot.
    // All remaining blocks are warm or cold, depending on whether their
    // execution count is greater than zero.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (Index <= BestSplitIndex)
        BB->setFragmentNum(FragmentNum::main());
      else
        BB->setFragmentNum(BB->getKnownExecutionCount() > 0
                               ? FragmentNum::warm()
                               : FragmentNum::cold());
    }
  }

private:
  struct CallInfo {
    size_t Length;
    size_t Count;
  };

  struct SplitScore {
    size_t SplitIndex = size_t(-1);
    size_t HotSizeReduction = 0;
    double LocalScore = 0;
    double CoverCallScore = 0;

    double sum() const { return LocalScore + CoverCallScore; }
  };

  // Auxiliary variables used by the algorithm.
  size_t TotalNumBlocks{0};
  size_t OrigHotSectionSize{0};
  DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
  DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
  DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;

  // Call graph.
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;

  bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
    // Only a subset of the functions in the binary will be considered
    // for initializing auxiliary variables and building the call graph.
    return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
  }

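  // For every block considered for the call graph, record a unique global
  // index, its estimated post-split size, and its offset in the original hot
  // section. The split score below uses these to estimate call and jump
  // distances.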
  void initializeAuxiliaryVariables() {
    for (BinaryFunction *BF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*BF))
        continue;

      // Calculate the size of each BB after hot-cold splitting.
      // This populates BinaryBasicBlock::OutputAddressRange which
      // can be used to compute the size of each BB.
      BC.calculateEmittedSize(*BF, /*FixBranches=*/true);

      for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
        // Unique global index.
        GlobalIndices[BB] = TotalNumBlocks;
        TotalNumBlocks++;

        // Block size after hot-cold splitting.
        BBSizes[BB] = BB->getOutputSize();

        // Hot block offset after hot-cold splitting.
        BBOffsets[BB] = OrigHotSectionSize;
        if (!BB->isSplit())
          OrigHotSectionSize += BBSizes[BB];
      }
    }
  }

  void buildCallGraph() {
    Callers.resize(TotalNumBlocks);
    Callees.resize(TotalNumBlocks);
    for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*SrcFunction))
        continue;

      for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
        // Skip blocks that are not executed.
        if (SrcBB.getKnownExecutionCount() == 0)
          continue;

        // Find call instructions and extract target symbols from each one.
        for (const MCInst &Inst : SrcBB) {
          if (!BC.MIB->isCall(Inst))
            continue;

          // Call info.
          const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
          // Ignore calls w/o information.
          if (!DstSym)
            continue;

          const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
          // Ignore calls that do not have a valid target, but do not ignore
          // recursive calls, because the caller block could be moved to warm.
          if (!DstFunction || DstFunction->getLayout().block_empty())
            continue;

          const BinaryBasicBlock *DstBB = &(DstFunction->front());

          // Record the call only if DstBB is also in a function considered
          // for the call graph.
          if (GlobalIndices.contains(DstBB)) {
            Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
            Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
          }
        }
      }
    }
  }

  /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
  /// start and end addresses for hot and warm basic blocks, assuming hot-warm
  /// splitting happens at \p SplitIndex. Also return estimated end addresses
  /// of the hot fragment before and after splitting.
  /// The estimations take into account the potential addition of branch
  /// instructions due to split fall through branches as well as the need to
  /// use longer branch instructions for split (un)conditional branches.
  std::pair<size_t, size_t>
  estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex) {
    assert(SplitIndex < BlockOrder.size() && "Invalid split index");

    // Update function layout assuming hot-warm splitting at SplitIndex.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
                                             : FragmentNum::warm());
    }
    BinaryFunction *BF = BlockOrder[0]->getFunction();
    BF->getLayout().update(BlockOrder);
    // Populate BB.OutputAddressRange under the updated layout.
    BC.calculateEmittedSize(*BF);

    // Populate BB.OutputAddressRange with estimated new start and end
    // addresses and compute the old and new end addresses of the hot section.
    size_t OldHotEndAddr{0};
    size_t NewHotEndAddr{0};
    size_t CurrentAddr = BBOffsets[BlockOrder[0]];
    for (BinaryBasicBlock *BB : BlockOrder) {
      // We only care about new addresses of blocks in hot/warm.
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      const size_t NewSize = BB->getOutputSize();
      BB->setOutputStartAddress(CurrentAddr);
      CurrentAddr += NewSize;
      BB->setOutputEndAddress(CurrentAddr);
      if (BB->getLayoutIndex() == SplitIndex) {
        NewHotEndAddr = CurrentAddr;
        // Approximate the start address of the warm fragment of the current
        // function using the original hot section size.
        CurrentAddr = OrigHotSectionSize;
      }
      OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
    }
    return std::make_pair(OldHotEndAddr, NewHotEndAddr);
  }

  /// Get a collection of "shortenable" calls, that is, calls of type X->Y
  /// when the function order is [... X ... BF ... Y ...].
  /// If the hot fragment size of BF is reduced, then such calls are guaranteed
  /// to get shorter by the reduced hot fragment size.
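  /// For instance, with layout [X, BF, Y] and a call from X to Y, the call
  /// spans BF's hot fragment, so trimming N bytes from that fragment makes
  /// the call N bytes shorter.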
  std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
    // Record the length and the count of the calls that can be shortened.
    std::vector<CallInfo> CoverCalls;
    if (opts::CallScale == 0)
      return CoverCalls;

    const BinaryFunction *ThisBF = &BF;
    const BinaryBasicBlock *ThisBB = &(ThisBF->front());
    const size_t ThisGI = GlobalIndices[ThisBB];

    for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*DstBF))
        continue;

      const BinaryBasicBlock *DstBB = &(DstBF->front());
      if (DstBB->getKnownExecutionCount() == 0)
        continue;

      const size_t DstGI = GlobalIndices[DstBB];
      for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
        const BinaryFunction *SrcBF = SrcBB->getFunction();
        if (ThisBF == SrcBF)
          continue;

        const size_t CallCount = SrcBB->getKnownExecutionCount();

        const size_t SrcGI = GlobalIndices[SrcBB];

        const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
                                 (DstGI <= ThisGI && ThisGI < SrcGI);
        if (!IsCoverCall)
          continue;

        const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
        const size_t DstBBStartAddr = BBOffsets[DstBB];
        const size_t CallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        const CallInfo CI{CallLength, CallCount};
        CoverCalls.emplace_back(CI);
      }
    }
    return CoverCalls;
  }

  /// Compute the edge score of a call edge.
  double computeCallScore(uint64_t CallCount, size_t CallLength) {
    // Increase call lengths by 1 to avoid raising 0 to a negative power.
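    // For instance, with the default CallScale = 0.95 and CallPower = 0.05,
    // a call executed 100 times over a distance of 1023 bytes scores
    // 0.95 * 100 / 1024^0.05, roughly 67.2.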
    return opts::CallScale * static_cast<double>(CallCount) /
           std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
  }

  /// Compute the edge score of a jump (branch) edge.
  double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
    // Increase jump lengths by 1 to avoid raising 0 to a negative power.
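    // For instance, with the default JumpPower = 0.15, a branch taken 50
    // times over a distance of 255 bytes scores 50 / 256^0.15, roughly 21.8.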
    return static_cast<double>(JumpCount) /
           std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
  }

  /// Compute the sum of scores over jumps within \p BlockOrder given
  /// \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeJumpScore(const BasicBlockOrder &BlockOrder,
                        const size_t SplitIndex, SplitScore &Score) {

    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      if (SrcBB->getKnownExecutionCount() == 0)
        continue;

      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
        const BinaryBasicBlock *DstBB = std::get<0>(Pair);
        const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
        const size_t JumpCount = Branch.Count;

        if (JumpCount == 0)
          continue;

        const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
        const size_t NewJumpLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
      }
    }
  }

  /// Compute the sum of scores over calls originating in the current function
  /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex, SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    // Global index of the last block in the current function.
    // This is later used to determine whether a call originating in the
    // current function targets a function that comes after the current
    // function.
    const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];

    // The length of calls originating in the input function can increase /
    // decrease depending on the splitting decision.
    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      const size_t CallCount = SrcBB->getKnownExecutionCount();
      // If SrcBB does not call any functions, skip it.
      if (CallCount == 0)
        continue;

      // Obtain an estimate on the end address of the src basic block
      // after splitting at SplitIndex.
      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
        // Obtain an estimate on the start address of the dst basic block
        // after splitting at SplitIndex. If DstBB is in a function before
        // the current function, then its start address remains unchanged.
        size_t DstBBStartAddr = BBOffsets[DstBB];
        // If DstBB is in a function after the current function, then its
        // start address should be adjusted based on the reduction in hot size.
        if (GlobalIndices[DstBB] > LastGlobalIndex) {
          assert(DstBBStartAddr >= Score.HotSizeReduction);
          DstBBStartAddr -= Score.HotSizeReduction;
        }
        const size_t NewCallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeCallScore(CallCount, NewCallLength);
      }
    }
  }

  /// Compute the sum of splitting scores for cover calls of the input
  /// function. Increment Score.CoverCallScore in place by the sum.
  void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex,
                             const std::vector<CallInfo> &CoverCalls,
                             SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    for (const CallInfo CI : CoverCalls) {
      assert(CI.Length >= Score.HotSizeReduction &&
             "Length of cover calls must exceed reduced size of hot fragment.");
      // Compute the new length of the call, which is shorter than the
      // original one by the size of the split fragment minus the total size
      // increase.
      const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
      Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
    }
  }

  /// Compute the split score of splitting a function at a given index.
  /// The split score consists of local score and cover score. This function
  /// returns \p Score of SplitScore type. It contains the local score and
  /// cover score of the current splitting index. For easier bookkeeping and
  /// comparison, it also stores the split index and the resulting reduction
  /// in hot fragment size.
  SplitScore computeSplitScore(const BinaryFunction &BF,
                               const BasicBlockOrder &BlockOrder,
                               const size_t SplitIndex,
                               const std::vector<CallInfo> &CoverCalls) {
    // Populate BinaryBasicBlock::OutputAddressRange with estimated
    // new start and end addresses after hot-warm splitting at SplitIndex.
    size_t OldHotEnd;
    size_t NewHotEnd;
    std::tie(OldHotEnd, NewHotEnd) =
        estimatePostSplitBBAddress(BlockOrder, SplitIndex);

    SplitScore Score;
    Score.SplitIndex = SplitIndex;

    // It's not worth splitting if OldHotEnd < NewHotEnd.
    if (OldHotEnd < NewHotEnd)
      return Score;

    // Hot fragment size reduction due to splitting.
    Score.HotSizeReduction = OldHotEnd - NewHotEnd;

    // The first part of LocalScore is the sum over call edges originating in
    // the input function. These edges can get shorter or longer depending on
    // SplitIndex. Score.LocalScore is incremented in place.
    computeLocalCallScore(BlockOrder, SplitIndex, Score);

    // The second part of LocalScore is the sum over jump edges with both src
    // and dst basic blocks in the current function. Score.LocalScore is
    // incremented in place.
    computeJumpScore(BlockOrder, SplitIndex, Score);

    // Compute CoverCallScore and store it in Score in place.
    computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
    return Score;
  }

  /// Find the most likely successor of a basic block when it has one or two
  /// successors. Return nullptr otherwise.
  const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
    if (BB->succ_size() == 1)
      return BB->getSuccessor();
    if (BB->succ_size() == 2) {
      uint64_t TakenCount = BB->getTakenBranchInfo().Count;
      assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
      assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      if (TakenCount > NonTakenCount)
        return BB->getConditionalSuccessor(true);
      else if (TakenCount < NonTakenCount)
        return BB->getConditionalSuccessor(false);
    }
    return nullptr;
  }

  /// Find the best index for splitting. The returned value is the index of
  /// the last hot basic block. Hence, "no splitting" is equivalent to
  /// returning the value which is one less than the size of the function.
  size_t findSplitIndex(const BinaryFunction &BF,
                        const BasicBlockOrder &BlockOrder) {
    assert(BlockOrder.size() > 2);
    // Find all function calls that can be shortened if we move blocks of the
    // current function to warm/cold.
    const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);

    // Find the existing hot-cold splitting index.
    size_t HotColdIndex = 0;
    while (HotColdIndex + 1 < BlockOrder.size()) {
      if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
        break;
      HotColdIndex++;
    }
    assert(HotColdIndex + 1 == BlockOrder.size() ||
           (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
            BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
                FragmentNum::cold()));

    // Try all possible split indices up to HotColdIndex (blocks that have
    // Index <= SplitIndex are in hot) and find the one maximizing the
    // splitting score.
    SplitScore BestScore;
    for (size_t Index = 0; Index <= HotColdIndex; Index++) {
      const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
      assert(LastHotBB->getFragmentNum() != FragmentNum::cold());

      // Do not break the jump to the most likely successor.
      if (Index + 1 < BlockOrder.size() &&
          BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
        continue;

      const SplitScore Score =
          computeSplitScore(BF, BlockOrder, Index, CoverCalls);
      if (Score.sum() > BestScore.sum())
        BestScore = Score;
    }

    // If we don't find a good splitting point, fall back to the original one.
    if (BestScore.SplitIndex == size_t(-1))
      return HotColdIndex;

    return BestScore.SplitIndex;
  }
};

struct SplitRandom2 final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // We want to split at least one block.
    const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
    std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
    const DiffT SplitPoint = Dist(Gen);
    for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
      BB->setFragmentNum(FragmentNum::cold());

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumBlocks - SplitPoint, End - Start));
  }
};

struct SplitRandomN final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // With n blocks, there are n-1 places to split them.
    const DiffT MaximumSplits = NumBlocks - 1;
    // We want to generate at least two fragments if possible, but if there is
    // only one block, no splits are possible.
    const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
    // Choose how many splits to perform.
    const DiffT NumSplits = Dist(Gen);

    // Draw split points from a lottery.
    SmallVector<unsigned, 0> Lottery(MaximumSplits);
    // Start the lottery at 1, because there is no meaningful split point
    // before the first block.
    std::iota(Lottery.begin(), Lottery.end(), 1u);
    std::shuffle(Lottery.begin(), Lottery.end(), Gen);
    Lottery.resize(NumSplits);
    llvm::sort(Lottery);

    // Add a one-past-the-end entry to the lottery.
    Lottery.push_back(NumBlocks);
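    // For example, with NumBlocks == 6 and NumSplits == 2, the surviving
    // sorted lottery entries might be {2, 4, 6}: blocks 0-1 then stay in
    // fragment 0, blocks 2-3 go to fragment 1, and blocks 4-5 to fragment 2.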

    unsigned LotteryIndex = 0;
    unsigned BBPos = 0;
    for (BinaryBasicBlock *const BB : make_range(Start, End)) {
      // Check whether to start a new fragment.
      if (BBPos >= Lottery[LotteryIndex])
        ++LotteryIndex;

      // Because LotteryIndex is 0-based and cold fragments are 1-based, we
      // can use the index to assign fragments.
      BB->setFragmentNum(FragmentNum(LotteryIndex));

      ++BBPos;
    }
  }
};

struct SplitAll final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override {
    // Keeping empty fragments allows us to test that empty fragments do not
    // generate symbols.
    return false;
  }

  void fragment(const BlockIt Start, const BlockIt End) override {
    unsigned Fragment = 0;
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
      BB->setFragmentNum(FragmentNum(Fragment++));
  }
};
} // namespace

namespace llvm {
namespace bolt {

bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply the execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return Error::success();

  if (BC.IsLinuxKernel && BC.BOLTReserved.empty()) {
    BC.errs() << "BOLT-ERROR: split functions require reserved space in the "
                 "Linux kernel binary\n";
    exit(1);
  }

  // If the split strategy is not CDSplit, then a second run of the pass is
  // not needed after function reordering.
  if (BC.HasFinalizedFunctionOrder &&
      opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
    return Error::success();

  std::unique_ptr<SplitStrategy> Strategy;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::CDSplit:
    // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
    // before function reordering and hot-warm-cold splitting
    // (SplitCacheDirected) after function reordering.
    if (BC.HasFinalizedFunctionOrder)
      Strategy = std::make_unique<SplitCacheDirected>(BC);
    else
      Strategy = std::make_unique<SplitProfile2>();
    opts::AggressiveSplitting = true;
    BC.HasWarmSection = true;
    break;
  case SplitFunctionsStrategy::Profile2:
    Strategy = std::make_unique<SplitProfile2>();
    break;
  case SplitFunctionsStrategy::Random2:
    Strategy = std::make_unique<SplitRandom2>();
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the
    // same order.
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::RandomN:
    Strategy = std::make_unique<SplitRandomN>();
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    Strategy = std::make_unique<SplitAll>();
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
      "SplitFunctions", ForceSequential);

  if (SplitBytesHot + SplitBytesCold > 0)
    BC.outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
              << " hot bytes from " << SplitBytesCold << " cold bytes "
              << format("(%.2lf%% of split functions is hot).\n",
                        100.0 * SplitBytesHot /
                            (SplitBytesHot + SplitBytesCold));
  return Error::success();
}

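// Split a single function: pin blocks that must stay in the main fragment,
// let the strategy assign fragment numbers, normalize the assignment, and,
// on x86, undo the split if it does not actually shrink the hot fragment.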
void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;

    // Do not split extra entry points on AArch64. They can be referenced via
    // ADR instructions, and when this happens, these blocks cannot be placed
    // far away due to the limited range of ADR.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing
      // pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw since the exception-handling
      // runtime cannot deal with split functions. However, if we can
      // guarantee that the block never throws, it is safe to move the block
      // to decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }

    // Outlining blocks with dynamic branches is not supported yet.
    if (BC.IsLinuxKernel) {
      if (llvm::any_of(
              *BB, [&](MCInst &Inst) { return BC.MIB->isDynamicBranch(Inst); }))
        BB->setCanOutline(false);
    }
  }

  BF.getLayout().updateLayoutIndices();
  S.fragment(NewLayout.begin(), NewLayout.end());

  // Make sure all non-outlineable blocks are in the main fragment.
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      BB->setFragmentNum(FragmentNum::main());
  }

  if (opts::AggressiveSplitting) {
    // All blocks with a 0 count that we can move go to the end of the
    // function, even if they were natural to cluster formation and were seen
    // in between hot basic blocks.
    llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
                                     const BinaryBasicBlock *const B) {
      return A->getFragmentNum() < B->getFragmentNum();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically, functions with exception handling have landing pads at the
    // end. We cannot move the beginning of a landing pad, but we can move
    // 0-count blocks comprising landing pads to the end and thus facilitate
    // splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->getFragmentNum() < B->getFragmentNum();
                     });
  }

  // Make sure that fragments are increasing.
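  // For example, a layout tagged [main, cold, warm, cold] becomes
  // [main, warm, warm, cold] after this backward pass.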
  FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
  for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
    if (BB->getFragmentNum() > CurrentFragment)
      BB->setFragmentNum(CurrentFragment);
    CurrentFragment = BB->getFragmentNum();
  }

  if (S.compactFragments()) {
    FragmentNum CurrentFragment = FragmentNum::main();
    FragmentNum NewFragment = FragmentNum::main();
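    // Renumber fragments so they are consecutive, e.g. fragment numbers
    // {0, 2, 2, 5} become {0, 1, 1, 2}.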
    for (BinaryBasicBlock *const BB : NewLayout) {
      if (BB->getFragmentNum() > CurrentFragment) {
        CurrentFragment = BB->getFragmentNum();
        NewFragment = FragmentNum(NewFragment.get() + 1);
      }
      BB->setFragmentNum(NewFragment);
    }
  }

  const bool LayoutUpdated = BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  TrampolineSetType Trampolines;
  if (BF.hasEHRanges() && BF.isSplit()) {
    // If all landing pads for this fragment are grouped in one (potentially
    // different) fragment, we can set LPStart to the start of that fragment
    // and avoid trampoline code.
    bool NeedsTrampolines = false;
    for (FunctionFragment &FF : BF.getLayout().fragments()) {
      // Vector of fragments that contain landing pads for this fragment.
      SmallVector<FragmentNum, 4> LandingPadFragments;
      for (const BinaryBasicBlock *BB : FF)
        for (const BinaryBasicBlock *LPB : BB->landing_pads())
          LandingPadFragments.push_back(LPB->getFragmentNum());

      // Eliminate duplicate entries from the vector.
      llvm::sort(LandingPadFragments);
      auto Last = llvm::unique(LandingPadFragments);
      LandingPadFragments.erase(Last, LandingPadFragments.end());

      if (LandingPadFragments.size() == 0) {
        // If the fragment has no landing pads, we can safely use the fragment
        // itself as its landing-pad fragment.
        BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum());
      } else if (LandingPadFragments.size() == 1) {
        BF.setLPFragment(FF.getFragmentNum(), LandingPadFragments.front());
      } else {
        if (!BC.HasFixedLoadAddress) {
          NeedsTrampolines = true;
          break;
        } else {
          BF.setLPFragment(FF.getFragmentNum(), std::nullopt);
        }
      }
    }

    // Trampolines guarantee that all landing pads for any given fragment will
    // be contained in the same fragment.
    if (NeedsTrampolines) {
      for (FunctionFragment &FF : BF.getLayout().fragments())
        BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum());
      Trampolines = createEHTrampolines(BF);
    }
  }

  // Check the new size to see if it's worth splitting the function.
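  // For instance, with the default -split-align-threshold=2 and
  // -split-threshold=0, a hot size going from 0x62 to 0x61 bytes rounds to
  // the same aligned size, so the split is reversed.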
  if (BC.isX86() && LayoutUpdated) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      if (opts::Verbosity >= 2) {
        BC.outs() << "BOLT-INFO: Reversing splitting of function "
                  << formatv("{0}:\n {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
                             ColdSize, OriginalHotSize);
      }

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination resulting in no
      // extra code.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }

  // Restore the LP fragment for the main fragment if the splitting was
  // undone.
  if (BF.hasEHRanges() && !BF.isSplit())
    BF.setLPFragment(FragmentNum::main(), FragmentNum::main());

  // Fix branches if the splitting decision of the pass after function
  // reordering is different from that of the pass before function reordering.
  if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
    BF.fixBranches();
}

SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  TrampolineSetType LPTrampolines;

  // Iterate over a copy of the basic blocks, since we are adding new blocks
  // to the function, which would invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->getFragmentNum() == LPBlock->getFragmentNum())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
      auto Iter = LPTrampolines.find(Key);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the
        // thrower. Note: there's no need to insert the jump instruction, it
        // will be added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setFragmentNum(BB->getFragmentNum());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them
  // at the end of the corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->getFragmentNum() < B->getFragmentNum();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update the exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

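// Re-insert each EH trampoline block immediately before its real landing pad
// in the pre-split layout, so that once the split is reversed the trampoline
// falls through to its destination and adds no extra code.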
SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
      IncomingTrampolines;
  for (const auto &Entry : Trampolines) {
    IncomingTrampolines[Entry.getFirst().Target].emplace_back(
        Entry.getSecond());
  }

  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = IncomingTrampolines.find(BB->getLabel());
    if (Iter != IncomingTrampolines.end()) {
      for (const MCSymbol *const Trampoline : Iter->getSecond()) {
        BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
        assert(LPBlock && "Could not find matching landing pad block.");
        MergedLayout.push_back(LPBlock);
      }
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm
