//===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements multiple passes for binary optimization and analysis.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/Support/CommandLine.h"
#include <atomic>
#include <mutex>
#include <numeric>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
  assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
         C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");

  static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];

  OptNames[C] = bolt::DynoStats::Description(C);

  llvm::replace(OptNames[C], ' ', '-');

  return OptNames[C].c_str();
}

namespace opts {

extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;

extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> EnableBAT;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<bool> UpdateDebugSections;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;

enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));

cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));

bool isHotTextMover(const BinaryFunction &Function) {
  for (std::string &SectionName : opts::HotTextMoveSections) {
    if (Function.getOriginSectionName() &&
        *Function.getOriginSectionName() == SectionName)
      return true;
  }

  return false;
}

static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<unsigned>
    PrintFuncStat("print-function-statistics",
                  cl::desc("print statistics about basic block ordering"),
                  cl::init(0), cl::cat(BoltOptCategory));

static cl::opt<bool> PrintLargeFunctions(
    "print-large-functions",
    cl::desc("print functions that could not be overwritten due to excessive "
             "size"),
    cl::init(false), cl::cat(BoltOptCategory));

static cl::list<bolt::DynoStats::Category>
    PrintSortedBy("print-sorted-by", cl::CommaSeparated,
                  cl::desc("print functions sorted by order of dyno stats"),
                  cl::value_desc("key1,key2,key3,..."),
                  cl::values(
#define D(name, description, ...)                                             \
  clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),  \
             description),
                      REAL_DYNO_STATS
#undef D
                          clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
                                     "sorted by all names")),
                  cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
    PrintUnknown("print-unknown",
                 cl::desc("print names of functions with unknown control flow"),
                 cl::cat(BoltCategory), cl::Hidden);

static cl::opt<bool>
    PrintUnknownCFG("print-unknown-cfg",
                    cl::desc("dump CFG of functions with unknown control flow"),
                    cl::cat(BoltCategory), cl::ReallyHidden);
// Appease MSVC19 with a forward declaration: otherwise it reports an error
// about an undeclared variable inside the callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
               << " use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
    ReportStaleFuncs("report-stale",
                     cl::desc("print the list of functions with stale profile"),
                     cl::Hidden, cl::cat(BoltOptCategory));

enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
             clEnumValN(SctcPreserveDirection,
                        "preserve",
                        "only perform sctc when branch direction is "
                        "preserved"),
             clEnumValN(SctcHeuristic,
                        "heuristic",
                        "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
StaleThreshold("stale-threshold",
  cl::desc(
    "maximum percentage of stale functions to tolerate (default: 100)"),
  cl::init(100),
  cl::Hidden,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));

// Profile density options, synced with llvm-profgen/ProfileGenerator.cpp
static cl::opt<int> ProfileDensityCutOffHot(
    "profile-density-cutoff-hot", cl::init(990000),
    cl::desc("Total samples cutoff for functions used to calculate "
             "profile density."));

static cl::opt<double> ProfileDensityThreshold(
    "profile-density-threshold", cl::init(60),
    cl::desc("If the profile density is below the given threshold, it "
             "will be suggested to increase the sampling rate."),
    cl::Optional);

} // namespace opts

namespace llvm {
namespace bolt {

bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
  return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
         !BF.isIgnored();
}

bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
  return BF.isSimple() && !BF.isIgnored();
}

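// Normalize the CFG of a single function: drop empty basic blocks that are
// neither entry points nor landing pads, redirecting their predecessors to
// the single successor, and count duplicate successor edges so that
// fixBranches() can merge them afterwards.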
void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  uint64_t NeedsFixBranches = 0;
  for (BinaryBasicBlock &BB : BF) {
    if (!BB.empty())
      continue;

    if (BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      if (Predecessor->hasJumpTable())
        break;

      if (Predecessor == Successor)
        break;

      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
      Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
                                    BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}

Error NormalizeCFG::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { runOnFunction(BF); },
      [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
      "NormalizeCFG");
  if (NumBlocksRemoved)
    BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
              << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
  if (NumDuplicateEdgesMerged)
    BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
              << " duplicate CFG edge"
              << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n';
  return Error::success();
}

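// Unreachable code elimination (UCE) for a single function: mark basic blocks
// that cannot be reached from any entry point or landing pad and erase them,
// recording how many blocks and bytes of code were deleted.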
void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
  BinaryContext &BC = Function.getBinaryContext();
  unsigned Count;
  uint64_t Bytes;
  Function.markUnreachableBlocks();
  LLVM_DEBUG({
    for (BinaryBasicBlock &BB : Function) {
      if (!BB.isValid()) {
        dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
               << " in function " << Function << "\n";
        Function.dump();
      }
    }
  });
  BinaryContext::IndependentCodeEmitter Emitter =
      BC.createIndependentMCCodeEmitter();
  std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
  DeletedBlocks += Count;
  DeletedBytes += Bytes;
  if (Count) {
    auto L = BC.scopeLock();
    Modified.insert(&Function);
    if (opts::Verbosity > 0)
      BC.outs() << "BOLT-INFO: removed " << Count
                << " dead basic block(s) accounting for " << Bytes
                << " bytes in function " << Function << '\n';
  }
}

Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    runOnFunction(BF);
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF) || BF.getLayout().block_empty();
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "eliminate-unreachable");

  if (DeletedBlocks)
    BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
              << DeletedBytes << " bytes of code\n";
  return Error::success();
}

bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
  return (BinaryFunctionPass::shouldPrint(BF) &&
          opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
}

bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
  if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
    return Error::success();

  std::atomic_uint64_t ModifiedFuncCount(0);
  std::mutex FunctionEditDistanceMutex;
  DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;

  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
    if (opts::PrintFuncStat > 0)
      llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));

    const bool LayoutChanged =
        modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
    if (LayoutChanged) {
      ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
      if (opts::PrintFuncStat > 0) {
        const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
        std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
        FunctionEditDistance[&BF] = Distance;
      }
    }
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "ReorderBasicBlocks");
  const size_t NumAllProfiledFunctions =
      BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;

  BC.outs() << "BOLT-INFO: basic block reordering modified layout of "
            << format(
                   "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
                   ModifiedFuncCount.load(std::memory_order_relaxed),
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       NumAllProfiledFunctions,
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       BC.getBinaryFunctions().size());

  if (opts::PrintFuncStat > 0) {
    raw_ostream &OS = BC.outs();
    // Copy all the values into a map in order to sort them by function score.
    std::map<uint64_t, BinaryFunction &> ScoreMap;
    auto &BFs = BC.getBinaryFunctions();
    for (auto It = BFs.begin(); It != BFs.end(); ++It)
      ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
          It->second.getFunctionScore(), It->second));

    OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
    OS << "           There are " << BFs.size() << " functions in total.\n";
    OS << "           Number of functions being modified: "
       << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
    OS << "           Detailed information requested for the top "
       << opts::PrintFuncStat << " functions (ranked by function score)."
       << "\n\n";
    uint64_t I = 0;
    for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
             ScoreMap.rbegin();
         Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
      BinaryFunction &Function = Rit->second;

      OS << "     Information for function " << (I + 1) << ":\n";
      OS << "            Function score: " << Function.getFunctionScore()
         << "\n";
      OS << "            There are " << Function.size()
         << " basic blocks in this function.\n";
      OS << "            There are " << Function.getInstructionCount()
         << " instructions in this function.\n";
      OS << "            The edit distance for this function is: "
         << FunctionEditDistance.lookup(&Function) << "\n\n";
    }
  }
  return Error::success();
}

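// Pick a reordering algorithm for the function and apply it:
//  - LT_REVERSE lays the blocks out in reverse order;
//  - small functions (at most -tsp-threshold blocks) get a precise TSP
//    solution;
//  - everything else goes through a cluster-based heuristic selected by the
//    layout type, optionally using the branch-minimizing clustering.
// Returns true if the resulting layout differs from the current one.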
bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
                                              LayoutType Type,
                                              bool MinBranchClusters) const {
  if (BF.size() == 0 || Type == LT_NONE)
    return false;

  BinaryFunction::BasicBlockOrderType NewLayout;
  std::unique_ptr<ReorderAlgorithm> Algo;

  // Cannot do optimal layout without profile.
  if (Type != LT_REVERSE && !BF.hasValidProfile())
    return false;

  if (Type == LT_REVERSE) {
    Algo.reset(new ReverseReorderAlgorithm());
  } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
    // Work on optimal solution if problem is small enough.
    LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
    Algo.reset(new TSPReorderAlgorithm());
  } else {
    LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");

    std::unique_ptr<ClusterAlgorithm> CAlgo;
    if (MinBranchClusters)
      CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
    else
      CAlgo.reset(new PHGreedyClusterAlgorithm());

    switch (Type) {
    case LT_OPTIMIZE:
      Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_BRANCH:
      Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_CACHE:
      Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_EXT_TSP:
      Algo.reset(new ExtTSPReorderAlgorithm());
      break;

    case LT_OPTIMIZE_SHUFFLE:
      Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
      break;

    default:
      llvm_unreachable("unexpected layout type");
    }
  }

  Algo->reorderBasicBlocks(BF, NewLayout);

  return BF.getLayout().update(NewLayout);
}

Error FixupBranches::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (!BC.shouldEmit(Function) || !Function.isSimple())
      continue;

    Function.fixBranches();
  }
  return Error::success();
}

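// Finalize functions before emission: make sure the CFI state is consistent
// and update exception handling ranges. In relocation mode a CFI failure is
// fatal; otherwise the function is marked non-simple and left in its original
// form.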
Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
  std::atomic<bool> HasFatal{false};
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    if (!BF.finalizeCFIState()) {
      if (BC.HasRelocations) {
        BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
                  << ". Exiting.\n";
        HasFatal = true;
        return;
      }
      BF.setSimple(false);
      return;
    }

    BF.setFinalized();

    // Update exception handling information.
    BF.updateEHRanges();
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !BC.shouldEmit(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "FinalizeFunctions");
  if (HasFatal)
    return createFatalBOLTError("finalize CFI state failure");
  return Error::success();
}

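// In non-relocation mode the rewritten function must fit into the space of
// the original one. Estimate the emitted size of each function (including
// constant islands) and mark functions that would overflow as non-simple so
// that they are not overwritten.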
Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
  if (BC.HasRelocations)
    return Error::success();

  // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
  // incorrect metadata.
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    uint64_t HotSize, ColdSize;
    std::tie(HotSize, ColdSize) =
        BC.calculateEmittedSize(BF, /*FixBranches=*/false);
    uint64_t MainFragmentSize = HotSize;
    if (BF.hasIslandsInfo()) {
      MainFragmentSize +=
          offsetToAlignment(BF.getAddress() + MainFragmentSize,
                            Align(BF.getConstantIslandAlignment()));
      MainFragmentSize += BF.estimateConstantIslandSize();
    }
    if (MainFragmentSize > BF.getMaxSize()) {
      if (opts::PrintLargeFunctions)
        BC.outs() << "BOLT-INFO: " << BF << " size of " << MainFragmentSize
                  << " bytes exceeds allocated space by "
                  << (MainFragmentSize - BF.getMaxSize()) << " bytes\n";
      BF.setSimple(false);
    }
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "CheckLargeFunctions");

  return Error::success();
}

bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Unlike other passes, allow functions in non-CFG state.
  return BF.isSimple() && !BF.isIgnored();
}

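// Lower GnuArgsSize annotations attached to invoke instructions into
// DW_CFA_GNU_args_size CFI instructions, emitting a new CFI only when the
// value changes within a fragment.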
Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  // Convert GnuArgsSize annotations into CFIs.
  for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
    for (FunctionFragment &FF : BF->getLayout().fragments()) {
      // Reset at the start of the new fragment.
      int64_t CurrentGnuArgsSize = 0;

      for (BinaryBasicBlock *const BB : FF) {
        for (auto II = BB->begin(); II != BB->end(); ++II) {
          if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II))
            continue;

          const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
          assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size.");
          if (NewGnuArgsSize == CurrentGnuArgsSize)
            continue;

          auto InsertII = BF->addCFIInstruction(
              BB, II,
              MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
          CurrentGnuArgsSize = NewGnuArgsSize;
          II = std::next(InsertII);
        }
      }
    }
  }
  return Error::success();
}

// Check for dirty state in MCSymbol objects that might be a consequence
// of running calculateEmittedSize() in parallel, during split functions
// pass. If an inconsistent state is found (symbol already registered or
// already defined), clean it.
Error CleanMCState::runOnFunctions(BinaryContext &BC) {
  MCContext &Ctx = *BC.Ctx;
  for (const auto &SymMapEntry : Ctx.getSymbols()) {
    const MCSymbol *S = SymMapEntry.getValue().Symbol;
    if (!S)
      continue;
    if (S->isDefined()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already defined\n");
      const_cast<MCSymbol *>(S)->setUndefined();
    }
    if (S->isRegistered()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already registered\n");
      const_cast<MCSymbol *>(S)->setIsRegistered(false);
    }
    LLVM_DEBUG(if (S->isVariable()) {
      dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
    });
  }
  return Error::success();
}

// This peephole fixes jump instructions that jump to another basic
// block with a single jump instruction, e.g.
//
// B0: ...
//     jmp  B1   (or jcc B1)
//
// B1: jmp  B2
//
// ->
//
// B0: ...
//     jmp  B2   (or jcc B2)
//
static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym,
                             std::optional<uint32_t> Offset) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case. In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
          if (Offset) {
            MCInst *TailCall = Pred->getLastNonPseudoInstr();
            assert(TailCall);
            MIB->setOffset(*TailCall, *Offset);
          }
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) &&
            MarkInvalid)
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}

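// SimplifyConditionalTailCalls (SCTC) folds a conditional branch to a block
// that contains nothing but a direct tail call into a conditional tail call.
// An illustrative shape of the transformation (x86 mnemonics used purely as
// an example):
//
// B0: ...
//     jcc  B2
//
// B2: jmp  foo   (tail call)
//
// ->
//
// B0: ...
//     jcc  foo   (conditional tail call)
//
// When the conditional successor is not the tail-call block, the condition is
// reversed instead and an unconditional branch to the old target is added if
// it is not a fallthrough. Blocks left without predecessors are removed.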
bool SimplifyConditionalTailCalls::shouldRewriteBranch(
    const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
    const BinaryBasicBlock *BB, const bool DirectionFlag) {
  if (BeenOptimized.count(PredBB))
    return false;

  const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);

  if (IsForward)
    ++NumOrigForwardBranches;
  else
    ++NumOrigBackwardBranches;

  if (opts::SctcMode == opts::SctcAlways)
    return true;

  if (opts::SctcMode == opts::SctcPreserveDirection)
    return IsForward == DirectionFlag;

  const ErrorOr<std::pair<double, double>> Frequency =
      PredBB->getBranchStats(BB);

  // It's ok to rewrite the conditional branch if the new target will be
  // a backward branch.

  // If no data available for these branches, then it should be ok to
  // do the optimization since it will reduce code size.
  if (Frequency.getError())
    return true;

  // TODO: should this use misprediction frequency instead?
  const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
                      (!IsForward && Frequency.get().first <= 0.5);

  return Result == DirectionFlag;
}

uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
  // Need updated indices to correctly detect branch direction.
  BF.getLayout().updateLayoutIndices();
  BF.markUnreachableBlocks();

  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  MCContext *Ctx = BF.getBinaryContext().Ctx.get();
  uint64_t NumLocalCTCCandidates = 0;
  uint64_t NumLocalCTCs = 0;
  uint64_t LocalCTCTakenCount = 0;
  uint64_t LocalCTCExecCount = 0;
  std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
      NeedsUncondBranch;

  // Will the block be deleted by UCE?
  auto isValid = [](const BinaryBasicBlock *BB) {
    return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
  };

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Locate BB with a single direct tail-call instruction.
    if (BB->getNumNonPseudos() != 1)
      continue;

    MCInst *Instr = BB->getFirstNonPseudoInstr();
    if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
      continue;

    const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
    if (!CalleeSymbol)
      continue;

    // Detect direction of the possible conditional tail call.
    const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);

    // Iterate through all predecessors.
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
      if (!CondSucc)
        continue;

      ++NumLocalCTCCandidates;

      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

      // analyzeBranch() can fail due to unusual branch instructions,
      // e.g. jrcxz.
      if (!Result) {
        LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
                   PredBB->dump());
        continue;
      }

      assert(Result && "internal error analyzing conditional branch");
      assert(CondBranch && "conditional branch expected");

      // Skip dynamic branches for now.
      if (BF.getBinaryContext().MIB->isDynamicBranch(*CondBranch))
        continue;

      // It's possible that PredBB is also a successor to BB that may have
      // been processed by a previous iteration of the SCTC loop, in which
      // case it may have been marked invalid. We should skip rewriting in
      // this case.
      if (!PredBB->isValid()) {
        assert(PredBB->isSuccessor(BB) &&
               "PredBB should be valid if it is not a successor to BB");
        continue;
      }

      // We don't want to reverse direction of the branch in new order
      // without further profile analysis.
      const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
      if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
        continue;

      // Record this block so that we don't try to optimize it twice.
      BeenOptimized.insert(PredBB);

      uint64_t Count = 0;
      if (CondSucc != BB) {
        // Patch the new target address into the conditional branch.
        MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
        // Since we reversed the condition on the branch we need to change
        // the target for the unconditional branch or add an unconditional
        // branch to the old target. This has to be done manually since
        // fixBranches() is not called after SCTC.
        NeedsUncondBranch.emplace_back(PredBB, CondSucc);
        Count = PredBB->getFallthroughBranchInfo().Count;
      } else {
        // Change destination of the conditional branch.
        MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
        Count = PredBB->getTakenBranchInfo().Count;
      }
      const uint64_t CTCTakenFreq =
          Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;

      // Annotate it, so "isCall" returns true for this jcc.
      MIB->setConditionalTailCall(*CondBranch);
      // Add info about the conditional tail call frequency, otherwise this
      // info will be lost when we delete the associated BranchInfo entry.
      auto &CTCAnnotation =
          MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
      CTCAnnotation = CTCTakenFreq;
      // Preserve Offset annotation, used in BAT.
      // Instr is a direct tail call instruction that was created when CTCs are
      // first expanded, and has the original CTC offset set.
      if (std::optional<uint32_t> Offset = MIB->getOffset(*Instr))
        MIB->setOffset(*CondBranch, *Offset);

      // Remove the unused successor which may be eliminated later
      // if there are no other users.
      PredBB->removeSuccessor(BB);
      // Update BB execution count.
      if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
        BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
      else if (CTCTakenFreq > BB->getKnownExecutionCount())
        BB->setExecutionCount(0);

      ++NumLocalCTCs;
      LocalCTCTakenCount += CTCTakenFreq;
      LocalCTCExecCount += PredBB->getKnownExecutionCount();
    }

    // Remove the block from CFG if all predecessors were removed.
    BB->markValid(isValid(BB));
  }

  // Add unconditional branches at the end of BBs to new successors
  // as long as the successor is not a fallthrough.
  for (auto &Entry : NeedsUncondBranch) {
    BinaryBasicBlock *PredBB = Entry.first;
    const BinaryBasicBlock *CondSucc = Entry.second;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // Find the next valid block. Invalid blocks will be deleted
    // so they shouldn't be considered fallthrough targets.
    const BinaryBasicBlock *NextBlock =
        BF.getLayout().getBasicBlockAfter(PredBB, false);
    while (NextBlock && !isValid(NextBlock))
      NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);

    // Get the unconditional successor to this block.
    const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
    assert(PredSucc && "The other branch should be a tail call");

    const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);

    if (UncondBranch) {
      if (HasFallthrough)
        PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
      else
        MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
    } else if (!HasFallthrough) {
      MCInst Branch;
      MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
      PredBB->addInstruction(Branch);
    }
  }

  if (NumLocalCTCs > 0) {
    NumDoubleJumps += fixDoubleJumps(BF, true);
    // Clean-up unreachable tail-call blocks.
    const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
    DeletedBlocks += Stats.first;
    DeletedBytes += Stats.second;

    assert(BF.validateCFG());
  }

  LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
                    << " conditional tail calls from a total of "
                    << NumLocalCTCCandidates << " candidates in function " << BF
                    << ". CTCs execution count for this function is "
                    << LocalCTCExecCount << " and CTC taken count is "
                    << LocalCTCTakenCount << "\n";);

  NumTailCallsPatched += NumLocalCTCs;
  NumCandidateTailCalls += NumLocalCTCCandidates;
  CTCExecCount += LocalCTCExecCount;
  CTCTakenCount += LocalCTCTakenCount;

  return NumLocalCTCs > 0;
}

Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;

    if (!shouldOptimize(Function))
      continue;

    if (fixTailCalls(Function)) {
      Modified.insert(&Function);
      Function.setHasCanonicalCFG(false);
    }
  }

  if (NumTailCallsPatched)
    BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
              << " tail calls (" << NumOrigForwardBranches << " forward, "
              << NumOrigBackwardBranches << " backward)"
              << " from a total of " << NumCandidateTailCalls
              << " while removing " << NumDoubleJumps << " double jumps"
              << " and removing " << DeletedBlocks << " basic blocks"
              << " totalling " << DeletedBytes
              << " bytes of code. CTCs total execution count is "
              << CTCExecCount << " and the number of times CTCs are taken is "
              << CTCTakenCount << "\n";
  return Error::success();
}

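// Rewrite instructions into shorter equivalent encodings where
// MCPlusBuilder::shortenInstruction supports it, such as smaller immediate or
// operand encodings on x86. Instructions carrying an explicit Size annotation
// are skipped so that their original size is preserved.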
uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
  uint64_t Count = 0;
  const BinaryContext &BC = Function.getBinaryContext();
  for (BinaryBasicBlock &BB : Function) {
    for (MCInst &Inst : BB) {
      // Skip shortening instructions with Size annotation.
      if (BC.MIB->getSize(Inst))
        continue;

      MCInst OriginalInst;
      if (opts::Verbosity > 2)
        OriginalInst = Inst;

      if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
        continue;

      if (opts::Verbosity > 2) {
        // Hold the lock while printing so both instructions appear together.
        auto L = BC.scopeLock();
        BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
        BC.printInstruction(BC.outs(), OriginalInst, 0, &Function);
        BC.outs() << "BOLT-INFO: to:";
        BC.printInstruction(BC.outs(), Inst, 0, &Function);
      }

      ++Count;
    }
  }

  return Count;
}

Error ShortenInstructions::runOnFunctions(BinaryContext &BC) {
  std::atomic<uint64_t> NumShortened{0};
  if (!BC.isX86())
    return Error::success();

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
      [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
      nullptr, "ShortenInstructions");

  if (NumShortened)
    BC.outs() << "BOLT-INFO: " << NumShortened
              << " instructions were shortened\n";
  return Error::success();
}

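// Append a trap instruction after each indirect tail call. On a correct
// execution path the trap is never reached; it only fires if control
// unexpectedly falls past the call.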
void Peepholes::addTailcallTraps(BinaryFunction &Function) {
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    MCInst *Inst = BB.getLastNonPseudoInstr();
    if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
      MCInst Trap;
      MIB->createTrap(Trap);
      BB.addInstruction(Trap);
      ++TailCallTraps;
    }
  }
}

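// Remove conditional branches whose taken and fall-through successors are the
// same block: the condition has no effect on control flow, so the branch can
// be dropped and the duplicate successor merged.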
void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
  for (BinaryBasicBlock &BB : Function) {
    if (BB.succ_size() != 2)
      continue;

    BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
    BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
    if (CondBB != UncondBB)
      continue;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // analyzeBranch() can fail due to unusual branch instructions,
    // e.g. jrcxz, or jump tables (indirect jump).
    if (!Result || !CondBranch)
      continue;

    BB.removeDuplicateConditionalSuccessor(CondBranch);
    ++NumUselessCondBranches;
  }
}

Error Peepholes::runOnFunctions(BinaryContext &BC) {
  const char Opts =
      std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
                      [](const char A, const PeepholeOpts B) { return A | B; });
  if (Opts == PEEP_NONE)
    return Error::success();

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function)) {
      if (Opts & PEEP_DOUBLE_JUMPS)
        NumDoubleJumps += fixDoubleJumps(Function, false);
      if (Opts & PEEP_TAILCALL_TRAPS)
        addTailcallTraps(Function);
      if (Opts & PEEP_USELESS_BRANCHES)
        removeUselessCondBranches(Function);
      assert(Function.validateCFG());
    }
  }
  BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
            << " double jumps patched.\n"
            << "BOLT-INFO: Peephole: " << TailCallTraps
            << " tail call traps inserted.\n"
            << "BOLT-INFO: Peephole: " << NumUselessCondBranches
            << " useless conditional branches removed.\n";
  return Error::success();
}

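// Replace loads from read-only data with immediate operands when the target
// address can be computed statically and is not subject to relocation. An
// illustrative x86 shape of the rewrite (addresses and constants are made up):
//
//   mov  0x1234(%rip), %eax   # load from a read-only section
//   ->
//   mov  $0x2a, %eax          # constant folded from the section contents
//
// The actual rewrite is delegated to MCPlusBuilder::replaceMemOperandWithImm.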
bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address.
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(DisplSymbol, DisplOffset) =
            MIB->getTargetSymbolInfo(DispOpI->getExpr());

        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(TargetAddress);
      if (!DataSection || DataSection->isWritable())
        continue;

      if (BC.getRelocationAt(TargetAddress) ||
          BC.getDynamicRelocationAt(TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}

Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function) && simplifyRODataLoads(Function))
      Modified.insert(&Function);
  }

  BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
            << NumLoadsFound << " loads from a statically computed address.\n"
            << "BOLT-INFO: dynamic loads simplified: "
            << NumDynamicLoadsSimplified << "\n"
            << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound
            << "\n";
  return Error::success();
}

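// Assign output code sections: injected functions go to the injected-code
// sections, hot-text movers to their dedicated section, and, in relocation
// mode, functions without a valid layout index are placed in the cold code
// section while the cold parts of split functions get the cold section name.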
Error AssignSections::runOnFunctions(BinaryContext &BC) {
  for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
    if (!Function->isPatch()) {
      Function->setCodeSectionName(BC.getInjectedCodeSectionName());
      Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
    }
  }

  // In non-relocation mode functions have pre-assigned section names.
  if (!BC.HasRelocations)
    return Error::success();

  const bool UseColdSection =
      BC.NumProfiledFuncs > 0 ||
      opts::ReorderFunctions == ReorderFunctions::RT_USER;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (opts::isHotTextMover(Function)) {
      Function.setCodeSectionName(BC.getHotTextMoverSectionName());
      Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
      continue;
    }

    if (!UseColdSection || Function.hasValidIndex())
      Function.setCodeSectionName(BC.getMainCodeSectionName());
    else
      Function.setCodeSectionName(BC.getColdCodeSectionName());

    if (Function.isSplit())
      Function.setColdCodeSectionName(BC.getColdCodeSectionName());
  }
  return Error::success();
}

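// Report a profile "bias" score: for every sufficiently hot, non-entry block
// with outgoing profile data, compare the total incoming edge frequency with
// the total outgoing edge frequency, i.e. |out(BB) - in(BB)| / in(BB), and
// print the mean and standard deviation of this ratio across all considered
// blocks. A perfectly consistent profile would score 0%.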
Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute mean.
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      auto SuccBIIter = BB.branch_info_begin();
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute score for low frequency blocks, entry or exit blocks.
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      Mean += fabs(Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation.
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(FlowImbalanceVar);
  }

  // Report to user.
  BC.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                      (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    BC.outs() << "Worst average bias observed in "
              << WorstBiasFunc->getPrintName() << "\n";
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
  return Error::success();
}

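// Print overall profile quality statistics: how many functions have a
// (possibly stale or inferred) profile, how many samples they account for,
// stale-profile matching statistics collected during inference, and, if
// requested, profile density information. Exceeding the -stale-threshold
// limit is treated as a fatal error.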
1398Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
1399 uint64_t NumRegularFunctions = 0;
1400 uint64_t NumStaleProfileFunctions = 0;
1401 uint64_t NumAllStaleFunctions = 0;
1402 uint64_t NumInferredFunctions = 0;
1403 uint64_t NumNonSimpleProfiledFunctions = 0;
1404 uint64_t NumUnknownControlFlowFunctions = 0;
1405 uint64_t TotalSampleCount = 0;
1406 uint64_t StaleSampleCount = 0;
1407 uint64_t InferredSampleCount = 0;
1408 std::vector<const BinaryFunction *> ProfiledFunctions;
1409 std::vector<std::pair<double, uint64_t>> FuncDensityList;
1410 const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
1411 for (auto &BFI : BC.getBinaryFunctions()) {
1412 const BinaryFunction &Function = BFI.second;
1413
1414 // Ignore PLT functions for stats.
1415 if (Function.isPLTFunction())
1416 continue;
1417
1418 // Adjustment for BAT mode: the profile for BOLT split fragments is combined
1419 // so only count the hot fragment.
1420 const uint64_t Address = Function.getAddress();
1421 bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() &&
1422 BAT && BAT->isBATFunction(Address) &&
1423 !BAT->fetchParentAddress(Address);
1424
1425 ++NumRegularFunctions;
1426
1427 // In BOLTed binaries split functions are non-simple (due to non-relocation
1428 // mode), but the original function is known to be simple and we have a
1429 // valid profile for it.
1430 if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) {
1431 if (Function.hasProfile())
1432 ++NumNonSimpleProfiledFunctions;
1433 continue;
1434 }
1435
1436 if (Function.hasUnknownControlFlow()) {
1437 if (opts::PrintUnknownCFG)
1438 Function.dump();
1439 else if (opts::PrintUnknown)
1440 BC.errs() << "function with unknown control flow: " << Function << '\n';
1441
1442 ++NumUnknownControlFlowFunctions;
1443 }
1444
1445 if (!Function.hasProfile())
1446 continue;
1447
1448 uint64_t SampleCount = Function.getRawSampleCount();
1449 TotalSampleCount += SampleCount;
1450
1451 if (Function.hasValidProfile()) {
1452 ProfiledFunctions.push_back(x: &Function);
1453 if (Function.hasInferredProfile()) {
1454 ++NumInferredFunctions;
1455 InferredSampleCount += SampleCount;
1456 ++NumAllStaleFunctions;
1457 }
1458 } else {
1459 if (opts::ReportStaleFuncs) {
1460 BC.outs() << StaleFuncsHeader;
1461 StaleFuncsHeader = "";
1462 BC.outs() << " " << Function << '\n';
1463 }
1464 ++NumStaleProfileFunctions;
1465 StaleSampleCount += SampleCount;
1466 ++NumAllStaleFunctions;
1467 }
1468
1469 if (opts::ShowDensity) {
1470 uint64_t Size = Function.getSize();
1471 // In case of BOLT split functions registered in BAT, executed traces are
1472 // automatically attributed to the main fragment. Add up function sizes
1473 // for all fragments.
1474 if (IsHotParentOfBOLTSplitFunction)
1475 for (const BinaryFunction *Fragment : Function.getFragments())
1476 Size += Fragment->getSize();
1477 double Density = (double)1.0 * Function.getSampleCountInBytes() / Size;
1478 FuncDensityList.emplace_back(args&: Density, args&: SampleCount);
1479 LLVM_DEBUG(BC.outs() << Function << ": executed bytes "
1480 << Function.getSampleCountInBytes() << ", size (b) "
1481 << Size << ", density " << Density
1482 << ", sample count " << SampleCount << '\n');
1483 }
1484 }
1485 BC.NumProfiledFuncs = ProfiledFunctions.size();
1486 BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
1487
1488 const size_t NumAllProfiledFunctions =
1489 ProfiledFunctions.size() + NumStaleProfileFunctions;
1490 BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
1491 << NumRegularFunctions << " functions in the binary ("
1492 << format(Fmt: "%.1f", Vals: NumAllProfiledFunctions /
1493 (float)NumRegularFunctions * 100.0f)
1494 << "%) have non-empty execution profile\n";
1495 if (NumNonSimpleProfiledFunctions) {
1496 BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
1497 << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
1498 << " with profile could not be optimized\n";
1499 }
  if (NumAllStaleFunctions) {
    const float PctStale =
        NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f;
    const float PctStaleFuncsWithEqualBlockCount =
        (float)BC.Stats.NumStaleFuncsWithEqualBlockCount /
        NumAllStaleFunctions * 100.0f;
    const float PctStaleBlocksWithEqualIcount =
        (float)BC.Stats.NumStaleBlocksWithEqualIcount /
        BC.Stats.NumStaleBlocks * 100.0f;
    auto printErrorOrWarning = [&]() {
      if (PctStale > opts::StaleThreshold)
        BC.errs() << "BOLT-ERROR: ";
      else
        BC.errs() << "BOLT-WARNING: ";
    };
    printErrorOrWarning();
    BC.errs() << NumAllStaleFunctions
              << format(" (%.1f%% of all profiled)", PctStale) << " function"
              << (NumAllStaleFunctions == 1 ? "" : "s")
              << " have invalid (possibly stale) profile."
                 " Use -report-stale to see the list.\n";
    if (TotalSampleCount > 0) {
      printErrorOrWarning();
      BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of "
                << TotalSampleCount << " samples in the binary ("
                << format("%.1f",
                          ((100.0f * (StaleSampleCount + InferredSampleCount)) /
                           TotalSampleCount))
                << "%) belong to functions with invalid"
                   " (possibly stale) profile.\n";
    }
    BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
              << " stale function"
              << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
              << format(" (%.1f%% of all stale)",
                        PctStaleFuncsWithEqualBlockCount)
              << " have matching block count.\n";
    BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
              << " stale block"
              << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
              << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount)
              << " have matching icount.\n";
    if (PctStale > opts::StaleThreshold) {
      return createFatalBOLTError(
          Twine("BOLT-ERROR: stale functions exceed specified threshold of ") +
          Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n"));
    }
  }
  if (NumInferredFunctions) {
    BC.outs() << format(
        "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
        "%.2f%% of stale) functions responsible for %.2f%% samples"
        " (%zu out of %zu)\n",
        NumInferredFunctions,
        100.0 * NumInferredFunctions / NumAllProfiledFunctions,
        100.0 * NumInferredFunctions / NumAllStaleFunctions,
        100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount,
        TotalSampleCount);
    BC.outs() << format(
        "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
        " (%zu out of %zu stale) responsible for %.2f%% samples"
        " (%zu out of %zu stale)\n",
        100.0 * BC.Stats.NumExactMatchedBlocks / BC.Stats.NumStaleBlocks,
        BC.Stats.NumExactMatchedBlocks, BC.Stats.NumStaleBlocks,
        100.0 * BC.Stats.ExactMatchedSampleCount / BC.Stats.StaleSampleCount,
        BC.Stats.ExactMatchedSampleCount, BC.Stats.StaleSampleCount);
    BC.outs() << format(
        "BOLT-INFO: inference found an exact pseudo probe match for %.2f%% of "
        "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples"
        " (%zu out of %zu stale)\n",
        100.0 * BC.Stats.NumPseudoProbeExactMatchedBlocks /
            BC.Stats.NumStaleBlocks,
        BC.Stats.NumPseudoProbeExactMatchedBlocks, BC.Stats.NumStaleBlocks,
        100.0 * BC.Stats.PseudoProbeExactMatchedSampleCount /
            BC.Stats.StaleSampleCount,
        BC.Stats.PseudoProbeExactMatchedSampleCount, BC.Stats.StaleSampleCount);
    BC.outs() << format(
        "BOLT-INFO: inference found a loose pseudo probe match for %.2f%% of "
        "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples"
        " (%zu out of %zu stale)\n",
        100.0 * BC.Stats.NumPseudoProbeLooseMatchedBlocks /
            BC.Stats.NumStaleBlocks,
        BC.Stats.NumPseudoProbeLooseMatchedBlocks, BC.Stats.NumStaleBlocks,
        100.0 * BC.Stats.PseudoProbeLooseMatchedSampleCount /
            BC.Stats.StaleSampleCount,
        BC.Stats.PseudoProbeLooseMatchedSampleCount, BC.Stats.StaleSampleCount);
    BC.outs() << format(
        "BOLT-INFO: inference found a call match for %.2f%% of basic "
        "blocks"
        " (%zu out of %zu stale) responsible for %.2f%% samples"
        " (%zu out of %zu stale)\n",
        100.0 * BC.Stats.NumCallMatchedBlocks / BC.Stats.NumStaleBlocks,
        BC.Stats.NumCallMatchedBlocks, BC.Stats.NumStaleBlocks,
        100.0 * BC.Stats.CallMatchedSampleCount / BC.Stats.StaleSampleCount,
        BC.Stats.CallMatchedSampleCount, BC.Stats.StaleSampleCount);
    BC.outs() << format(
        "BOLT-INFO: inference found a loose match for %.2f%% of basic "
        "blocks"
        " (%zu out of %zu stale) responsible for %.2f%% samples"
        " (%zu out of %zu stale)\n",
        100.0 * BC.Stats.NumLooseMatchedBlocks / BC.Stats.NumStaleBlocks,
        BC.Stats.NumLooseMatchedBlocks, BC.Stats.NumStaleBlocks,
        100.0 * BC.Stats.LooseMatchedSampleCount / BC.Stats.StaleSampleCount,
        BC.Stats.LooseMatchedSampleCount, BC.Stats.StaleSampleCount);
  }

  if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
    BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects
              << " objects was ignored\n";
  }

  if (ProfiledFunctions.size() > 10) {
    if (opts::Verbosity >= 1) {
      BC.outs() << "BOLT-INFO: top called functions are:\n";
      llvm::sort(ProfiledFunctions,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return B->getExecutionCount() < A->getExecutionCount();
                 });
      auto SFI = ProfiledFunctions.begin();
      auto SFIend = ProfiledFunctions.end();
      for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
           ++SFI, ++I)
        BC.outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount()
                  << '\n';
    }
  }

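  // Print the top functions sorted by the dyno stats categories requested in
  // opts::PrintSortedBy.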
  if (!opts::PrintSortedBy.empty()) {
    std::vector<BinaryFunction *> Functions;
    std::map<const BinaryFunction *, DynoStats> Stats;

    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (shouldOptimize(BF) && BF.hasValidProfile()) {
        Functions.push_back(&BF);
        Stats.emplace(&BF, getDynoStats(BF));
      }
    }

    const bool SortAll =
        llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);

    const bool Ascending =
        opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;

    std::function<bool(const DynoStats &, const DynoStats &)>
        DynoStatsComparator =
            SortAll ? [](const DynoStats &StatsA,
                         const DynoStats &StatsB) { return StatsA < StatsB; }
                    : [](const DynoStats &StatsA, const DynoStats &StatsB) {
                        return StatsA.lessThan(StatsB, opts::PrintSortedBy);
                      };

    llvm::stable_sort(Functions,
                      [Ascending, &Stats, DynoStatsComparator](
                          const BinaryFunction *A, const BinaryFunction *B) {
                        auto StatsItr = Stats.find(A);
                        assert(StatsItr != Stats.end());
                        const DynoStats &StatsA = StatsItr->second;

                        StatsItr = Stats.find(B);
                        assert(StatsItr != Stats.end());
                        const DynoStats &StatsB = StatsItr->second;

                        return Ascending ? DynoStatsComparator(StatsA, StatsB)
                                         : DynoStatsComparator(StatsB, StatsA);
                      });

    BC.outs() << "BOLT-INFO: top functions sorted by ";
    if (SortAll) {
      BC.outs() << "dyno stats";
    } else {
      BC.outs() << "(";
      bool PrintComma = false;
      for (const DynoStats::Category Category : opts::PrintSortedBy) {
        if (PrintComma)
          BC.outs() << ", ";
        BC.outs() << DynoStats::Description(Category);
        PrintComma = true;
      }
      BC.outs() << ")";
    }

    BC.outs() << " are:\n";
    auto SFI = Functions.begin();
    for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
      const DynoStats Stats = getDynoStats(**SFI);
      BC.outs() << "  " << **SFI;
      if (!SortAll) {
        BC.outs() << " (";
        bool PrintComma = false;
        for (const DynoStats::Category Category : opts::PrintSortedBy) {
          if (PrintComma)
            BC.outs() << ", ";
          BC.outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
          PrintComma = true;
        }
        BC.outs() << ")";
      }
      BC.outs() << "\n";
    }
  }

  if (!BC.TrappedFunctions.empty()) {
    BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
              << (BC.TrappedFunctions.size() > 1 ? "s" : "")
              << " will trap on entry. Use -trap-avx512=0 to disable"
                 " traps.";
    if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
      BC.errs() << '\n';
      for (const BinaryFunction *Function : BC.TrappedFunctions)
        BC.errs() << "  " << *Function << '\n';
    } else {
      BC.errs() << " Use -v=1 to see the list.\n";
    }
  }

  // Collect and print information about suboptimal code layout on input.
  if (opts::ReportBadLayout) {
    std::vector<BinaryFunction *> SuboptimalFuncs;
    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (!BF.hasValidProfile())
        continue;

      const uint64_t HotThreshold =
          std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
      bool HotSeen = false;
      for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
        if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
          HotSeen = true;
          continue;
        }
        if (HotSeen && BB->getKnownExecutionCount() == 0) {
          SuboptimalFuncs.push_back(&BF);
          break;
        }
      }
    }

    if (!SuboptimalFuncs.empty()) {
      llvm::sort(SuboptimalFuncs,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return A->getKnownExecutionCount() / A->getSize() >
                          B->getKnownExecutionCount() / B->getSize();
                 });

      BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
                << " functions have "
                   "cold code in the middle of hot code. Top functions are:\n";
      for (unsigned I = 0;
           I < std::min(static_cast<size_t>(opts::ReportBadLayout),
                        SuboptimalFuncs.size());
           ++I)
        SuboptimalFuncs[I]->print(BC.outs());
    }
  }

  if (NumUnknownControlFlowFunctions) {
    BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
              << " functions have instructions with unknown control flow";
    if (!opts::PrintUnknown)
      BC.outs() << ". Use -print-unknown to see the list.";
    BC.outs() << '\n';
  }

  if (opts::ShowDensity) {
    double Density = 0.0;
    // Sorted by the density in descending order.
    llvm::stable_sort(FuncDensityList,
                      [&](const std::pair<double, uint64_t> &A,
                          const std::pair<double, uint64_t> &B) {
                        if (A.first != B.first)
                          return A.first > B.first;
                        return A.second < B.second;
                      });

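    // Find the minimum density among the hottest functions that together
    // account for ProfileDensityCutOffHot of the total samples (the cutoff is
    // expressed in units where 1000000 corresponds to 100%).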
    uint64_t AccumulatedSamples = 0;
    uint32_t I = 0;
    assert(opts::ProfileDensityCutOffHot <= 1000000 &&
           "The cutoff value is greater than 1000000(100%)");
    while (AccumulatedSamples <
               TotalSampleCount *
                   static_cast<float>(opts::ProfileDensityCutOffHot) /
                   1000000 &&
           I < FuncDensityList.size()) {
      AccumulatedSamples += FuncDensityList[I].second;
      Density = FuncDensityList[I].first;
      I++;
    }
    if (Density == 0.0) {
      BC.errs() << "BOLT-WARNING: the output profile is empty or the "
                   "--profile-density-cutoff-hot option is "
                   "set too low. Please check your command.\n";
    } else if (Density < opts::ProfileDensityThreshold) {
      BC.errs()
          << "BOLT-WARNING: BOLT is estimated to optimize better with "
          << format("%.1f", opts::ProfileDensityThreshold / Density)
          << "x more samples. Please consider increasing sampling rate or "
             "profiling for longer duration to get more samples.\n";
    }

    BC.outs() << "BOLT-INFO: Functions with density >= "
              << format("%.1f", Density) << " account for "
              << format("%.2f",
                        static_cast<double>(opts::ProfileDensityCutOffHot) /
                            10000)
              << "% total sample counts.\n";
  }
  return Error::success();
}

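// Lower BOLT's annotated tail-call instructions into the target-level form
// expected at emission time.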
Error InstructionLowering::runOnFunctions(BinaryContext &BC) {
  for (auto &BFI : BC.getBinaryFunctions())
    for (BinaryBasicBlock &BB : BFI.second)
      for (MCInst &Instruction : BB)
        BC.MIB->lowerTailCall(Instruction);
  return Error::success();
}

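// Strip the 'repz' prefix from 'repz ret' instructions. The prefix was a
// legacy branch-prediction workaround for some older AMD processors; removing
// it saves one byte per return.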
Error StripRepRet::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  uint64_t NumPrefixesRemoved = 0;
  uint64_t NumBytesSaved = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      auto LastInstRIter = BB.getLastNonPseudo();
      if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
          !BC.MIB->deleteREPPrefix(*LastInstRIter))
        continue;

      NumPrefixesRemoved += BB.getKnownExecutionCount();
      ++NumBytesSaved;
    }
  }

  if (NumBytesSaved)
    BC.outs() << "BOLT-INFO: removed " << NumBytesSaved
              << " 'repz' prefixes"
                 " with estimated execution count of "
              << NumPrefixesRemoved << " times.\n";
  return Error::success();
}

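// Inline calls to memcpy() with a compact instruction sequence provided by the
// target builder. Calls to _memcpy8 are treated as a variant that is expected
// to return a pointer past the end of the copied region. If the original call
// was a tail call, a return instruction is inserted after the inlined code.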
Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  uint64_t NumInlined = 0;
  uint64_t NumInlinedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      for (auto II = BB.begin(); II != BB.end(); ++II) {
        MCInst &Inst = *II;

        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" &&
            CalleeSymbol->getName() != "_memcpy8")
          continue;

        const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
        const bool IsTailCall = BC.MIB->isTailCall(Inst);

        const InstructionListType NewCode =
            BC.MIB->createInlineMemcpy(IsMemcpy8);
        II = BB.replaceInstruction(II, NewCode);
        std::advance(II, NewCode.size() - 1);
        if (IsTailCall) {
          MCInst Return;
          BC.MIB->createReturn(Return);
          II = BB.insertInstruction(std::next(II), std::move(Return));
        }

        ++NumInlined;
        NumInlinedDyno += BB.getKnownExecutionCount();
      }
    }
  }

  if (NumInlined) {
    BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
    if (NumInlinedDyno)
      BC.outs() << ". The calls were executed " << NumInlinedDyno
                << " times based on profile.";
    BC.outs() << '\n';
  }
  return Error::success();
}

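// Each entry of Spec has the form "<function-name-regex>[:<site>:<site>...]".
// A function is selected for specialization when its name matches one of the
// listed regexes.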
bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
  if (!BinaryFunctionPass::shouldOptimize(Function))
    return false;

  for (const std::string &FunctionSpec : Spec) {
    StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
    if (Function.hasNameRegex(FunctionName))
      return true;
  }

  return false;
}

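// Return the call-site numbers listed for this function in Spec. An empty set
// means every memcpy() call site in the function is eligible.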
std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
    const BinaryFunction &Function) const {
  StringRef SitesString;
  for (const std::string &FunctionSpec : Spec) {
    StringRef FunctionName;
    std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
    if (Function.hasNameRegex(FunctionName))
      break;
    SitesString = "";
  }

  std::set<size_t> Sites;
  SmallVector<StringRef, 4> SitesVec;
  SitesString.split(SitesVec, ':');
  for (StringRef SiteString : SitesVec) {
    if (SiteString.empty())
      continue;
    size_t Result;
    if (!SiteString.getAsInteger(10, Result))
      Sites.emplace(Result);
  }

  return Sites;
}

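// Rewrite each selected memcpy() call site: compare the size argument against
// 1 and branch to an inlined one-byte copy when it matches, falling through to
// the original call otherwise.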
Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  uint64_t NumSpecialized = 0;
  uint64_t NumSpecializedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(N);
    };

    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT")
          continue;

        if (BC.MIB->isTailCall(Inst))
          continue;

        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(NextBB->begin());
        } else {
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return");
        }

        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
                                OneByteMemcpyBB->getLabel(), BC.Ctx.get());
        CurBB->addInstructions(CmpJCC);
        CurBB->addSuccessor(MemcpyBB);

        MemcpyBB->addInstruction(std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    BC.outs() << "BOLT-INFO: specialized " << NumSpecialized
              << " memcpy() call sites for size 1";
    if (NumSpecializedDyno)
      BC.outs() << ". The calls were executed " << NumSpecializedDyno
                << " times based on profile.";
    BC.outs() << '\n';
  }
  return Error::success();
}

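// Remove no-op instructions that carry BOLT's "NOP" annotation; no-ops without
// the annotation are left in place.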
void RemoveNops::runOnFunction(BinaryFunction &BF) {
  const BinaryContext &BC = BF.getBinaryContext();
  for (BinaryBasicBlock &BB : BF) {
    for (int64_t I = BB.size() - 1; I >= 0; --I) {
      MCInst &Inst = BB.getInstructionAtIndex(I);
      if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
        BB.eraseInstructionAtIndex(I);
    }
  }
}

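// Run NOP removal on all functions in parallel, skipping functions that are
// required to preserve their no-ops.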
Error RemoveNops::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    runOnFunction(BF);
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return BF.shouldPreserveNops();
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "RemoveNops");
  return Error::success();
}

} // namespace bolt
} // namespace llvm