1//===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements multiple passes for binary optimization and analysis.
10//
11//===----------------------------------------------------------------------===//
12
13#include "bolt/Passes/BinaryPasses.h"
14#include "bolt/Core/FunctionLayout.h"
15#include "bolt/Core/ParallelUtilities.h"
16#include "bolt/Passes/ReorderAlgorithm.h"
17#include "bolt/Passes/ReorderFunctions.h"
18#include "llvm/Support/CommandLine.h"
19#include <atomic>
20#include <mutex>
21#include <numeric>
22#include <vector>
23
24#define DEBUG_TYPE "bolt-opts"
25
26using namespace llvm;
27using namespace bolt;
28
29static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30 assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
31 C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");
32
33 static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
34
35 OptNames[C] = bolt::DynoStats::Description(C);
36
37 std::replace(first: OptNames[C].begin(), last: OptNames[C].end(), old_value: ' ', new_value: '-');
38
39 return OptNames[C].c_str();
40}
41
namespace opts {

// Option categories shared across BOLT; defined in the driver.
extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;

// Options defined elsewhere that passes in this file consult.
extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> EnableBAT;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<bool> UpdateDebugSections;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;

// Sort direction used when printing functions ranked by dyno stats.
enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(Val: DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));

// Sections whose functions must be kept off the hot-text huge pages; consulted
// by isHotTextMover() below.
cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));
75bool isHotTextMover(const BinaryFunction &Function) {
76 for (std::string &SectionName : opts::HotTextMoveSections) {
77 if (Function.getOriginSectionName() &&
78 *Function.getOriginSectionName() == SectionName)
79 return true;
80 }
81
82 return false;
83}
84
// Prefer a clustering algorithm that minimizes the number of branches.
static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

// Selection of peephole optimizations to run; see Peepholes::PeepholeOpts.
static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

// Number of top-scoring functions to report on in ReorderBasicBlocks.
static cl::opt<unsigned>
    PrintFuncStat("print-function-statistics",
                  cl::desc("print statistics about basic block ordering"),
                  cl::init(Val: 0), cl::cat(BoltOptCategory));

// Report functions rejected by CheckLargeFunctions.
static cl::opt<bool> PrintLargeFunctions(
    "print-large-functions",
    cl::desc("print functions that could not be overwritten due to excessive "
             "size"),
    cl::init(Val: false), cl::cat(BoltOptCategory));

// Dyno-stat keys to rank functions by when printing; the value list is
// generated from REAL_DYNO_STATS via the D() macro.
static cl::list<bolt::DynoStats::Category>
    PrintSortedBy("print-sorted-by", cl::CommaSeparated,
                  cl::desc("print functions sorted by order of dyno stats"),
                  cl::value_desc("key1,key2,key3,..."),
                  cl::values(
#define D(name, description, ...)                                             \
  clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),   \
             description),
                      REAL_DYNO_STATS
#undef D
                          clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
                                     "sorted by all names")),
                  cl::ZeroOrMore, cl::cat(BoltOptCategory));

// Diagnostics for functions whose control flow could not be recovered.
static cl::opt<bool>
    PrintUnknown("print-unknown",
                 cl::desc("print names of functions with unknown control flow"),
                 cl::cat(BoltCategory), cl::Hidden);

static cl::opt<bool>
    PrintUnknownCFG("print-unknown-cfg",
                    cl::desc("dump CFG of functions with unknown control flow"),
                    cl::cat(BoltCategory), cl::ReallyHidden);
139
// Please MSVC19 with a forward declaration: otherwise it reports an error about
// an undeclared variable inside a callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(Val: bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    // Transparently upgrade the deprecated "cache+" spelling to "ext-tsp".
    cl::callback(CB: [](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
               << " use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

// Report the top <uint> functions whose input layout looks suboptimal.
static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(Val: 0), cl::Hidden, cl::cat(BoltOptCategory));

// List functions whose profile no longer matches the binary.
static cl::opt<bool>
    ReportStaleFuncs("report-stale",
                     cl::desc("print the list of functions with stale profile"),
                     cl::Hidden, cl::cat(BoltOptCategory));

// Policy for SimplifyConditionalTailCalls (SCTC); see shouldRewriteBranch().
enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(Val: SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
             clEnumValN(SctcPreserveDirection,
                        "preserve",
                        "only perform sctc when branch direction is "
                        "preserved"),
             clEnumValN(SctcHeuristic,
                        "heuristic",
                        "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

// Tolerated fraction of functions with stale profile data.
static cl::opt<unsigned>
StaleThreshold("stale-threshold",
  cl::desc(
      "maximum percentage of stale functions to tolerate (default: 100)"),
  cl::init(Val: 100),
  cl::Hidden,
  cl::cat(BoltOptCategory));

// Upper bound on basic blocks for using the exact TSP layout solver; larger
// functions fall back to heuristics (see modifyFunctionLayout()).
static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(Val: 10), cl::Hidden, cl::cat(BoltOptCategory));

// Cap on entries in the "top called functions" report.
static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(Val: 100), cl::Hidden, cl::cat(BoltCategory));

} // namespace opts
228
229namespace llvm {
230namespace bolt {
231
232bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
233 return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
234 !BF.isIgnored();
235}
236
237bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
238 return BF.isSimple() && !BF.isIgnored();
239}
240
void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  uint64_t NeedsFixBranches = 0;
  // Pass 1: bypass/remove empty basic blocks by redirecting their
  // predecessors straight to their successor.
  for (BinaryBasicBlock &BB : BF) {
    if (!BB.empty())
      continue;

    // Entry points and landing pads are referenced from outside the CFG;
    // never remove them, even when empty.
    if (BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(Valid: false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      // Jump-table edges cannot be retargeted here; keep the block.
      if (Predecessor->hasJumpTable())
        break;

      // Redirecting would make Predecessor branch to itself; leave as-is.
      if (Predecessor == Successor)
        break;

      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(Succ: BB);
      // Move the edge's profile counts along with the edge itself.
      Predecessor->replaceSuccessor(Succ: &BB, NewSucc: Successor, Count: BI.Count,
                                    MispredictedCount: BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    // Only delete the block once nothing references it anymore.
    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(Valid: false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(Condition: false) == BB.getConditionalSuccessor(Condition: true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  // Accumulate per-function results into the pass-wide statistics.
  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}
309
310Error NormalizeCFG::runOnFunctions(BinaryContext &BC) {
311 ParallelUtilities::runOnEachFunction(
312 BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
313 WorkFunction: [&](BinaryFunction &BF) { runOnFunction(BF); },
314 SkipPredicate: [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
315 LogName: "NormalizeCFG");
316 if (NumBlocksRemoved)
317 BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
318 << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
319 if (NumDuplicateEdgesMerged)
320 BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
321 << " duplicate CFG edge"
322 << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n';
323 return Error::success();
324}
325
326void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
327 BinaryContext &BC = Function.getBinaryContext();
328 unsigned Count;
329 uint64_t Bytes;
330 Function.markUnreachableBlocks();
331 LLVM_DEBUG({
332 for (BinaryBasicBlock &BB : Function) {
333 if (!BB.isValid()) {
334 dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
335 << " in function " << Function << "\n";
336 Function.dump();
337 }
338 }
339 });
340 BinaryContext::IndependentCodeEmitter Emitter =
341 BC.createIndependentMCCodeEmitter();
342 std::tie(args&: Count, args&: Bytes) = Function.eraseInvalidBBs(Emitter: Emitter.MCE.get());
343 DeletedBlocks += Count;
344 DeletedBytes += Bytes;
345 if (Count) {
346 auto L = BC.scopeLock();
347 Modified.insert(x: &Function);
348 if (opts::Verbosity > 0)
349 BC.outs() << "BOLT-INFO: removed " << Count
350 << " dead basic block(s) accounting for " << Bytes
351 << " bytes in function " << Function << '\n';
352 }
353}
354
355Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
356 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
357 runOnFunction(Function&: BF);
358 };
359
360 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
361 return !shouldOptimize(BF) || BF.getLayout().block_empty();
362 };
363
364 ParallelUtilities::runOnEachFunction(
365 BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFunction: WorkFun,
366 SkipPredicate, LogName: "elimininate-unreachable");
367
368 if (DeletedBlocks)
369 BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
370 << DeletedBytes << " bytes of code\n";
371 return Error::success();
372}
373
374bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
375 return (BinaryFunctionPass::shouldPrint(BF) &&
376 opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
377}
378
379bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
380 // Apply execution count threshold
381 if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
382 return false;
383
384 return BinaryFunctionPass::shouldOptimize(BF);
385}
386
387Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
388 if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
389 return Error::success();
390
391 std::atomic_uint64_t ModifiedFuncCount(0);
392 std::mutex FunctionEditDistanceMutex;
393 DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;
394
395 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
396 SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
397 if (opts::PrintFuncStat > 0)
398 llvm::copy(Range: BF.getLayout().blocks(), Out: std::back_inserter(x&: OldBlockOrder));
399
400 const bool LayoutChanged =
401 modifyFunctionLayout(Function&: BF, Type: opts::ReorderBlocks, MinBranchClusters: opts::MinBranchClusters);
402 if (LayoutChanged) {
403 ModifiedFuncCount.fetch_add(i: 1, m: std::memory_order_relaxed);
404 if (opts::PrintFuncStat > 0) {
405 const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
406 std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
407 FunctionEditDistance[&BF] = Distance;
408 }
409 }
410 };
411
412 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
413 return !shouldOptimize(BF);
414 };
415
416 ParallelUtilities::runOnEachFunction(
417 BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFunction: WorkFun, SkipPredicate: SkipFunc,
418 LogName: "ReorderBasicBlocks");
419 const size_t NumAllProfiledFunctions =
420 BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;
421
422 BC.outs() << "BOLT-INFO: basic block reordering modified layout of "
423 << format(
424 Fmt: "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
425 Vals: ModifiedFuncCount.load(m: std::memory_order_relaxed),
426 Vals: 100.0 * ModifiedFuncCount.load(m: std::memory_order_relaxed) /
427 NumAllProfiledFunctions,
428 Vals: 100.0 * ModifiedFuncCount.load(m: std::memory_order_relaxed) /
429 BC.getBinaryFunctions().size());
430
431 if (opts::PrintFuncStat > 0) {
432 raw_ostream &OS = BC.outs();
433 // Copy all the values into vector in order to sort them
434 std::map<uint64_t, BinaryFunction &> ScoreMap;
435 auto &BFs = BC.getBinaryFunctions();
436 for (auto It = BFs.begin(); It != BFs.end(); ++It)
437 ScoreMap.insert(x: std::pair<uint64_t, BinaryFunction &>(
438 It->second.getFunctionScore(), It->second));
439
440 OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
441 OS << " There are " << BFs.size() << " functions in total. \n";
442 OS << " Number of functions being modified: "
443 << ModifiedFuncCount.load(m: std::memory_order_relaxed) << "\n";
444 OS << " User asks for detailed information on top "
445 << opts::PrintFuncStat << " functions. (Ranked by function score)"
446 << "\n\n";
447 uint64_t I = 0;
448 for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
449 ScoreMap.rbegin();
450 Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
451 BinaryFunction &Function = Rit->second;
452
453 OS << " Information for function of top: " << (I + 1) << ": \n";
454 OS << " Function Score is: " << Function.getFunctionScore()
455 << "\n";
456 OS << " There are " << Function.size()
457 << " number of blocks in this function.\n";
458 OS << " There are " << Function.getInstructionCount()
459 << " number of instructions in this function.\n";
460 OS << " The edit distance for this function is: "
461 << FunctionEditDistance.lookup(Val: &Function) << "\n\n";
462 }
463 }
464 return Error::success();
465}
466
467bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
468 LayoutType Type,
469 bool MinBranchClusters) const {
470 if (BF.size() == 0 || Type == LT_NONE)
471 return false;
472
473 BinaryFunction::BasicBlockOrderType NewLayout;
474 std::unique_ptr<ReorderAlgorithm> Algo;
475
476 // Cannot do optimal layout without profile.
477 if (Type != LT_REVERSE && !BF.hasValidProfile())
478 return false;
479
480 if (Type == LT_REVERSE) {
481 Algo.reset(p: new ReverseReorderAlgorithm());
482 } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
483 // Work on optimal solution if problem is small enough
484 LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
485 Algo.reset(p: new TSPReorderAlgorithm());
486 } else {
487 LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
488
489 std::unique_ptr<ClusterAlgorithm> CAlgo;
490 if (MinBranchClusters)
491 CAlgo.reset(p: new MinBranchGreedyClusterAlgorithm());
492 else
493 CAlgo.reset(p: new PHGreedyClusterAlgorithm());
494
495 switch (Type) {
496 case LT_OPTIMIZE:
497 Algo.reset(p: new OptimizeReorderAlgorithm(std::move(CAlgo)));
498 break;
499
500 case LT_OPTIMIZE_BRANCH:
501 Algo.reset(p: new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
502 break;
503
504 case LT_OPTIMIZE_CACHE:
505 Algo.reset(p: new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
506 break;
507
508 case LT_OPTIMIZE_EXT_TSP:
509 Algo.reset(p: new ExtTSPReorderAlgorithm());
510 break;
511
512 case LT_OPTIMIZE_SHUFFLE:
513 Algo.reset(p: new RandomClusterReorderAlgorithm(std::move(CAlgo)));
514 break;
515
516 default:
517 llvm_unreachable("unexpected layout type");
518 }
519 }
520
521 Algo->reorderBasicBlocks(BF, Order&: NewLayout);
522
523 return BF.getLayout().update(NewLayout);
524}
525
526Error FixupBranches::runOnFunctions(BinaryContext &BC) {
527 for (auto &It : BC.getBinaryFunctions()) {
528 BinaryFunction &Function = It.second;
529 if (!BC.shouldEmit(Function) || !Function.isSimple())
530 continue;
531
532 Function.fixBranches();
533 }
534 return Error::success();
535}
536
537Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
538 std::atomic<bool> HasFatal{false};
539 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
540 if (!BF.finalizeCFIState()) {
541 if (BC.HasRelocations) {
542 BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
543 << ". Exiting.\n";
544 HasFatal = true;
545 return;
546 }
547 BF.setSimple(false);
548 return;
549 }
550
551 BF.setFinalized();
552
553 // Update exception handling information.
554 BF.updateEHRanges();
555 };
556
557 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
558 return !BC.shouldEmit(Function: BF);
559 };
560
561 ParallelUtilities::runOnEachFunction(
562 BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFunction: WorkFun,
563 SkipPredicate, LogName: "FinalizeFunctions");
564 if (HasFatal)
565 return createFatalBOLTError(S: "finalize CFI state failure");
566 return Error::success();
567}
568
569Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
570 if (BC.HasRelocations)
571 return Error::success();
572
573 // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
574 // incorrect meta data.
575 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
576 uint64_t HotSize, ColdSize;
577 std::tie(args&: HotSize, args&: ColdSize) =
578 BC.calculateEmittedSize(BF, /*FixBranches=*/false);
579 if (HotSize > BF.getMaxSize()) {
580 if (opts::PrintLargeFunctions)
581 BC.outs() << "BOLT-INFO: " << BF << " size exceeds allocated space by "
582 << (HotSize - BF.getMaxSize()) << " bytes\n";
583 BF.setSimple(false);
584 }
585 };
586
587 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
588 return !shouldOptimize(BF);
589 };
590
591 ParallelUtilities::runOnEachFunction(
592 BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFunction: WorkFun,
593 SkipPredicate: SkipFunc, LogName: "CheckLargeFunctions");
594
595 return Error::success();
596}
597
598bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
599 // Unlike other passes, allow functions in non-CFG state.
600 return BF.isSimple() && !BF.isIgnored();
601}
602
603Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
604 // Convert GnuArgsSize annotations into CFIs.
605 for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
606 for (FunctionFragment &FF : BF->getLayout().fragments()) {
607 // Reset at the start of the new fragment.
608 int64_t CurrentGnuArgsSize = 0;
609
610 for (BinaryBasicBlock *const BB : FF) {
611 for (auto II = BB->begin(); II != BB->end(); ++II) {
612 if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(Inst: *II))
613 continue;
614
615 const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(Inst: *II);
616 assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size.");
617 if (NewGnuArgsSize == CurrentGnuArgsSize)
618 continue;
619
620 auto InsertII = BF->addCFIInstruction(
621 BB, Pos: II,
622 Inst: MCCFIInstruction::createGnuArgsSize(L: nullptr, Size: NewGnuArgsSize));
623 CurrentGnuArgsSize = NewGnuArgsSize;
624 II = std::next(x: InsertII);
625 }
626 }
627 }
628 }
629 return Error::success();
630}
631
632// Check for dirty state in MCSymbol objects that might be a consequence
633// of running calculateEmittedSize() in parallel, during split functions
634// pass. If an inconsistent state is found (symbol already registered or
635// already defined), clean it.
Error CleanMCState::runOnFunctions(BinaryContext &BC) {
  MCContext &Ctx = *BC.Ctx;
  // Walk the entire symbol table and reset any state left over from a
  // previous, possibly parallel, emission.
  for (const auto &SymMapEntry : Ctx.getSymbols()) {
    const MCSymbol *S = SymMapEntry.second;
    if (S->isDefined()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already defined\n");
      // const_cast is deliberate: the symbol table hands out const pointers,
      // but this pass owns the context and must reset the flag.
      const_cast<MCSymbol *>(S)->setUndefined();
    }
    if (S->isRegistered()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already registered\n");
      const_cast<MCSymbol *>(S)->setIsRegistered(false);
    }
    // Variable symbols are only reported (debug builds), never modified.
    LLVM_DEBUG(if (S->isVariable()) {
      dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
    });
  }
  return Error::success();
}
656
657// This peephole fixes jump instructions that jump to another basic
658// block with a single jump instruction, e.g.
659//
660// B0: ...
661// jmp B1 (or jcc B1)
662//
663// B1: jmp B2
664//
665// ->
666//
667// B0: ...
668// jmp B2 (or jcc B2)
669//
/// Forward jumps that target a block consisting of a single jump (see the
/// diagram above). Returns the number of double jumps removed. With
/// \p MarkInvalid, blocks left without predecessors are marked for removal
/// by a subsequent UCE run.
static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    // Rewire one Pred->BB edge to Pred->Succ (or to a tail call to SuccSym
    // when Succ is null) and patch the branch instructions to match.
    // Returns true if the edge was patched.
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(Succ: &BB, NewSucc: Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(Inst: *CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(Inst&: *CondBranch, TBB: Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(Inst: *UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(Inst&: *UncondBranch, TBB: Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          // Pred fell through to BB; it now needs an explicit jump unless
          // Succ is its new fallthrough.
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Successor: Succ);
        }
      } else {
        // Succ will be null in the tail call case. In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(Inst: *Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(Succ: &BB);
          Pred->eraseInstruction(II: Pred->findInstruction(Inst: Branch));
          Pred->addTailCallInstruction(Target: SuccSym);
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    // Candidate blocks contain exactly one real instruction and are not
    // landing pads.
    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(Inst: *Inst);

    // Only unconditional jumps and tail calls can be forwarded.
    if (!MIB->isUnconditionalBranch(Inst: *Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(Inst: *Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(Inst: *Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    // Need an in-function successor, or for tail calls a target symbol.
    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    // Copy the predecessor list up front: checkAndPatch mutates BB's
    // predecessor set while we iterate.
    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(Condition: true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(Condition: false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
          // BB may now be unreachable; mark it for removal by UCE if so.
          BB.markValid(Valid: BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}
768
769bool SimplifyConditionalTailCalls::shouldRewriteBranch(
770 const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
771 const BinaryBasicBlock *BB, const bool DirectionFlag) {
772 if (BeenOptimized.count(x: PredBB))
773 return false;
774
775 const bool IsForward = BinaryFunction::isForwardBranch(From: PredBB, To: BB);
776
777 if (IsForward)
778 ++NumOrigForwardBranches;
779 else
780 ++NumOrigBackwardBranches;
781
782 if (opts::SctcMode == opts::SctcAlways)
783 return true;
784
785 if (opts::SctcMode == opts::SctcPreserveDirection)
786 return IsForward == DirectionFlag;
787
788 const ErrorOr<std::pair<double, double>> Frequency =
789 PredBB->getBranchStats(Succ: BB);
790
791 // It's ok to rewrite the conditional branch if the new target will be
792 // a backward branch.
793
794 // If no data available for these branches, then it should be ok to
795 // do the optimization since it will reduce code size.
796 if (Frequency.getError())
797 return true;
798
799 // TODO: should this use misprediction frequency instead?
800 const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
801 (!IsForward && Frequency.get().first <= 0.5);
802
803 return Result == DirectionFlag;
804}
805
806uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
807 // Need updated indices to correctly detect branch' direction.
808 BF.getLayout().updateLayoutIndices();
809 BF.markUnreachableBlocks();
810
811 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
812 MCContext *Ctx = BF.getBinaryContext().Ctx.get();
813 uint64_t NumLocalCTCCandidates = 0;
814 uint64_t NumLocalCTCs = 0;
815 uint64_t LocalCTCTakenCount = 0;
816 uint64_t LocalCTCExecCount = 0;
817 std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
818 NeedsUncondBranch;
819
820 // Will block be deleted by UCE?
821 auto isValid = [](const BinaryBasicBlock *BB) {
822 return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
823 };
824
825 for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
826 // Locate BB with a single direct tail-call instruction.
827 if (BB->getNumNonPseudos() != 1)
828 continue;
829
830 MCInst *Instr = BB->getFirstNonPseudoInstr();
831 if (!MIB->isTailCall(Inst: *Instr) || MIB->isConditionalBranch(Inst: *Instr))
832 continue;
833
834 const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(Inst: *Instr);
835 if (!CalleeSymbol)
836 continue;
837
838 // Detect direction of the possible conditional tail call.
839 const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
840
841 // Iterate through all predecessors.
842 for (BinaryBasicBlock *PredBB : BB->predecessors()) {
843 BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(Condition: true);
844 if (!CondSucc)
845 continue;
846
847 ++NumLocalCTCCandidates;
848
849 const MCSymbol *TBB = nullptr;
850 const MCSymbol *FBB = nullptr;
851 MCInst *CondBranch = nullptr;
852 MCInst *UncondBranch = nullptr;
853 bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
854
855 // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
856 if (!Result) {
857 LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
858 PredBB->dump());
859 continue;
860 }
861
862 assert(Result && "internal error analyzing conditional branch");
863 assert(CondBranch && "conditional branch expected");
864
865 // Skip dynamic branches for now.
866 if (BF.getBinaryContext().MIB->isDynamicBranch(Inst: *CondBranch))
867 continue;
868
869 // It's possible that PredBB is also a successor to BB that may have
870 // been processed by a previous iteration of the SCTC loop, in which
871 // case it may have been marked invalid. We should skip rewriting in
872 // this case.
873 if (!PredBB->isValid()) {
874 assert(PredBB->isSuccessor(BB) &&
875 "PredBB should be valid if it is not a successor to BB");
876 continue;
877 }
878
879 // We don't want to reverse direction of the branch in new order
880 // without further profile analysis.
881 const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
882 if (!shouldRewriteBranch(PredBB, CondBranch: *CondBranch, BB, DirectionFlag))
883 continue;
884
885 // Record this block so that we don't try to optimize it twice.
886 BeenOptimized.insert(x: PredBB);
887
888 uint64_t Count = 0;
889 if (CondSucc != BB) {
890 // Patch the new target address into the conditional branch.
891 MIB->reverseBranchCondition(Inst&: *CondBranch, TBB: CalleeSymbol, Ctx);
892 // Since we reversed the condition on the branch we need to change
893 // the target for the unconditional branch or add a unconditional
894 // branch to the old target. This has to be done manually since
895 // fixupBranches is not called after SCTC.
896 NeedsUncondBranch.emplace_back(args&: PredBB, args&: CondSucc);
897 Count = PredBB->getFallthroughBranchInfo().Count;
898 } else {
899 // Change destination of the conditional branch.
900 MIB->replaceBranchTarget(Inst&: *CondBranch, TBB: CalleeSymbol, Ctx);
901 Count = PredBB->getTakenBranchInfo().Count;
902 }
903 const uint64_t CTCTakenFreq =
904 Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
905
906 // Annotate it, so "isCall" returns true for this jcc
907 MIB->setConditionalTailCall(Inst&: *CondBranch);
908 // Add info about the conditional tail call frequency, otherwise this
909 // info will be lost when we delete the associated BranchInfo entry
910 auto &CTCAnnotation =
911 MIB->getOrCreateAnnotationAs<uint64_t>(Inst&: *CondBranch, Name: "CTCTakenCount");
912 CTCAnnotation = CTCTakenFreq;
913
914 // Remove the unused successor which may be eliminated later
915 // if there are no other users.
916 PredBB->removeSuccessor(Succ: BB);
917 // Update BB execution count
918 if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
919 BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
920 else if (CTCTakenFreq > BB->getKnownExecutionCount())
921 BB->setExecutionCount(0);
922
923 ++NumLocalCTCs;
924 LocalCTCTakenCount += CTCTakenFreq;
925 LocalCTCExecCount += PredBB->getKnownExecutionCount();
926 }
927
928 // Remove the block from CFG if all predecessors were removed.
929 BB->markValid(Valid: isValid(BB));
930 }
931
932 // Add unconditional branches at the end of BBs to new successors
933 // as long as the successor is not a fallthrough.
934 for (auto &Entry : NeedsUncondBranch) {
935 BinaryBasicBlock *PredBB = Entry.first;
936 const BinaryBasicBlock *CondSucc = Entry.second;
937
938 const MCSymbol *TBB = nullptr;
939 const MCSymbol *FBB = nullptr;
940 MCInst *CondBranch = nullptr;
941 MCInst *UncondBranch = nullptr;
942 PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
943
944 // Find the next valid block. Invalid blocks will be deleted
945 // so they shouldn't be considered fallthrough targets.
946 const BinaryBasicBlock *NextBlock =
947 BF.getLayout().getBasicBlockAfter(BB: PredBB, IgnoreSplits: false);
948 while (NextBlock && !isValid(NextBlock))
949 NextBlock = BF.getLayout().getBasicBlockAfter(BB: NextBlock, IgnoreSplits: false);
950
951 // Get the unconditional successor to this block.
952 const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
953 assert(PredSucc && "The other branch should be a tail call");
954
955 const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
956
957 if (UncondBranch) {
958 if (HasFallthrough)
959 PredBB->eraseInstruction(II: PredBB->findInstruction(Inst: UncondBranch));
960 else
961 MIB->replaceBranchTarget(Inst&: *UncondBranch, TBB: CondSucc->getLabel(), Ctx);
962 } else if (!HasFallthrough) {
963 MCInst Branch;
964 MIB->createUncondBranch(Inst&: Branch, TBB: CondSucc->getLabel(), Ctx);
965 PredBB->addInstruction(Inst: Branch);
966 }
967 }
968
969 if (NumLocalCTCs > 0) {
970 NumDoubleJumps += fixDoubleJumps(Function&: BF, MarkInvalid: true);
971 // Clean-up unreachable tail-call blocks.
972 const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
973 DeletedBlocks += Stats.first;
974 DeletedBytes += Stats.second;
975
976 assert(BF.validateCFG());
977 }
978
979 LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
980 << " conditional tail calls from a total of "
981 << NumLocalCTCCandidates << " candidates in function " << BF
982 << ". CTCs execution count for this function is "
983 << LocalCTCExecCount << " and CTC taken count is "
984 << LocalCTCTakenCount << "\n";);
985
986 NumTailCallsPatched += NumLocalCTCs;
987 NumCandidateTailCalls += NumLocalCTCCandidates;
988 CTCExecCount += LocalCTCExecCount;
989 CTCTakenCount += LocalCTCTakenCount;
990
991 return NumLocalCTCs > 0;
992}
993
994Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
995 if (!BC.isX86())
996 return Error::success();
997
998 for (auto &It : BC.getBinaryFunctions()) {
999 BinaryFunction &Function = It.second;
1000
1001 if (!shouldOptimize(BF: Function))
1002 continue;
1003
1004 if (fixTailCalls(BF&: Function)) {
1005 Modified.insert(x: &Function);
1006 Function.setHasCanonicalCFG(false);
1007 }
1008 }
1009
1010 if (NumTailCallsPatched)
1011 BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
1012 << " tail calls (" << NumOrigForwardBranches << " forward)"
1013 << " tail calls (" << NumOrigBackwardBranches << " backward)"
1014 << " from a total of " << NumCandidateTailCalls
1015 << " while removing " << NumDoubleJumps << " double jumps"
1016 << " and removing " << DeletedBlocks << " basic blocks"
1017 << " totalling " << DeletedBytes
1018 << " bytes of code. CTCs total execution count is "
1019 << CTCExecCount << " and the number of times CTCs are taken is "
1020 << CTCTakenCount << "\n";
1021 return Error::success();
1022}
1023
1024uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
1025 uint64_t Count = 0;
1026 const BinaryContext &BC = Function.getBinaryContext();
1027 for (BinaryBasicBlock &BB : Function) {
1028 for (MCInst &Inst : BB) {
1029 // Skip shortening instructions with Size annotation.
1030 if (BC.MIB->getSize(Inst))
1031 continue;
1032
1033 MCInst OriginalInst;
1034 if (opts::Verbosity > 2)
1035 OriginalInst = Inst;
1036
1037 if (!BC.MIB->shortenInstruction(Inst, STI: *BC.STI))
1038 continue;
1039
1040 if (opts::Verbosity > 2) {
1041 BC.scopeLock();
1042 BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
1043 BC.printInstruction(OS&: BC.outs(), Instruction: OriginalInst, Offset: 0, Function: &Function);
1044 BC.outs() << "BOLT-INFO: to:";
1045 BC.printInstruction(OS&: BC.outs(), Instruction: Inst, Offset: 0, Function: &Function);
1046 }
1047
1048 ++Count;
1049 }
1050 }
1051
1052 return Count;
1053}
1054
1055Error ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1056 std::atomic<uint64_t> NumShortened{0};
1057 if (!BC.isX86())
1058 return Error::success();
1059
1060 ParallelUtilities::runOnEachFunction(
1061 BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1062 WorkFunction: [&](BinaryFunction &BF) { NumShortened += shortenInstructions(Function&: BF); },
1063 SkipPredicate: nullptr, LogName: "ShortenInstructions");
1064
1065 if (NumShortened)
1066 BC.outs() << "BOLT-INFO: " << NumShortened
1067 << " instructions were shortened\n";
1068 return Error::success();
1069}
1070
1071void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1072 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1073 for (BinaryBasicBlock &BB : Function) {
1074 MCInst *Inst = BB.getLastNonPseudoInstr();
1075 if (Inst && MIB->isTailCall(Inst: *Inst) && MIB->isIndirectBranch(Inst: *Inst)) {
1076 MCInst Trap;
1077 MIB->createTrap(Inst&: Trap);
1078 BB.addInstruction(Inst: Trap);
1079 ++TailCallTraps;
1080 }
1081 }
1082}
1083
1084void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1085 for (BinaryBasicBlock &BB : Function) {
1086 if (BB.succ_size() != 2)
1087 continue;
1088
1089 BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(Condition: true);
1090 BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(Condition: false);
1091 if (CondBB != UncondBB)
1092 continue;
1093
1094 const MCSymbol *TBB = nullptr;
1095 const MCSymbol *FBB = nullptr;
1096 MCInst *CondBranch = nullptr;
1097 MCInst *UncondBranch = nullptr;
1098 bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1099
1100 // analyzeBranch() can fail due to unusual branch instructions,
1101 // e.g. jrcxz, or jump tables (indirect jump).
1102 if (!Result || !CondBranch)
1103 continue;
1104
1105 BB.removeDuplicateConditionalSuccessor(CondBranch);
1106 ++NumUselessCondBranches;
1107 }
1108}
1109
1110Error Peepholes::runOnFunctions(BinaryContext &BC) {
1111 const char Opts =
1112 std::accumulate(first: opts::Peepholes.begin(), last: opts::Peepholes.end(), init: 0,
1113 binary_op: [](const char A, const PeepholeOpts B) { return A | B; });
1114 if (Opts == PEEP_NONE)
1115 return Error::success();
1116
1117 for (auto &It : BC.getBinaryFunctions()) {
1118 BinaryFunction &Function = It.second;
1119 if (shouldOptimize(BF: Function)) {
1120 if (Opts & PEEP_DOUBLE_JUMPS)
1121 NumDoubleJumps += fixDoubleJumps(Function, MarkInvalid: false);
1122 if (Opts & PEEP_TAILCALL_TRAPS)
1123 addTailcallTraps(Function);
1124 if (Opts & PEEP_USELESS_BRANCHES)
1125 removeUselessCondBranches(Function);
1126 assert(Function.validateCFG());
1127 }
1128 }
1129 BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1130 << " double jumps patched.\n"
1131 << "BOLT-INFO: Peephole: " << TailCallTraps
1132 << " tail call traps inserted.\n"
1133 << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1134 << " useless conditional branches removed.\n";
1135 return Error::success();
1136}
1137
// Replace loads whose target address is statically computable and resides in
// a read-only, relocation-free data section with an equivalent immediate
// operand. Updates the pass-wide statistics and returns true if at least one
// load in \p BF was simplified.
bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  // Per-function counters; folded into the pass-wide totals at the end.
  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address;
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(args&: DisplSymbol, args&: DisplOffset) =
            MIB->getTargetSymbolInfo(Expr: DispOpI->getExpr());

        // A non-trivial expression without a resolvable symbol cannot be
        // evaluated statically.
        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(Name: DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, Target&: TargetAddress)) {
        // Neither PC-relative nor otherwise statically evaluable.
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(Address: TargetAddress);
      if (!DataSection || DataSection->isWritable())
        continue;

      // A relocation at the target means the bytes may change at link or
      // load time, so the current contents cannot be baked in.
      if (BC.getRelocationAt(Address: TargetAddress) ||
          BC.getDynamicRelocationAt(Address: TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      // The rewrite may still fail (e.g. unsupported operand form); only
      // count it as simplified on success.
      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}
1219
1220Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1221 for (auto &It : BC.getBinaryFunctions()) {
1222 BinaryFunction &Function = It.second;
1223 if (shouldOptimize(BF: Function) && simplifyRODataLoads(BF&: Function))
1224 Modified.insert(x: &Function);
1225 }
1226
1227 BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1228 << NumLoadsFound << " loads from a statically computed address.\n"
1229 << "BOLT-INFO: dynamic loads simplified: "
1230 << NumDynamicLoadsSimplified << "\n"
1231 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound
1232 << "\n";
1233 return Error::success();
1234}
1235
1236Error AssignSections::runOnFunctions(BinaryContext &BC) {
1237 for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1238 Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1239 Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1240 }
1241
1242 // In non-relocation mode functions have pre-assigned section names.
1243 if (!BC.HasRelocations)
1244 return Error::success();
1245
1246 const bool UseColdSection =
1247 BC.NumProfiledFuncs > 0 ||
1248 opts::ReorderFunctions == ReorderFunctions::RT_USER;
1249 for (auto &BFI : BC.getBinaryFunctions()) {
1250 BinaryFunction &Function = BFI.second;
1251 if (opts::isHotTextMover(Function)) {
1252 Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1253 Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1254 continue;
1255 }
1256
1257 if (!UseColdSection || Function.hasValidIndex())
1258 Function.setCodeSectionName(BC.getMainCodeSectionName());
1259 else
1260 Function.setCodeSectionName(BC.getColdCodeSectionName());
1261
1262 if (Function.isSplit())
1263 Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1264 }
1265 return Error::success();
1266}
1267
// Measure profile flow-conservation quality ("bias"). For every simple
// function's CFG, the sum of incoming edge frequencies of a block is compared
// against the sum of its outgoing edge frequencies; in a self-consistent
// profile these match for interior blocks. Prints the mean and standard
// deviation of the relative imbalance and, at -v=1, the function with the
// worst average bias.
Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute mean
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      auto SuccBIIter = BB.branch_info_begin();
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        // Unprofiled or zero-count edges contribute nothing to either map.
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute score for low frequency blocks, entry or exit blocks
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      // Relative imbalance of this block, normalized by incoming flow.
      Mean += fabs(x: Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      // NOTE(review): unlike the mean loop above, this filter does not
      // exclude entry points (no BB.isEntryPoint() check) — confirm whether
      // the asymmetry between the two passes is intentional.
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(x: fabs(x: Difference / IncomingMap[&BB]) - FlowImbalanceMean, y: 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    // Convert variance to standard deviation for reporting.
    FlowImbalanceVar = sqrt(x: FlowImbalanceVar);
  }

  // Report to user
  BC.outs() << format(Fmt: "BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                      Vals: (100.0 * FlowImbalanceMean), Vals: (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    BC.outs() << "Worst average bias observed in "
              << WorstBiasFunc->getPrintName() << "\n";
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
  return Error::success();
}
1361
// Report aggregate profile and program statistics over all functions:
// profiled/stale/inferred function counts and their sample coverage, the
// top called functions, dyno-stats-sorted listings, trapped functions,
// missed macro-fusion opportunities, and suboptimal-layout diagnostics.
// Returns a fatal error when the fraction of stale-profile functions
// exceeds opts::StaleThreshold.
Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
  uint64_t NumRegularFunctions = 0;
  uint64_t NumStaleProfileFunctions = 0;
  uint64_t NumAllStaleFunctions = 0;
  uint64_t NumInferredFunctions = 0;
  uint64_t NumNonSimpleProfiledFunctions = 0;
  uint64_t NumUnknownControlFlowFunctions = 0;
  uint64_t TotalSampleCount = 0;
  uint64_t StaleSampleCount = 0;
  uint64_t InferredSampleCount = 0;
  std::vector<const BinaryFunction *> ProfiledFunctions;
  // Printed once before the first stale function, then blanked out.
  const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
  // First pass: classify every function and accumulate the counters above.
  for (auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;

    // Ignore PLT functions for stats.
    if (Function.isPLTFunction())
      continue;

    ++NumRegularFunctions;

    if (!Function.isSimple()) {
      if (Function.hasProfile())
        ++NumNonSimpleProfiledFunctions;
      continue;
    }

    if (Function.hasUnknownControlFlow()) {
      if (opts::PrintUnknownCFG)
        Function.dump();
      else if (opts::PrintUnknown)
        BC.errs() << "function with unknown control flow: " << Function << '\n';

      ++NumUnknownControlFlowFunctions;
    }

    if (!Function.hasProfile())
      continue;

    uint64_t SampleCount = Function.getRawBranchCount();
    TotalSampleCount += SampleCount;

    if (Function.hasValidProfile()) {
      ProfiledFunctions.push_back(x: &Function);
      // Inferred profiles are counted both as valid and as stale.
      if (Function.hasInferredProfile()) {
        ++NumInferredFunctions;
        InferredSampleCount += SampleCount;
        ++NumAllStaleFunctions;
      }
    } else {
      if (opts::ReportStaleFuncs) {
        BC.outs() << StaleFuncsHeader;
        StaleFuncsHeader = "";
        BC.outs() << "  " << Function << '\n';
      }
      ++NumStaleProfileFunctions;
      StaleSampleCount += SampleCount;
      ++NumAllStaleFunctions;
    }
  }
  BC.NumProfiledFuncs = ProfiledFunctions.size();
  BC.NumStaleProfileFuncs = NumStaleProfileFunctions;

  const size_t NumAllProfiledFunctions =
      ProfiledFunctions.size() + NumStaleProfileFunctions;
  BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
            << NumRegularFunctions << " functions in the binary ("
            << format(Fmt: "%.1f", Vals: NumAllProfiledFunctions /
                                  (float)NumRegularFunctions * 100.0f)
            << "%) have non-empty execution profile\n";
  if (NumNonSimpleProfiledFunctions) {
    BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
              << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
              << " with profile could not be optimized\n";
  }
  // Stale-profile reporting: warn, or error out past the threshold.
  if (NumAllStaleFunctions) {
    const float PctStale =
        NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f;
    const float PctStaleFuncsWithEqualBlockCount =
        (float)BC.Stats.NumStaleFuncsWithEqualBlockCount /
        NumAllStaleFunctions * 100.0f;
    const float PctStaleBlocksWithEqualIcount =
        (float)BC.Stats.NumStaleBlocksWithEqualIcount /
        BC.Stats.NumStaleBlocks * 100.0f;
    auto printErrorOrWarning = [&]() {
      if (PctStale > opts::StaleThreshold)
        BC.errs() << "BOLT-ERROR: ";
      else
        BC.errs() << "BOLT-WARNING: ";
    };
    printErrorOrWarning();
    BC.errs() << NumAllStaleFunctions
              << format(Fmt: " (%.1f%% of all profiled)", Vals: PctStale) << " function"
              << (NumAllStaleFunctions == 1 ? "" : "s")
              << " have invalid (possibly stale) profile."
                 " Use -report-stale to see the list.\n";
    if (TotalSampleCount > 0) {
      printErrorOrWarning();
      BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of "
                << TotalSampleCount << " samples in the binary ("
                << format(Fmt: "%.1f",
                          Vals: ((100.0f * (StaleSampleCount + InferredSampleCount)) /
                           TotalSampleCount))
                << "%) belong to functions with invalid"
                   " (possibly stale) profile.\n";
    }
    BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
              << " stale function"
              << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
              << format(Fmt: " (%.1f%% of all stale)",
                        Vals: PctStaleFuncsWithEqualBlockCount)
              << " have matching block count.\n";
    BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
              << " stale block"
              << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
              << format(Fmt: " (%.1f%% of all stale)", Vals: PctStaleBlocksWithEqualIcount)
              << " have matching icount.\n";
    if (PctStale > opts::StaleThreshold) {
      return createFatalBOLTError(
          S: Twine("BOLT-ERROR: stale functions exceed specified threshold of ") +
          Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n"));
    }
  }
  if (NumInferredFunctions) {
    BC.outs() << format(
        Fmt: "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
        "%.2f%% of stale) functions responsible for %.2f%% samples"
        " (%zu out of %zu)\n",
        Vals: NumInferredFunctions,
        Vals: 100.0 * NumInferredFunctions / NumAllProfiledFunctions,
        Vals: 100.0 * NumInferredFunctions / NumAllStaleFunctions,
        Vals: 100.0 * InferredSampleCount / TotalSampleCount, Vals: InferredSampleCount,
        Vals: TotalSampleCount);
    BC.outs() << format(
        Fmt: "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
        " (%zu out of %zu stale) responsible for %.2f%% samples"
        " (%zu out of %zu stale)\n",
        Vals: 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
        Vals: BC.Stats.NumMatchedBlocks, Vals: BC.Stats.NumStaleBlocks,
        Vals: 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
        Vals: BC.Stats.MatchedSampleCount, Vals: BC.Stats.StaleSampleCount);
  }

  if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
    BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects
              << " objects was ignored\n";
  }

  // At -v=1, list the hottest functions by execution count.
  if (ProfiledFunctions.size() > 10) {
    if (opts::Verbosity >= 1) {
      BC.outs() << "BOLT-INFO: top called functions are:\n";
      // Sort descending by execution count.
      llvm::sort(C&: ProfiledFunctions,
                 Comp: [](const BinaryFunction *A, const BinaryFunction *B) {
                   return B->getExecutionCount() < A->getExecutionCount();
                 });
      auto SFI = ProfiledFunctions.begin();
      auto SFIend = ProfiledFunctions.end();
      for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
           ++SFI, ++I)
        BC.outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount()
                  << '\n';
    }
  }

  // Optional listing of top functions sorted by selected dyno stats.
  if (!opts::PrintSortedBy.empty()) {
    std::vector<BinaryFunction *> Functions;
    std::map<const BinaryFunction *, DynoStats> Stats;

    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (shouldOptimize(BF) && BF.hasValidProfile()) {
        Functions.push_back(x: &BF);
        Stats.emplace(args: &BF, args: getDynoStats(BF));
      }
    }

    // LAST_DYNO_STAT in the list means "sort by all categories at once".
    const bool SortAll =
        llvm::is_contained(Range&: opts::PrintSortedBy, Element: DynoStats::LAST_DYNO_STAT);

    const bool Ascending =
        opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;

    if (SortAll) {
      llvm::stable_sort(Range&: Functions,
                        C: [Ascending, &Stats](const BinaryFunction *A,
                                             const BinaryFunction *B) {
                          return Ascending ? Stats.at(k: A) < Stats.at(k: B)
                                           : Stats.at(k: B) < Stats.at(k: A);
                        });
    } else {
      llvm::stable_sort(
          Range&: Functions, C: [Ascending, &Stats](const BinaryFunction *A,
                                               const BinaryFunction *B) {
            const DynoStats &StatsA = Stats.at(k: A);
            const DynoStats &StatsB = Stats.at(k: B);
            return Ascending ? StatsA.lessThan(Other: StatsB, Keys: opts::PrintSortedBy)
                             : StatsB.lessThan(Other: StatsA, Keys: opts::PrintSortedBy);
          });
    }

    BC.outs() << "BOLT-INFO: top functions sorted by ";
    if (SortAll) {
      BC.outs() << "dyno stats";
    } else {
      BC.outs() << "(";
      bool PrintComma = false;
      for (const DynoStats::Category Category : opts::PrintSortedBy) {
        if (PrintComma)
          BC.outs() << ", ";
        BC.outs() << DynoStats::Description(C: Category);
        PrintComma = true;
      }
      BC.outs() << ")";
    }

    BC.outs() << " are:\n";
    auto SFI = Functions.begin();
    // Print at most the top 100 functions.
    for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
      const DynoStats Stats = getDynoStats(BF&: **SFI);
      BC.outs() << "  " << **SFI;
      if (!SortAll) {
        BC.outs() << " (";
        bool PrintComma = false;
        for (const DynoStats::Category Category : opts::PrintSortedBy) {
          if (PrintComma)
            BC.outs() << ", ";
          BC.outs() << dynoStatsOptName(C: Category) << "=" << Stats[Category];
          PrintComma = true;
        }
        BC.outs() << ")";
      }
      BC.outs() << "\n";
    }
  }

  // Warn about functions that will trap on entry (e.g. due to -trap-avx512).
  if (!BC.TrappedFunctions.empty()) {
    BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
              << (BC.TrappedFunctions.size() > 1 ? "s" : "")
              << " will trap on entry. Use -trap-avx512=0 to disable"
                 " traps.";
    if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
      BC.errs() << '\n';
      for (const BinaryFunction *Function : BC.TrappedFunctions)
        BC.errs() << "  " << *Function << '\n';
    } else {
      BC.errs() << " Use -v=1 to see the list.\n";
    }
  }

  // Print information on missed macro-fusion opportunities seen on input.
  if (BC.Stats.MissedMacroFusionPairs) {
    BC.outs() << format(
        Fmt: "BOLT-INFO: the input contains %zu (dynamic count : %zu)"
        " opportunities for macro-fusion optimization",
        Vals: BC.Stats.MissedMacroFusionPairs, Vals: BC.Stats.MissedMacroFusionExecCount);
    switch (opts::AlignMacroOpFusion) {
    case MFT_NONE:
      BC.outs() << ". Use -align-macro-fusion to fix.\n";
      break;
    case MFT_HOT:
      BC.outs() << ". Will fix instances on a hot path.\n";
      break;
    case MFT_ALL:
      BC.outs() << " that are going to be fixed\n";
      break;
    }
  }

  // Collect and print information about suboptimal code layout on input.
  if (opts::ReportBadLayout) {
    std::vector<BinaryFunction *> SuboptimalFuncs;
    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (!BF.hasValidProfile())
        continue;

      const uint64_t HotThreshold =
          std::max<uint64_t>(a: BF.getKnownExecutionCount(), b: 1);
      bool HotSeen = false;
      // Scan the layout in reverse: a zero-count block after a hot one
      // means cold code sits in the middle of hot code.
      for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
        if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
          HotSeen = true;
          continue;
        }
        if (HotSeen && BB->getKnownExecutionCount() == 0) {
          SuboptimalFuncs.push_back(x: &BF);
          break;
        }
      }
    }

    if (!SuboptimalFuncs.empty()) {
      // Sort by execution density (count per byte), hottest first.
      llvm::sort(C&: SuboptimalFuncs,
                 Comp: [](const BinaryFunction *A, const BinaryFunction *B) {
                   return A->getKnownExecutionCount() / A->getSize() >
                          B->getKnownExecutionCount() / B->getSize();
                 });

      BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
                << " functions have "
                   "cold code in the middle of hot code. Top functions are:\n";
      for (unsigned I = 0;
           I < std::min(a: static_cast<size_t>(opts::ReportBadLayout),
                        b: SuboptimalFuncs.size());
           ++I)
        SuboptimalFuncs[I]->print(OS&: BC.outs());
    }
  }

  if (NumUnknownControlFlowFunctions) {
    BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
              << " functions have instructions with unknown control flow";
    if (!opts::PrintUnknown)
      BC.outs() << ". Use -print-unknown to see the list.";
    BC.outs() << '\n';
  }
  return Error::success();
}
1680
1681Error InstructionLowering::runOnFunctions(BinaryContext &BC) {
1682 for (auto &BFI : BC.getBinaryFunctions())
1683 for (BinaryBasicBlock &BB : BFI.second)
1684 for (MCInst &Instruction : BB)
1685 BC.MIB->lowerTailCall(Inst&: Instruction);
1686 return Error::success();
1687}
1688
1689Error StripRepRet::runOnFunctions(BinaryContext &BC) {
1690 if (!BC.isX86())
1691 return Error::success();
1692
1693 uint64_t NumPrefixesRemoved = 0;
1694 uint64_t NumBytesSaved = 0;
1695 for (auto &BFI : BC.getBinaryFunctions()) {
1696 for (BinaryBasicBlock &BB : BFI.second) {
1697 auto LastInstRIter = BB.getLastNonPseudo();
1698 if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(Inst: *LastInstRIter) ||
1699 !BC.MIB->deleteREPPrefix(Inst&: *LastInstRIter))
1700 continue;
1701
1702 NumPrefixesRemoved += BB.getKnownExecutionCount();
1703 ++NumBytesSaved;
1704 }
1705 }
1706
1707 if (NumBytesSaved)
1708 BC.outs() << "BOLT-INFO: removed " << NumBytesSaved
1709 << " 'repz' prefixes"
1710 " with estimated execution count of "
1711 << NumPrefixesRemoved << " times.\n";
1712 return Error::success();
1713}
1714
1715Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1716 if (!BC.isX86())
1717 return Error::success();
1718
1719 uint64_t NumInlined = 0;
1720 uint64_t NumInlinedDyno = 0;
1721 for (auto &BFI : BC.getBinaryFunctions()) {
1722 for (BinaryBasicBlock &BB : BFI.second) {
1723 for (auto II = BB.begin(); II != BB.end(); ++II) {
1724 MCInst &Inst = *II;
1725
1726 if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1727 !Inst.getOperand(i: 0).isExpr())
1728 continue;
1729
1730 const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1731 if (CalleeSymbol->getName() != "memcpy" &&
1732 CalleeSymbol->getName() != "memcpy@PLT" &&
1733 CalleeSymbol->getName() != "_memcpy8")
1734 continue;
1735
1736 const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
1737 const bool IsTailCall = BC.MIB->isTailCall(Inst);
1738
1739 const InstructionListType NewCode =
1740 BC.MIB->createInlineMemcpy(ReturnEnd: IsMemcpy8);
1741 II = BB.replaceInstruction(II, Replacement: NewCode);
1742 std::advance(i&: II, n: NewCode.size() - 1);
1743 if (IsTailCall) {
1744 MCInst Return;
1745 BC.MIB->createReturn(Inst&: Return);
1746 II = BB.insertInstruction(At: std::next(x: II), NewInst: std::move(Return));
1747 }
1748
1749 ++NumInlined;
1750 NumInlinedDyno += BB.getKnownExecutionCount();
1751 }
1752 }
1753 }
1754
1755 if (NumInlined) {
1756 BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
1757 if (NumInlinedDyno)
1758 BC.outs() << ". The calls were executed " << NumInlinedDyno
1759 << " times based on profile.";
1760 BC.outs() << '\n';
1761 }
1762 return Error::success();
1763}
1764
1765bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1766 if (!BinaryFunctionPass::shouldOptimize(BF: Function))
1767 return false;
1768
1769 for (const std::string &FunctionSpec : Spec) {
1770 StringRef FunctionName = StringRef(FunctionSpec).split(Separator: ':').first;
1771 if (Function.hasNameRegex(NameRegex: FunctionName))
1772 return true;
1773 }
1774
1775 return false;
1776}
1777
1778std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1779 const BinaryFunction &Function) const {
1780 StringRef SitesString;
1781 for (const std::string &FunctionSpec : Spec) {
1782 StringRef FunctionName;
1783 std::tie(args&: FunctionName, args&: SitesString) = StringRef(FunctionSpec).split(Separator: ':');
1784 if (Function.hasNameRegex(NameRegex: FunctionName))
1785 break;
1786 SitesString = "";
1787 }
1788
1789 std::set<size_t> Sites;
1790 SmallVector<StringRef, 4> SitesVec;
1791 SitesString.split(A&: SitesVec, Separator: ':');
1792 for (StringRef SiteString : SitesVec) {
1793 if (SiteString.empty())
1794 continue;
1795 size_t Result;
1796 if (!SiteString.getAsInteger(Radix: 10, Result))
1797 Sites.emplace(args&: Result);
1798 }
1799
1800 return Sites;
1801}
1802
// Rewrite selected memcpy() call sites so that a size of exactly 1 takes an
// inline one-byte-copy fast path, falling back to the real call otherwise.
Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  // The specialized code sequence is implemented for x86 only.
  if (!BC.isX86())
    return Error::success();

  uint64_t NumSpecialized = 0;
  uint64_t NumSpecializedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    // An empty set means "specialize every memcpy call site in this
    // function"; otherwise only the listed 1-based site IDs are rewritten.
    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(x: N);
    };

    // Snapshot the block list: splitAt()/addBasicBlock() below add blocks
    // to the function while we iterate.
    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        // Candidates are direct calls with a single symbolic target operand.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(i: 0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT")
          continue;

        // A tail call has no fall-through point to resume at, so this CFG
        // transformation does not apply.
        if (BC.MIB->isTailCall(Inst))
          continue;

        // Site IDs are counted over all candidate (non-tail) memcpy calls,
        // including ones that end up not being rewritten.
        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        // Split at the call: the call and everything after it move into
        // OneByteMemcpyBB, which will become the size==1 fast path.
        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          // Instructions follow the call: split them off into NextBB, then
          // drop the original call from the fast-path block.
          NextBB = OneByteMemcpyBB->splitAt(II: OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(II: NextBB->begin());
        } else {
          // The call was the last instruction: execution resumes at the
          // block's single successor after the call is dropped.
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(II: OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return");
        }

        // New block holding the original (generic) call, taken when the
        // size is not 1.
        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        // Compare the size argument (third integer argument register)
        // against 1 and branch to the one-byte fast path on equality.
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(RegNo: BC.MIB->getIntArgRegister(ArgNo: 2), Imm: 1,
                                Target: OneByteMemcpyBB->getLabel(), Ctx: BC.Ctx.get());
        CurBB->addInstructions(R: CmpJCC);
        CurBB->addSuccessor(Succ: MemcpyBB);

        MemcpyBB->addInstruction(Inst: std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(Succ: NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        // Populate the fast path with an inline one-byte copy.
        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(R: OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        // Resume scanning in the block that follows the rewritten site.
        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    BC.outs() << "BOLT-INFO: specialized " << NumSpecialized
              << " memcpy() call sites for size 1";
    if (NumSpecializedDyno)
      BC.outs() << ". The calls were executed " << NumSpecializedDyno
                << " times based on profile.";
    BC.outs() << '\n';
  }
  return Error::success();
}
1899
1900void RemoveNops::runOnFunction(BinaryFunction &BF) {
1901 const BinaryContext &BC = BF.getBinaryContext();
1902 for (BinaryBasicBlock &BB : BF) {
1903 for (int64_t I = BB.size() - 1; I >= 0; --I) {
1904 MCInst &Inst = BB.getInstructionAtIndex(Index: I);
1905 if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, Name: "NOP"))
1906 BB.eraseInstructionAtIndex(Index: I);
1907 }
1908 }
1909}
1910
1911Error RemoveNops::runOnFunctions(BinaryContext &BC) {
1912 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1913 runOnFunction(BF);
1914 };
1915
1916 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1917 return BF.shouldPreserveNops();
1918 };
1919
1920 ParallelUtilities::runOnEachFunction(
1921 BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFunction: WorkFun,
1922 SkipPredicate: SkipFunc, LogName: "RemoveNops");
1923 return Error::success();
1924}
1925
1926} // namespace bolt
1927} // namespace llvm
1928
// Source: bolt/lib/Passes/BinaryPasses.cpp