1 | //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements multiple passes for binary optimization and analysis. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "bolt/Passes/BinaryPasses.h" |
14 | #include "bolt/Core/FunctionLayout.h" |
15 | #include "bolt/Core/ParallelUtilities.h" |
16 | #include "bolt/Passes/ReorderAlgorithm.h" |
17 | #include "bolt/Passes/ReorderFunctions.h" |
18 | #include "llvm/Support/CommandLine.h" |
19 | #include <atomic> |
20 | #include <mutex> |
21 | #include <numeric> |
22 | #include <vector> |
23 | |
24 | #define DEBUG_TYPE "bolt-opts" |
25 | |
26 | using namespace llvm; |
27 | using namespace bolt; |
28 | |
29 | static const char *dynoStatsOptName(const bolt::DynoStats::Category C) { |
30 | assert(C > bolt::DynoStats::FIRST_DYNO_STAT && |
31 | C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category." ); |
32 | |
33 | static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1]; |
34 | |
35 | OptNames[C] = bolt::DynoStats::Description(C); |
36 | |
37 | std::replace(first: OptNames[C].begin(), last: OptNames[C].end(), old_value: ' ', new_value: '-'); |
38 | |
39 | return OptNames[C].c_str(); |
40 | } |
41 | |
namespace opts {

extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;

// Options owned by other BOLT translation units and consulted by the passes
// in this file.
extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> EnableBAT;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<bool> UpdateDebugSections;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;

// Sort direction for the -print-sorted-by report.
enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));

// Non-static: referenced by isHotTextMover() and potentially by other TUs.
cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));
75 | bool isHotTextMover(const BinaryFunction &Function) { |
76 | for (std::string &SectionName : opts::HotTextMoveSections) { |
77 | if (Function.getOriginSectionName() && |
78 | *Function.getOriginSectionName() == SectionName) |
79 | return true; |
80 | } |
81 | |
82 | return false; |
83 | } |
84 | |
static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<unsigned>
    PrintFuncStat("print-function-statistics",
                  cl::desc("print statistics about basic block ordering"),
                  cl::init(0), cl::cat(BoltOptCategory));

static cl::opt<bool> PrintLargeFunctions(
    "print-large-functions",
    cl::desc("print functions that could not be overwritten due to excessive "
             "size"),
    cl::init(false), cl::cat(BoltOptCategory));

// One enum value per dyno-stats category, generated by expanding the D()
// macro over REAL_DYNO_STATS; option names come from dynoStatsOptName().
static cl::list<bolt::DynoStats::Category>
    PrintSortedBy("print-sorted-by", cl::CommaSeparated,
                  cl::desc("print functions sorted by order of dyno stats"),
                  cl::value_desc("key1,key2,key3,..."),
                  cl::values(
#define D(name, description, ...)                                             \
  clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),  \
             description),
                      REAL_DYNO_STATS
#undef D
                      clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
                                 "sorted by all names")),
                  cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
    PrintUnknown("print-unknown",
                 cl::desc("print names of functions with unknown control flow"),
                 cl::cat(BoltCategory), cl::Hidden);

static cl::opt<bool>
    PrintUnknownCFG("print-unknown-cfg",
                    cl::desc("dump CFG of functions with unknown control flow"),
                    cl::cat(BoltCategory), cl::ReallyHidden);

// Please MSVC19 with a forward declaration: otherwise it reports an error about
// an undeclared variable inside a callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    // "cache+" is a deprecated alias: warn and remap it to ext-tsp as soon as
    // the option value is parsed.
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
               << " use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
    ReportStaleFuncs("report-stale",
                     cl::desc("print the list of functions with stale profile"),
                     cl::Hidden, cl::cat(BoltOptCategory));

// Policies for simplify-conditional-tail-calls (see shouldRewriteBranch()).
enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
             clEnumValN(SctcPreserveDirection,
                        "preserve",
                        "only perform sctc when branch direction is "
                        "preserved"),
             clEnumValN(SctcHeuristic,
                        "heuristic",
                        "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
StaleThreshold("stale-threshold",
  cl::desc(
    "maximum percentage of stale functions to tolerate (default: 100)"),
  cl::init(100),
  cl::Hidden,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));

} // namespace opts
228 | |
229 | namespace llvm { |
230 | namespace bolt { |
231 | |
232 | bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const { |
233 | return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG && |
234 | !BF.isIgnored(); |
235 | } |
236 | |
237 | bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { |
238 | return BF.isSimple() && !BF.isIgnored(); |
239 | } |
240 | |
void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  uint64_t NeedsFixBranches = 0;
  // Pass 1: eliminate empty basic blocks by rerouting their predecessors
  // directly to their single successor.
  for (BinaryBasicBlock &BB : BF) {
    if (!BB.empty())
      continue;

    // Entry points and landing pads are externally referenced and must stay.
    if (BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      // Jump-table targets cannot be rewritten here; give up on this block.
      if (Predecessor->hasJumpTable())
        break;

      // Rerouting a self-loop through BB would create Predecessor->Predecessor;
      // leave it for fixBranches().
      if (Predecessor == Successor)
        break;

      // Carry the edge profile over to the new direct edge.
      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
      Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
                                    BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    // Only remove the block if every predecessor was successfully redirected.
    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  // Accumulate per-function counts into the pass-wide statistics.
  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}
309 | |
310 | Error NormalizeCFG::runOnFunctions(BinaryContext &BC) { |
311 | ParallelUtilities::runOnEachFunction( |
312 | BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, |
313 | WorkFunction: [&](BinaryFunction &BF) { runOnFunction(BF); }, |
314 | SkipPredicate: [&](const BinaryFunction &BF) { return !shouldOptimize(BF); }, |
315 | LogName: "NormalizeCFG" ); |
316 | if (NumBlocksRemoved) |
317 | BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block" |
318 | << (NumBlocksRemoved == 1 ? "" : "s" ) << '\n'; |
319 | if (NumDuplicateEdgesMerged) |
320 | BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged |
321 | << " duplicate CFG edge" |
322 | << (NumDuplicateEdgesMerged == 1 ? "" : "s" ) << '\n'; |
323 | return Error::success(); |
324 | } |
325 | |
void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
  BinaryContext &BC = Function.getBinaryContext();
  unsigned Count;
  uint64_t Bytes;
  // Flag blocks unreachable from any entry point as invalid.
  Function.markUnreachableBlocks();
  LLVM_DEBUG({
    for (BinaryBasicBlock &BB : Function) {
      if (!BB.isValid()) {
        dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
               << " in function " << Function << "\n";
        Function.dump();
      }
    }
  });
  // Use a private code emitter: this function runs in parallel across
  // functions, and emitters cannot be shared between threads.
  BinaryContext::IndependentCodeEmitter Emitter =
      BC.createIndependentMCCodeEmitter();
  std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
  DeletedBlocks += Count;
  DeletedBytes += Bytes;
  if (Count) {
    // Lock before touching the shared Modified set and the output stream.
    auto L = BC.scopeLock();
    Modified.insert(&Function);
    if (opts::Verbosity > 0)
      BC.outs() << "BOLT-INFO: removed " << Count
                << " dead basic block(s) accounting for " << Bytes
                << " bytes in function " << Function << '\n';
  }
}
354 | |
355 | Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { |
356 | ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
357 | runOnFunction(Function&: BF); |
358 | }; |
359 | |
360 | ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { |
361 | return !shouldOptimize(BF) || BF.getLayout().block_empty(); |
362 | }; |
363 | |
364 | ParallelUtilities::runOnEachFunction( |
365 | BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFunction: WorkFun, |
366 | SkipPredicate, LogName: "elimininate-unreachable" ); |
367 | |
368 | if (DeletedBlocks) |
369 | BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " |
370 | << DeletedBytes << " bytes of code\n" ; |
371 | return Error::success(); |
372 | } |
373 | |
374 | bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { |
375 | return (BinaryFunctionPass::shouldPrint(BF) && |
376 | opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); |
377 | } |
378 | |
379 | bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const { |
380 | // Apply execution count threshold |
381 | if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) |
382 | return false; |
383 | |
384 | return BinaryFunctionPass::shouldOptimize(BF); |
385 | } |
386 | |
387 | Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) { |
388 | if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) |
389 | return Error::success(); |
390 | |
391 | std::atomic_uint64_t ModifiedFuncCount(0); |
392 | std::mutex FunctionEditDistanceMutex; |
393 | DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance; |
394 | |
395 | ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
396 | SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder; |
397 | if (opts::PrintFuncStat > 0) |
398 | llvm::copy(Range: BF.getLayout().blocks(), Out: std::back_inserter(x&: OldBlockOrder)); |
399 | |
400 | const bool LayoutChanged = |
401 | modifyFunctionLayout(Function&: BF, Type: opts::ReorderBlocks, MinBranchClusters: opts::MinBranchClusters); |
402 | if (LayoutChanged) { |
403 | ModifiedFuncCount.fetch_add(i: 1, m: std::memory_order_relaxed); |
404 | if (opts::PrintFuncStat > 0) { |
405 | const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder); |
406 | std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex); |
407 | FunctionEditDistance[&BF] = Distance; |
408 | } |
409 | } |
410 | }; |
411 | |
412 | ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { |
413 | return !shouldOptimize(BF); |
414 | }; |
415 | |
416 | ParallelUtilities::runOnEachFunction( |
417 | BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFunction: WorkFun, SkipPredicate: SkipFunc, |
418 | LogName: "ReorderBasicBlocks" ); |
419 | const size_t NumAllProfiledFunctions = |
420 | BC.NumProfiledFuncs + BC.NumStaleProfileFuncs; |
421 | |
422 | BC.outs() << "BOLT-INFO: basic block reordering modified layout of " |
423 | << format( |
424 | Fmt: "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n" , |
425 | Vals: ModifiedFuncCount.load(m: std::memory_order_relaxed), |
426 | Vals: 100.0 * ModifiedFuncCount.load(m: std::memory_order_relaxed) / |
427 | NumAllProfiledFunctions, |
428 | Vals: 100.0 * ModifiedFuncCount.load(m: std::memory_order_relaxed) / |
429 | BC.getBinaryFunctions().size()); |
430 | |
431 | if (opts::PrintFuncStat > 0) { |
432 | raw_ostream &OS = BC.outs(); |
433 | // Copy all the values into vector in order to sort them |
434 | std::map<uint64_t, BinaryFunction &> ScoreMap; |
435 | auto &BFs = BC.getBinaryFunctions(); |
436 | for (auto It = BFs.begin(); It != BFs.end(); ++It) |
437 | ScoreMap.insert(x: std::pair<uint64_t, BinaryFunction &>( |
438 | It->second.getFunctionScore(), It->second)); |
439 | |
440 | OS << "\nBOLT-INFO: Printing Function Statistics:\n\n" ; |
441 | OS << " There are " << BFs.size() << " functions in total. \n" ; |
442 | OS << " Number of functions being modified: " |
443 | << ModifiedFuncCount.load(m: std::memory_order_relaxed) << "\n" ; |
444 | OS << " User asks for detailed information on top " |
445 | << opts::PrintFuncStat << " functions. (Ranked by function score)" |
446 | << "\n\n" ; |
447 | uint64_t I = 0; |
448 | for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit = |
449 | ScoreMap.rbegin(); |
450 | Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) { |
451 | BinaryFunction &Function = Rit->second; |
452 | |
453 | OS << " Information for function of top: " << (I + 1) << ": \n" ; |
454 | OS << " Function Score is: " << Function.getFunctionScore() |
455 | << "\n" ; |
456 | OS << " There are " << Function.size() |
457 | << " number of blocks in this function.\n" ; |
458 | OS << " There are " << Function.getInstructionCount() |
459 | << " number of instructions in this function.\n" ; |
460 | OS << " The edit distance for this function is: " |
461 | << FunctionEditDistance.lookup(Val: &Function) << "\n\n" ; |
462 | } |
463 | } |
464 | return Error::success(); |
465 | } |
466 | |
/// Reorder the basic blocks of \p BF according to layout \p Type.
/// Returns true if the function's block order actually changed.
bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
                                              LayoutType Type,
                                              bool MinBranchClusters) const {
  if (BF.size() == 0 || Type == LT_NONE)
    return false;

  BinaryFunction::BasicBlockOrderType NewLayout;
  std::unique_ptr<ReorderAlgorithm> Algo;

  // Cannot do optimal layout without profile.
  if (Type != LT_REVERSE && !BF.hasValidProfile())
    return false;

  if (Type == LT_REVERSE) {
    Algo.reset(new ReverseReorderAlgorithm());
  } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
    // Work on optimal solution if problem is small enough
    LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
    Algo.reset(new TSPReorderAlgorithm());
  } else {
    LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");

    // The heuristic algorithms share a pluggable clustering stage.
    std::unique_ptr<ClusterAlgorithm> CAlgo;
    if (MinBranchClusters)
      CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
    else
      CAlgo.reset(new PHGreedyClusterAlgorithm());

    switch (Type) {
    case LT_OPTIMIZE:
      Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_BRANCH:
      Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_CACHE:
      Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_EXT_TSP:
      Algo.reset(new ExtTSPReorderAlgorithm());
      break;

    case LT_OPTIMIZE_SHUFFLE:
      Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
      break;

    default:
      // NOTE(review): LT_OPTIMIZE_CACHE_PLUS appears to be remapped to
      // ext-tsp by the -reorder-blocks option callback, so it should not
      // reach here via the command line — confirm no programmatic caller
      // passes it directly.
      llvm_unreachable("unexpected layout type");
    }
  }

  Algo->reorderBasicBlocks(BF, NewLayout);

  // update() returns whether the new order differs from the current one.
  return BF.getLayout().update(NewLayout);
}
525 | |
526 | Error FixupBranches::runOnFunctions(BinaryContext &BC) { |
527 | for (auto &It : BC.getBinaryFunctions()) { |
528 | BinaryFunction &Function = It.second; |
529 | if (!BC.shouldEmit(Function) || !Function.isSimple()) |
530 | continue; |
531 | |
532 | Function.fixBranches(); |
533 | } |
534 | return Error::success(); |
535 | } |
536 | |
Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
  std::atomic<bool> HasFatal{false};
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    if (!BF.finalizeCFIState()) {
      // In relocation mode a broken CFI state cannot be papered over by
      // keeping the original function body, so this is fatal.
      if (BC.HasRelocations) {
        BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
                  << ". Exiting.\n";
        HasFatal = true;
        return;
      }
      // Non-relocation mode: demote the function so the original stays.
      BF.setSimple(false);
      return;
    }

    BF.setFinalized();

    // Update exception handling information.
    BF.updateEHRanges();
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !BC.shouldEmit(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "FinalizeFunctions");
  // Report the fatal error only after all workers have finished.
  if (HasFatal)
    return createFatalBOLTError("finalize CFI state failure");
  return Error::success();
}
568 | |
569 | Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) { |
570 | if (BC.HasRelocations) |
571 | return Error::success(); |
572 | |
573 | // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit |
574 | // incorrect meta data. |
575 | ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
576 | uint64_t HotSize, ColdSize; |
577 | std::tie(args&: HotSize, args&: ColdSize) = |
578 | BC.calculateEmittedSize(BF, /*FixBranches=*/false); |
579 | if (HotSize > BF.getMaxSize()) { |
580 | if (opts::PrintLargeFunctions) |
581 | BC.outs() << "BOLT-INFO: " << BF << " size exceeds allocated space by " |
582 | << (HotSize - BF.getMaxSize()) << " bytes\n" ; |
583 | BF.setSimple(false); |
584 | } |
585 | }; |
586 | |
587 | ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { |
588 | return !shouldOptimize(BF); |
589 | }; |
590 | |
591 | ParallelUtilities::runOnEachFunction( |
592 | BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFunction: WorkFun, |
593 | SkipPredicate: SkipFunc, LogName: "CheckLargeFunctions" ); |
594 | |
595 | return Error::success(); |
596 | } |
597 | |
598 | bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { |
599 | // Unlike other passes, allow functions in non-CFG state. |
600 | return BF.isSimple() && !BF.isIgnored(); |
601 | } |
602 | |
Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  // Convert GnuArgsSize annotations into CFIs.
  for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
    for (FunctionFragment &FF : BF->getLayout().fragments()) {
      // Reset at the start of the new fragment.
      int64_t CurrentGnuArgsSize = 0;

      for (BinaryBasicBlock *const BB : FF) {
        for (auto II = BB->begin(); II != BB->end(); ++II) {
          // Only invoke instructions in functions that use GNU_args_size
          // carry the annotation.
          if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II))
            continue;

          const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
          assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size.");
          // Emit a CFI only when the value changes within the fragment.
          if (NewGnuArgsSize == CurrentGnuArgsSize)
            continue;

          auto InsertII = BF->addCFIInstruction(
              BB, II,
              MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
          CurrentGnuArgsSize = NewGnuArgsSize;
          // Insertion invalidates II; resume iteration right after the newly
          // inserted CFI instruction.
          II = std::next(InsertII);
        }
      }
    }
  }
  return Error::success();
}
631 | |
// Check for dirty state in MCSymbol objects that might be a consequence
// of running calculateEmittedSize() in parallel, during split functions
// pass. If an inconsistent state is found (symbol already registered or
// already defined), clean it.
Error CleanMCState::runOnFunctions(BinaryContext &BC) {
  MCContext &Ctx = *BC.Ctx;
  for (const auto &SymMapEntry : Ctx.getSymbols()) {
    const MCSymbol *S = SymMapEntry.second;
    // Reset "defined" left over from a speculative emission.
    if (S->isDefined()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already defined\n");
      // const_cast is required: the symbol table only hands out const
      // pointers, but the stale state must be mutated in place.
      const_cast<MCSymbol *>(S)->setUndefined();
    }
    // Likewise reset the "registered" flag so re-emission can register it.
    if (S->isRegistered()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already registered\n");
      const_cast<MCSymbol *>(S)->setIsRegistered(false);
    }
    // Variable symbols are only reported, not modified.
    LLVM_DEBUG(if (S->isVariable()) {
      dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
    });
  }
  return Error::success();
}
656 | |
// This peephole fixes jump instructions that jump to another basic
// block with a single jump instruction, e.g.
//
//     B0: ...
//         jmp  B1   (or jcc B1)
//
//     B1: jmp  B2
//
// ->
//
//     B0: ...
//         jmp  B2   (or jcc B2)
//
// Returns the number of double jumps removed. If \p MarkInvalid is set,
// intermediate blocks left with no predecessors are marked invalid so a
// later cleanup can erase them.
static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    // Try to redirect Pred's branch from BB straight to Succ/SuccSym.
    // Succ is null when BB's single instruction is a tail call; in that case
    // SuccSym is the tail-call target. Returns true on success.
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          // Pred reached BB by fallthrough; an explicit jump to the new
          // successor is required unless Succ is the fallthrough block.
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case. In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    // BB must consist of exactly one real instruction; landing pads are
    // referenced by exception tables and cannot be bypassed.
    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    // Need either a real successor (plain jump) or a target symbol
    // (tail call) to redirect to.
    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    // Copy the predecessor list: checkAndPatch() mutates BB's predecessors
    // while we iterate.
    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}
768 | |
/// Decide whether the conditional branch in \p PredBB targeting \p BB may be
/// rewritten into a conditional tail call. \p DirectionFlag is the direction
/// (forward/backward) of the prospective tail call; the decision policy is
/// selected by -sctc-mode.
bool SimplifyConditionalTailCalls::shouldRewriteBranch(
    const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
    const BinaryBasicBlock *BB, const bool DirectionFlag) {
  // Never rewrite the same predecessor twice.
  if (BeenOptimized.count(PredBB))
    return false;

  const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);

  // Statistics about the original branch directions.
  if (IsForward)
    ++NumOrigForwardBranches;
  else
    ++NumOrigBackwardBranches;

  if (opts::SctcMode == opts::SctcAlways)
    return true;

  // "preserve" mode: rewrite only if the new tail call keeps the original
  // branch direction.
  if (opts::SctcMode == opts::SctcPreserveDirection)
    return IsForward == DirectionFlag;

  // "heuristic" mode: consult the edge's profile data.
  const ErrorOr<std::pair<double, double>> Frequency =
      PredBB->getBranchStats(BB);

  // It's ok to rewrite the conditional branch if the new target will be
  // a backward branch.

  // If no data available for these branches, then it should be ok to
  // do the optimization since it will reduce code size.
  if (Frequency.getError())
    return true;

  // TODO: should this use misprediction frequency instead?
  const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
                      (!IsForward && Frequency.get().first <= 0.5);

  return Result == DirectionFlag;
}
805 | |
// Replace "conditional branch to a basic block containing only a direct tail
// call" with a conditional tail call (CTC) in the predecessor, removing the
// intermediate jump. Returns nonzero iff at least one CTC was created in BF.
uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
  // Need updated indices to correctly detect branch' direction.
  BF.getLayout().updateLayoutIndices();
  BF.markUnreachableBlocks();

  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  MCContext *Ctx = BF.getBinaryContext().Ctx.get();
  // Per-function statistics; folded into the pass-wide counters at the end.
  uint64_t NumLocalCTCCandidates = 0;
  uint64_t NumLocalCTCs = 0;
  uint64_t LocalCTCTakenCount = 0;
  uint64_t LocalCTCExecCount = 0;
  // Predecessors whose conditional branch was reversed and therefore still
  // need their old fall-through target restored via an unconditional branch.
  // Processed in a second phase below.
  std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
      NeedsUncondBranch;

  // Will block be deleted by UCE?
  auto isValid = [](const BinaryBasicBlock *BB) {
    return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
  };

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Locate BB with a single direct tail-call instruction.
    if (BB->getNumNonPseudos() != 1)
      continue;

    MCInst *Instr = BB->getFirstNonPseudoInstr();
    if (!MIB->isTailCall(Inst: *Instr) || MIB->isConditionalBranch(Inst: *Instr))
      continue;

    const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(Inst: *Instr);
    if (!CalleeSymbol)
      continue;

    // Detect direction of the possible conditional tail call.
    const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);

    // Iterate through all predecessors.
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(Condition: true);
      if (!CondSucc)
        continue;

      ++NumLocalCTCCandidates;

      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

      // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
      if (!Result) {
        LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
                   PredBB->dump());
        continue;
      }

      assert(Result && "internal error analyzing conditional branch");
      assert(CondBranch && "conditional branch expected");

      // Skip dynamic branches for now.
      if (BF.getBinaryContext().MIB->isDynamicBranch(Inst: *CondBranch))
        continue;

      // It's possible that PredBB is also a successor to BB that may have
      // been processed by a previous iteration of the SCTC loop, in which
      // case it may have been marked invalid. We should skip rewriting in
      // this case.
      if (!PredBB->isValid()) {
        assert(PredBB->isSuccessor(BB) &&
               "PredBB should be valid if it is not a successor to BB");
        continue;
      }

      // We don't want to reverse direction of the branch in new order
      // without further profile analysis.
      const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
      if (!shouldRewriteBranch(PredBB, CondBranch: *CondBranch, BB, DirectionFlag))
        continue;

      // Record this block so that we don't try to optimize it twice.
      BeenOptimized.insert(x: PredBB);

      uint64_t Count = 0;
      if (CondSucc != BB) {
        // BB is reached when the condition is false: reverse the condition so
        // the conditional branch targets the callee directly.
        // Patch the new target address into the conditional branch.
        MIB->reverseBranchCondition(Inst&: *CondBranch, TBB: CalleeSymbol, Ctx);
        // Since we reversed the condition on the branch we need to change
        // the target for the unconditional branch or add a unconditional
        // branch to the old target. This has to be done manually since
        // fixupBranches is not called after SCTC.
        NeedsUncondBranch.emplace_back(args&: PredBB, args&: CondSucc);
        Count = PredBB->getFallthroughBranchInfo().Count;
      } else {
        // Change destination of the conditional branch.
        MIB->replaceBranchTarget(Inst&: *CondBranch, TBB: CalleeSymbol, Ctx);
        Count = PredBB->getTakenBranchInfo().Count;
      }
      // Treat a missing profile as a zero taken count for stats purposes.
      const uint64_t CTCTakenFreq =
          Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;

      // Annotate it, so "isCall" returns true for this jcc
      MIB->setConditionalTailCall(Inst&: *CondBranch);
      // Add info about the conditional tail call frequency, otherwise this
      // info will be lost when we delete the associated BranchInfo entry
      auto &CTCAnnotation =
          MIB->getOrCreateAnnotationAs<uint64_t>(Inst&: *CondBranch, Name: "CTCTakenCount");
      CTCAnnotation = CTCTakenFreq;

      // Remove the unused successor which may be eliminated later
      // if there are no other users.
      PredBB->removeSuccessor(Succ: BB);
      // Update BB execution count: subtract the flow that now takes the new
      // CTC and bypasses BB, saturating at zero.
      if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
        BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
      else if (CTCTakenFreq > BB->getKnownExecutionCount())
        BB->setExecutionCount(0);

      ++NumLocalCTCs;
      LocalCTCTakenCount += CTCTakenFreq;
      LocalCTCExecCount += PredBB->getKnownExecutionCount();
    }

    // Remove the block from CFG if all predecessors were removed.
    BB->markValid(Valid: isValid(BB));
  }

  // Second phase.
  // Add unconditional branches at the end of BBs to new successors
  // as long as the successor is not a fallthrough.
  for (auto &Entry : NeedsUncondBranch) {
    BinaryBasicBlock *PredBB = Entry.first;
    const BinaryBasicBlock *CondSucc = Entry.second;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // Find the next valid block. Invalid blocks will be deleted
    // so they shouldn't be considered fallthrough targets.
    const BinaryBasicBlock *NextBlock =
        BF.getLayout().getBasicBlockAfter(BB: PredBB, IgnoreSplits: false);
    while (NextBlock && !isValid(NextBlock))
      NextBlock = BF.getLayout().getBasicBlockAfter(BB: NextBlock, IgnoreSplits: false);

    // Get the unconditional successor to this block.
    const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
    assert(PredSucc && "The other branch should be a tail call");

    const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);

    // If the old target is the layout fall-through, any unconditional branch
    // is redundant; otherwise it must be retargeted (or created).
    if (UncondBranch) {
      if (HasFallthrough)
        PredBB->eraseInstruction(II: PredBB->findInstruction(Inst: UncondBranch));
      else
        MIB->replaceBranchTarget(Inst&: *UncondBranch, TBB: CondSucc->getLabel(), Ctx);
    } else if (!HasFallthrough) {
      MCInst Branch;
      MIB->createUncondBranch(Inst&: Branch, TBB: CondSucc->getLabel(), Ctx);
      PredBB->addInstruction(Inst: Branch);
    }
  }

  if (NumLocalCTCs > 0) {
    NumDoubleJumps += fixDoubleJumps(Function&: BF, MarkInvalid: true);
    // Clean-up unreachable tail-call blocks.
    const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
    DeletedBlocks += Stats.first;
    DeletedBytes += Stats.second;

    assert(BF.validateCFG());
  }

  LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
                    << " conditional tail calls from a total of "
                    << NumLocalCTCCandidates << " candidates in function " << BF
                    << ". CTCs execution count for this function is "
                    << LocalCTCExecCount << " and CTC taken count is "
                    << LocalCTCTakenCount << "\n";);

  // Fold per-function statistics into the pass-wide counters.
  NumTailCallsPatched += NumLocalCTCs;
  NumCandidateTailCalls += NumLocalCTCCandidates;
  CTCExecCount += LocalCTCExecCount;
  CTCTakenCount += LocalCTCTakenCount;

  return NumLocalCTCs > 0;
}
993 | |
994 | Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) { |
995 | if (!BC.isX86()) |
996 | return Error::success(); |
997 | |
998 | for (auto &It : BC.getBinaryFunctions()) { |
999 | BinaryFunction &Function = It.second; |
1000 | |
1001 | if (!shouldOptimize(BF: Function)) |
1002 | continue; |
1003 | |
1004 | if (fixTailCalls(BF&: Function)) { |
1005 | Modified.insert(x: &Function); |
1006 | Function.setHasCanonicalCFG(false); |
1007 | } |
1008 | } |
1009 | |
1010 | if (NumTailCallsPatched) |
1011 | BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched |
1012 | << " tail calls (" << NumOrigForwardBranches << " forward)" |
1013 | << " tail calls (" << NumOrigBackwardBranches << " backward)" |
1014 | << " from a total of " << NumCandidateTailCalls |
1015 | << " while removing " << NumDoubleJumps << " double jumps" |
1016 | << " and removing " << DeletedBlocks << " basic blocks" |
1017 | << " totalling " << DeletedBytes |
1018 | << " bytes of code. CTCs total execution count is " |
1019 | << CTCExecCount << " and the number of times CTCs are taken is " |
1020 | << CTCTakenCount << "\n" ; |
1021 | return Error::success(); |
1022 | } |
1023 | |
1024 | uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) { |
1025 | uint64_t Count = 0; |
1026 | const BinaryContext &BC = Function.getBinaryContext(); |
1027 | for (BinaryBasicBlock &BB : Function) { |
1028 | for (MCInst &Inst : BB) { |
1029 | // Skip shortening instructions with Size annotation. |
1030 | if (BC.MIB->getSize(Inst)) |
1031 | continue; |
1032 | |
1033 | MCInst OriginalInst; |
1034 | if (opts::Verbosity > 2) |
1035 | OriginalInst = Inst; |
1036 | |
1037 | if (!BC.MIB->shortenInstruction(Inst, STI: *BC.STI)) |
1038 | continue; |
1039 | |
1040 | if (opts::Verbosity > 2) { |
1041 | BC.scopeLock(); |
1042 | BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO: " ; |
1043 | BC.printInstruction(OS&: BC.outs(), Instruction: OriginalInst, Offset: 0, Function: &Function); |
1044 | BC.outs() << "BOLT-INFO: to:" ; |
1045 | BC.printInstruction(OS&: BC.outs(), Instruction: Inst, Offset: 0, Function: &Function); |
1046 | } |
1047 | |
1048 | ++Count; |
1049 | } |
1050 | } |
1051 | |
1052 | return Count; |
1053 | } |
1054 | |
1055 | Error ShortenInstructions::runOnFunctions(BinaryContext &BC) { |
1056 | std::atomic<uint64_t> NumShortened{0}; |
1057 | if (!BC.isX86()) |
1058 | return Error::success(); |
1059 | |
1060 | ParallelUtilities::runOnEachFunction( |
1061 | BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, |
1062 | WorkFunction: [&](BinaryFunction &BF) { NumShortened += shortenInstructions(Function&: BF); }, |
1063 | SkipPredicate: nullptr, LogName: "ShortenInstructions" ); |
1064 | |
1065 | if (NumShortened) |
1066 | BC.outs() << "BOLT-INFO: " << NumShortened |
1067 | << " instructions were shortened\n" ; |
1068 | return Error::success(); |
1069 | } |
1070 | |
1071 | void Peepholes::addTailcallTraps(BinaryFunction &Function) { |
1072 | MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); |
1073 | for (BinaryBasicBlock &BB : Function) { |
1074 | MCInst *Inst = BB.getLastNonPseudoInstr(); |
1075 | if (Inst && MIB->isTailCall(Inst: *Inst) && MIB->isIndirectBranch(Inst: *Inst)) { |
1076 | MCInst Trap; |
1077 | MIB->createTrap(Inst&: Trap); |
1078 | BB.addInstruction(Inst: Trap); |
1079 | ++TailCallTraps; |
1080 | } |
1081 | } |
1082 | } |
1083 | |
1084 | void Peepholes::removeUselessCondBranches(BinaryFunction &Function) { |
1085 | for (BinaryBasicBlock &BB : Function) { |
1086 | if (BB.succ_size() != 2) |
1087 | continue; |
1088 | |
1089 | BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(Condition: true); |
1090 | BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(Condition: false); |
1091 | if (CondBB != UncondBB) |
1092 | continue; |
1093 | |
1094 | const MCSymbol *TBB = nullptr; |
1095 | const MCSymbol *FBB = nullptr; |
1096 | MCInst *CondBranch = nullptr; |
1097 | MCInst *UncondBranch = nullptr; |
1098 | bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); |
1099 | |
1100 | // analyzeBranch() can fail due to unusual branch instructions, |
1101 | // e.g. jrcxz, or jump tables (indirect jump). |
1102 | if (!Result || !CondBranch) |
1103 | continue; |
1104 | |
1105 | BB.removeDuplicateConditionalSuccessor(CondBranch); |
1106 | ++NumUselessCondBranches; |
1107 | } |
1108 | } |
1109 | |
1110 | Error Peepholes::runOnFunctions(BinaryContext &BC) { |
1111 | const char Opts = |
1112 | std::accumulate(first: opts::Peepholes.begin(), last: opts::Peepholes.end(), init: 0, |
1113 | binary_op: [](const char A, const PeepholeOpts B) { return A | B; }); |
1114 | if (Opts == PEEP_NONE) |
1115 | return Error::success(); |
1116 | |
1117 | for (auto &It : BC.getBinaryFunctions()) { |
1118 | BinaryFunction &Function = It.second; |
1119 | if (shouldOptimize(BF: Function)) { |
1120 | if (Opts & PEEP_DOUBLE_JUMPS) |
1121 | NumDoubleJumps += fixDoubleJumps(Function, MarkInvalid: false); |
1122 | if (Opts & PEEP_TAILCALL_TRAPS) |
1123 | addTailcallTraps(Function); |
1124 | if (Opts & PEEP_USELESS_BRANCHES) |
1125 | removeUselessCondBranches(Function); |
1126 | assert(Function.validateCFG()); |
1127 | } |
1128 | } |
1129 | BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps |
1130 | << " double jumps patched.\n" |
1131 | << "BOLT-INFO: Peephole: " << TailCallTraps |
1132 | << " tail call traps inserted.\n" |
1133 | << "BOLT-INFO: Peephole: " << NumUselessCondBranches |
1134 | << " useless conditional branches removed.\n" ; |
1135 | return Error::success(); |
1136 | } |
1137 | |
// Replace loads whose target address can be computed statically and lies in
// a read-only section with an equivalent immediate-operand instruction.
// Returns true iff at least one load in BF was simplified.
bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  // Per-function statistics; folded into the pass-wide counters at the end.
  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address;
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(args&: DisplSymbol, args&: DisplOffset) =
            MIB->getTargetSymbolInfo(Expr: DispOpI->getExpr());

        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(Name: DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, Target&: TargetAddress)) {
        // Address could not be determined statically; leave the load alone.
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(Address: TargetAddress);
      if (!DataSection || DataSection->isWritable())
        continue;

      // A relocation against the target address means the value may change at
      // link/load time, so the load cannot be folded to a constant.
      if (BC.getRelocationAt(Address: TargetAddress) ||
          BC.getDynamicRelocationAt(Address: TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}
1219 | |
1220 | Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { |
1221 | for (auto &It : BC.getBinaryFunctions()) { |
1222 | BinaryFunction &Function = It.second; |
1223 | if (shouldOptimize(BF: Function) && simplifyRODataLoads(BF&: Function)) |
1224 | Modified.insert(x: &Function); |
1225 | } |
1226 | |
1227 | BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " |
1228 | << NumLoadsFound << " loads from a statically computed address.\n" |
1229 | << "BOLT-INFO: dynamic loads simplified: " |
1230 | << NumDynamicLoadsSimplified << "\n" |
1231 | << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound |
1232 | << "\n" ; |
1233 | return Error::success(); |
1234 | } |
1235 | |
1236 | Error AssignSections::runOnFunctions(BinaryContext &BC) { |
1237 | for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) { |
1238 | Function->setCodeSectionName(BC.getInjectedCodeSectionName()); |
1239 | Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); |
1240 | } |
1241 | |
1242 | // In non-relocation mode functions have pre-assigned section names. |
1243 | if (!BC.HasRelocations) |
1244 | return Error::success(); |
1245 | |
1246 | const bool UseColdSection = |
1247 | BC.NumProfiledFuncs > 0 || |
1248 | opts::ReorderFunctions == ReorderFunctions::RT_USER; |
1249 | for (auto &BFI : BC.getBinaryFunctions()) { |
1250 | BinaryFunction &Function = BFI.second; |
1251 | if (opts::isHotTextMover(Function)) { |
1252 | Function.setCodeSectionName(BC.getHotTextMoverSectionName()); |
1253 | Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); |
1254 | continue; |
1255 | } |
1256 | |
1257 | if (!UseColdSection || Function.hasValidIndex()) |
1258 | Function.setCodeSectionName(BC.getMainCodeSectionName()); |
1259 | else |
1260 | Function.setCodeSectionName(BC.getColdCodeSectionName()); |
1261 | |
1262 | if (Function.isSplit()) |
1263 | Function.setColdCodeSectionName(BC.getColdCodeSectionName()); |
1264 | } |
1265 | return Error::success(); |
1266 | } |
1267 | |
// Compute and print a "profile bias" score: the mean and standard deviation
// of the relative flow imbalance |outgoing - incoming| / incoming over the
// basic blocks of all non-empty simple functions. A perfectly consistent
// profile has zero imbalance on every block.
Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  // Worst per-function average bias, for the verbose report below.
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute mean
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      auto SuccBIIter = BB.branch_info_begin();
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        // Skip edges with missing or zero profile counts.
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute score for low frequency blocks, entry or exit blocks
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      Mean += fabs(x: Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    // Track the function with the highest average per-block bias.
    double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    // Reuse the flow maps filled during the first pass.
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(x: fabs(x: Difference / IncomingMap[&BB]) - FlowImbalanceMean, y: 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(x: FlowImbalanceVar);
  }

  // Report to user
  BC.outs() << format(Fmt: "BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                      Vals: (100.0 * FlowImbalanceMean), Vals: (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    BC.outs() << "Worst average bias observed in "
              << WorstBiasFunc->getPrintName() << "\n";
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
  return Error::success();
}
1361 | |
1362 | Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { |
1363 | uint64_t NumRegularFunctions = 0; |
1364 | uint64_t NumStaleProfileFunctions = 0; |
1365 | uint64_t NumAllStaleFunctions = 0; |
1366 | uint64_t NumInferredFunctions = 0; |
1367 | uint64_t NumNonSimpleProfiledFunctions = 0; |
1368 | uint64_t NumUnknownControlFlowFunctions = 0; |
1369 | uint64_t TotalSampleCount = 0; |
1370 | uint64_t StaleSampleCount = 0; |
1371 | uint64_t InferredSampleCount = 0; |
1372 | std::vector<const BinaryFunction *> ProfiledFunctions; |
1373 | const char * = "BOLT-INFO: Functions with stale profile:\n" ; |
1374 | for (auto &BFI : BC.getBinaryFunctions()) { |
1375 | const BinaryFunction &Function = BFI.second; |
1376 | |
1377 | // Ignore PLT functions for stats. |
1378 | if (Function.isPLTFunction()) |
1379 | continue; |
1380 | |
1381 | ++NumRegularFunctions; |
1382 | |
1383 | if (!Function.isSimple()) { |
1384 | if (Function.hasProfile()) |
1385 | ++NumNonSimpleProfiledFunctions; |
1386 | continue; |
1387 | } |
1388 | |
1389 | if (Function.hasUnknownControlFlow()) { |
1390 | if (opts::PrintUnknownCFG) |
1391 | Function.dump(); |
1392 | else if (opts::PrintUnknown) |
1393 | BC.errs() << "function with unknown control flow: " << Function << '\n'; |
1394 | |
1395 | ++NumUnknownControlFlowFunctions; |
1396 | } |
1397 | |
1398 | if (!Function.hasProfile()) |
1399 | continue; |
1400 | |
1401 | uint64_t SampleCount = Function.getRawBranchCount(); |
1402 | TotalSampleCount += SampleCount; |
1403 | |
1404 | if (Function.hasValidProfile()) { |
1405 | ProfiledFunctions.push_back(x: &Function); |
1406 | if (Function.hasInferredProfile()) { |
1407 | ++NumInferredFunctions; |
1408 | InferredSampleCount += SampleCount; |
1409 | ++NumAllStaleFunctions; |
1410 | } |
1411 | } else { |
1412 | if (opts::ReportStaleFuncs) { |
1413 | BC.outs() << StaleFuncsHeader; |
1414 | StaleFuncsHeader = "" ; |
1415 | BC.outs() << " " << Function << '\n'; |
1416 | } |
1417 | ++NumStaleProfileFunctions; |
1418 | StaleSampleCount += SampleCount; |
1419 | ++NumAllStaleFunctions; |
1420 | } |
1421 | } |
1422 | BC.NumProfiledFuncs = ProfiledFunctions.size(); |
1423 | BC.NumStaleProfileFuncs = NumStaleProfileFunctions; |
1424 | |
1425 | const size_t NumAllProfiledFunctions = |
1426 | ProfiledFunctions.size() + NumStaleProfileFunctions; |
1427 | BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of " |
1428 | << NumRegularFunctions << " functions in the binary (" |
1429 | << format(Fmt: "%.1f" , Vals: NumAllProfiledFunctions / |
1430 | (float)NumRegularFunctions * 100.0f) |
1431 | << "%) have non-empty execution profile\n" ; |
1432 | if (NumNonSimpleProfiledFunctions) { |
1433 | BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function" |
1434 | << (NumNonSimpleProfiledFunctions == 1 ? "" : "s" ) |
1435 | << " with profile could not be optimized\n" ; |
1436 | } |
1437 | if (NumAllStaleFunctions) { |
1438 | const float PctStale = |
1439 | NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f; |
1440 | const float PctStaleFuncsWithEqualBlockCount = |
1441 | (float)BC.Stats.NumStaleFuncsWithEqualBlockCount / |
1442 | NumAllStaleFunctions * 100.0f; |
1443 | const float PctStaleBlocksWithEqualIcount = |
1444 | (float)BC.Stats.NumStaleBlocksWithEqualIcount / |
1445 | BC.Stats.NumStaleBlocks * 100.0f; |
1446 | auto printErrorOrWarning = [&]() { |
1447 | if (PctStale > opts::StaleThreshold) |
1448 | BC.errs() << "BOLT-ERROR: " ; |
1449 | else |
1450 | BC.errs() << "BOLT-WARNING: " ; |
1451 | }; |
1452 | printErrorOrWarning(); |
1453 | BC.errs() << NumAllStaleFunctions |
1454 | << format(Fmt: " (%.1f%% of all profiled)" , Vals: PctStale) << " function" |
1455 | << (NumAllStaleFunctions == 1 ? "" : "s" ) |
1456 | << " have invalid (possibly stale) profile." |
1457 | " Use -report-stale to see the list.\n" ; |
1458 | if (TotalSampleCount > 0) { |
1459 | printErrorOrWarning(); |
1460 | BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of " |
1461 | << TotalSampleCount << " samples in the binary (" |
1462 | << format(Fmt: "%.1f" , |
1463 | Vals: ((100.0f * (StaleSampleCount + InferredSampleCount)) / |
1464 | TotalSampleCount)) |
1465 | << "%) belong to functions with invalid" |
1466 | " (possibly stale) profile.\n" ; |
1467 | } |
1468 | BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount |
1469 | << " stale function" |
1470 | << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s" ) |
1471 | << format(Fmt: " (%.1f%% of all stale)" , |
1472 | Vals: PctStaleFuncsWithEqualBlockCount) |
1473 | << " have matching block count.\n" ; |
1474 | BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount |
1475 | << " stale block" |
1476 | << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s" ) |
1477 | << format(Fmt: " (%.1f%% of all stale)" , Vals: PctStaleBlocksWithEqualIcount) |
1478 | << " have matching icount.\n" ; |
1479 | if (PctStale > opts::StaleThreshold) { |
1480 | return createFatalBOLTError( |
1481 | S: Twine("BOLT-ERROR: stale functions exceed specified threshold of " ) + |
1482 | Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n" )); |
1483 | } |
1484 | } |
1485 | if (NumInferredFunctions) { |
1486 | BC.outs() << format( |
1487 | Fmt: "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, " |
1488 | "%.2f%% of stale) functions responsible for %.2f%% samples" |
1489 | " (%zu out of %zu)\n" , |
1490 | Vals: NumInferredFunctions, |
1491 | Vals: 100.0 * NumInferredFunctions / NumAllProfiledFunctions, |
1492 | Vals: 100.0 * NumInferredFunctions / NumAllStaleFunctions, |
1493 | Vals: 100.0 * InferredSampleCount / TotalSampleCount, Vals: InferredSampleCount, |
1494 | Vals: TotalSampleCount); |
1495 | BC.outs() << format( |
1496 | Fmt: "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" |
1497 | " (%zu out of %zu stale) responsible for %.2f%% samples" |
1498 | " (%zu out of %zu stale)\n" , |
1499 | Vals: 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks, |
1500 | Vals: BC.Stats.NumMatchedBlocks, Vals: BC.Stats.NumStaleBlocks, |
1501 | Vals: 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount, |
1502 | Vals: BC.Stats.MatchedSampleCount, Vals: BC.Stats.StaleSampleCount); |
1503 | } |
1504 | |
1505 | if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { |
1506 | BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects |
1507 | << " objects was ignored\n" ; |
1508 | } |
1509 | |
1510 | if (ProfiledFunctions.size() > 10) { |
1511 | if (opts::Verbosity >= 1) { |
1512 | BC.outs() << "BOLT-INFO: top called functions are:\n" ; |
1513 | llvm::sort(C&: ProfiledFunctions, |
1514 | Comp: [](const BinaryFunction *A, const BinaryFunction *B) { |
1515 | return B->getExecutionCount() < A->getExecutionCount(); |
1516 | }); |
1517 | auto SFI = ProfiledFunctions.begin(); |
1518 | auto SFIend = ProfiledFunctions.end(); |
1519 | for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; |
1520 | ++SFI, ++I) |
1521 | BC.outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() |
1522 | << '\n'; |
1523 | } |
1524 | } |
1525 | |
1526 | if (!opts::PrintSortedBy.empty()) { |
1527 | std::vector<BinaryFunction *> Functions; |
1528 | std::map<const BinaryFunction *, DynoStats> Stats; |
1529 | |
1530 | for (auto &BFI : BC.getBinaryFunctions()) { |
1531 | BinaryFunction &BF = BFI.second; |
1532 | if (shouldOptimize(BF) && BF.hasValidProfile()) { |
1533 | Functions.push_back(x: &BF); |
1534 | Stats.emplace(args: &BF, args: getDynoStats(BF)); |
1535 | } |
1536 | } |
1537 | |
1538 | const bool SortAll = |
1539 | llvm::is_contained(Range&: opts::PrintSortedBy, Element: DynoStats::LAST_DYNO_STAT); |
1540 | |
1541 | const bool Ascending = |
1542 | opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; |
1543 | |
1544 | if (SortAll) { |
1545 | llvm::stable_sort(Range&: Functions, |
1546 | C: [Ascending, &Stats](const BinaryFunction *A, |
1547 | const BinaryFunction *B) { |
1548 | return Ascending ? Stats.at(k: A) < Stats.at(k: B) |
1549 | : Stats.at(k: B) < Stats.at(k: A); |
1550 | }); |
1551 | } else { |
1552 | llvm::stable_sort( |
1553 | Range&: Functions, C: [Ascending, &Stats](const BinaryFunction *A, |
1554 | const BinaryFunction *B) { |
1555 | const DynoStats &StatsA = Stats.at(k: A); |
1556 | const DynoStats &StatsB = Stats.at(k: B); |
1557 | return Ascending ? StatsA.lessThan(Other: StatsB, Keys: opts::PrintSortedBy) |
1558 | : StatsB.lessThan(Other: StatsA, Keys: opts::PrintSortedBy); |
1559 | }); |
1560 | } |
1561 | |
1562 | BC.outs() << "BOLT-INFO: top functions sorted by " ; |
1563 | if (SortAll) { |
1564 | BC.outs() << "dyno stats" ; |
1565 | } else { |
1566 | BC.outs() << "(" ; |
1567 | bool PrintComma = false; |
1568 | for (const DynoStats::Category Category : opts::PrintSortedBy) { |
1569 | if (PrintComma) |
1570 | BC.outs() << ", " ; |
1571 | BC.outs() << DynoStats::Description(C: Category); |
1572 | PrintComma = true; |
1573 | } |
1574 | BC.outs() << ")" ; |
1575 | } |
1576 | |
1577 | BC.outs() << " are:\n" ; |
1578 | auto SFI = Functions.begin(); |
1579 | for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { |
1580 | const DynoStats Stats = getDynoStats(BF&: **SFI); |
1581 | BC.outs() << " " << **SFI; |
1582 | if (!SortAll) { |
1583 | BC.outs() << " (" ; |
1584 | bool PrintComma = false; |
1585 | for (const DynoStats::Category Category : opts::PrintSortedBy) { |
1586 | if (PrintComma) |
1587 | BC.outs() << ", " ; |
1588 | BC.outs() << dynoStatsOptName(C: Category) << "=" << Stats[Category]; |
1589 | PrintComma = true; |
1590 | } |
1591 | BC.outs() << ")" ; |
1592 | } |
1593 | BC.outs() << "\n" ; |
1594 | } |
1595 | } |
1596 | |
1597 | if (!BC.TrappedFunctions.empty()) { |
1598 | BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function" |
1599 | << (BC.TrappedFunctions.size() > 1 ? "s" : "" ) |
1600 | << " will trap on entry. Use -trap-avx512=0 to disable" |
1601 | " traps." ; |
1602 | if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) { |
1603 | BC.errs() << '\n'; |
1604 | for (const BinaryFunction *Function : BC.TrappedFunctions) |
1605 | BC.errs() << " " << *Function << '\n'; |
1606 | } else { |
1607 | BC.errs() << " Use -v=1 to see the list.\n" ; |
1608 | } |
1609 | } |
1610 | |
1611 | // Print information on missed macro-fusion opportunities seen on input. |
1612 | if (BC.Stats.MissedMacroFusionPairs) { |
1613 | BC.outs() << format( |
1614 | Fmt: "BOLT-INFO: the input contains %zu (dynamic count : %zu)" |
1615 | " opportunities for macro-fusion optimization" , |
1616 | Vals: BC.Stats.MissedMacroFusionPairs, Vals: BC.Stats.MissedMacroFusionExecCount); |
1617 | switch (opts::AlignMacroOpFusion) { |
1618 | case MFT_NONE: |
1619 | BC.outs() << ". Use -align-macro-fusion to fix.\n" ; |
1620 | break; |
1621 | case MFT_HOT: |
1622 | BC.outs() << ". Will fix instances on a hot path.\n" ; |
1623 | break; |
1624 | case MFT_ALL: |
1625 | BC.outs() << " that are going to be fixed\n" ; |
1626 | break; |
1627 | } |
1628 | } |
1629 | |
1630 | // Collect and print information about suboptimal code layout on input. |
1631 | if (opts::ReportBadLayout) { |
1632 | std::vector<BinaryFunction *> SuboptimalFuncs; |
1633 | for (auto &BFI : BC.getBinaryFunctions()) { |
1634 | BinaryFunction &BF = BFI.second; |
1635 | if (!BF.hasValidProfile()) |
1636 | continue; |
1637 | |
1638 | const uint64_t HotThreshold = |
1639 | std::max<uint64_t>(a: BF.getKnownExecutionCount(), b: 1); |
1640 | bool HotSeen = false; |
1641 | for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) { |
1642 | if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { |
1643 | HotSeen = true; |
1644 | continue; |
1645 | } |
1646 | if (HotSeen && BB->getKnownExecutionCount() == 0) { |
1647 | SuboptimalFuncs.push_back(x: &BF); |
1648 | break; |
1649 | } |
1650 | } |
1651 | } |
1652 | |
1653 | if (!SuboptimalFuncs.empty()) { |
1654 | llvm::sort(C&: SuboptimalFuncs, |
1655 | Comp: [](const BinaryFunction *A, const BinaryFunction *B) { |
1656 | return A->getKnownExecutionCount() / A->getSize() > |
1657 | B->getKnownExecutionCount() / B->getSize(); |
1658 | }); |
1659 | |
1660 | BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size() |
1661 | << " functions have " |
1662 | "cold code in the middle of hot code. Top functions are:\n" ; |
1663 | for (unsigned I = 0; |
1664 | I < std::min(a: static_cast<size_t>(opts::ReportBadLayout), |
1665 | b: SuboptimalFuncs.size()); |
1666 | ++I) |
1667 | SuboptimalFuncs[I]->print(OS&: BC.outs()); |
1668 | } |
1669 | } |
1670 | |
1671 | if (NumUnknownControlFlowFunctions) { |
1672 | BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions |
1673 | << " functions have instructions with unknown control flow" ; |
1674 | if (!opts::PrintUnknown) |
1675 | BC.outs() << ". Use -print-unknown to see the list." ; |
1676 | BC.outs() << '\n'; |
1677 | } |
1678 | return Error::success(); |
1679 | } |
1680 | |
1681 | Error InstructionLowering::runOnFunctions(BinaryContext &BC) { |
1682 | for (auto &BFI : BC.getBinaryFunctions()) |
1683 | for (BinaryBasicBlock &BB : BFI.second) |
1684 | for (MCInst &Instruction : BB) |
1685 | BC.MIB->lowerTailCall(Inst&: Instruction); |
1686 | return Error::success(); |
1687 | } |
1688 | |
1689 | Error StripRepRet::runOnFunctions(BinaryContext &BC) { |
1690 | if (!BC.isX86()) |
1691 | return Error::success(); |
1692 | |
1693 | uint64_t NumPrefixesRemoved = 0; |
1694 | uint64_t NumBytesSaved = 0; |
1695 | for (auto &BFI : BC.getBinaryFunctions()) { |
1696 | for (BinaryBasicBlock &BB : BFI.second) { |
1697 | auto LastInstRIter = BB.getLastNonPseudo(); |
1698 | if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(Inst: *LastInstRIter) || |
1699 | !BC.MIB->deleteREPPrefix(Inst&: *LastInstRIter)) |
1700 | continue; |
1701 | |
1702 | NumPrefixesRemoved += BB.getKnownExecutionCount(); |
1703 | ++NumBytesSaved; |
1704 | } |
1705 | } |
1706 | |
1707 | if (NumBytesSaved) |
1708 | BC.outs() << "BOLT-INFO: removed " << NumBytesSaved |
1709 | << " 'repz' prefixes" |
1710 | " with estimated execution count of " |
1711 | << NumPrefixesRemoved << " times.\n" ; |
1712 | return Error::success(); |
1713 | } |
1714 | |
Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
  // Replace calls to memcpy() with an inline instruction sequence. The
  // inline expansion is produced by the target MIB and is only implemented
  // for x86.
  if (!BC.isX86())
    return Error::success();

  // Static number of call sites inlined and their profile-estimated dynamic
  // execution count (used only for reporting).
  uint64_t NumInlined = 0;
  uint64_t NumInlinedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      for (auto II = BB.begin(); II != BB.end(); ++II) {
        MCInst &Inst = *II;

        // Only direct calls with a single symbolic target operand are
        // candidates for inlining.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(i: 0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" &&
            CalleeSymbol->getName() != "_memcpy8" )
          continue;

        // _memcpy8 is a variant whose return-value convention differs from
        // plain memcpy (signaled to createInlineMemcpy via ReturnEnd).
        const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8" );
        const bool IsTailCall = BC.MIB->isTailCall(Inst);

        // Swap the call for the inline sequence, then move the iterator to
        // the last inserted instruction so the loop resumes after it.
        const InstructionListType NewCode =
            BC.MIB->createInlineMemcpy(ReturnEnd: IsMemcpy8);
        II = BB.replaceInstruction(II, Replacement: NewCode);
        std::advance(i&: II, n: NewCode.size() - 1);
        if (IsTailCall) {
          // A tail call implicitly returned; after inlining we must emit an
          // explicit return to preserve control flow.
          MCInst Return;
          BC.MIB->createReturn(Inst&: Return);
          II = BB.insertInstruction(At: std::next(x: II), NewInst: std::move(Return));
        }

        ++NumInlined;
        NumInlinedDyno += BB.getKnownExecutionCount();
      }
    }
  }

  if (NumInlined) {
    BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls" ;
    if (NumInlinedDyno)
      BC.outs() << ". The calls were executed " << NumInlinedDyno
                << " times based on profile." ;
    BC.outs() << '\n';
  }
  return Error::success();
}
1764 | |
1765 | bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { |
1766 | if (!BinaryFunctionPass::shouldOptimize(BF: Function)) |
1767 | return false; |
1768 | |
1769 | for (const std::string &FunctionSpec : Spec) { |
1770 | StringRef FunctionName = StringRef(FunctionSpec).split(Separator: ':').first; |
1771 | if (Function.hasNameRegex(NameRegex: FunctionName)) |
1772 | return true; |
1773 | } |
1774 | |
1775 | return false; |
1776 | } |
1777 | |
1778 | std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize( |
1779 | const BinaryFunction &Function) const { |
1780 | StringRef SitesString; |
1781 | for (const std::string &FunctionSpec : Spec) { |
1782 | StringRef FunctionName; |
1783 | std::tie(args&: FunctionName, args&: SitesString) = StringRef(FunctionSpec).split(Separator: ':'); |
1784 | if (Function.hasNameRegex(NameRegex: FunctionName)) |
1785 | break; |
1786 | SitesString = "" ; |
1787 | } |
1788 | |
1789 | std::set<size_t> Sites; |
1790 | SmallVector<StringRef, 4> SitesVec; |
1791 | SitesString.split(A&: SitesVec, Separator: ':'); |
1792 | for (StringRef SiteString : SitesVec) { |
1793 | if (SiteString.empty()) |
1794 | continue; |
1795 | size_t Result; |
1796 | if (!SiteString.getAsInteger(Radix: 10, Result)) |
1797 | Sites.emplace(args&: Result); |
1798 | } |
1799 | |
1800 | return Sites; |
1801 | } |
1802 | |
Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  // Specialize memcpy() calls for the common size==1 case: each selected
  // call site is rewritten into a size check that branches to an inline
  // one-byte copy, falling back to the original memcpy() call otherwise.
  // The rewrite sequence is x86-specific.
  if (!BC.isX86())
    return Error::success();

  // Static and profile-estimated dynamic counts, for reporting only.
  uint64_t NumSpecialized = 0;
  uint64_t NumSpecializedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    // Empty set means "all call sites in this function".
    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(x: N);
    };

    // Snapshot the block list: the loop below adds/splits blocks, which
    // would invalidate iteration over the function's live block sequence.
    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        // Only direct calls with a single symbolic target are candidates.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(i: 0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" )
          continue;

        // Tail calls have no fall-through to split into; skip them.
        if (BC.MIB->isTailCall(Inst))
          continue;

        // Call-site IDs are 1-based and count every non-tail memcpy call in
        // layout order, matching the IDs given in the -memcpy1-spec option.
        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        // Split so the call starts its own block; that block will become
        // the inline one-byte-copy path.
        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        // Carve out the code following the call into NextBB (the join
        // point). If the call was alone in its block, the join point is the
        // existing fall-through successor.
        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          NextBB = OneByteMemcpyBB->splitAt(II: OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(II: NextBB->begin());
        } else {
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(II: OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return" );
        }

        // MemcpyBB holds the original (fallback) call for sizes != 1.
        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        // Compare the size argument (3rd integer arg register) against 1 and
        // jump to the one-byte path on equality; otherwise fall through to
        // the real call.
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(RegNo: BC.MIB->getIntArgRegister(ArgNo: 2), Imm: 1,
                                Target: OneByteMemcpyBB->getLabel(), Ctx: BC.Ctx.get());
        CurBB->addInstructions(R: CmpJCC);
        CurBB->addSuccessor(Succ: MemcpyBB);

        MemcpyBB->addInstruction(Inst: std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(Succ: NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        // Populate the fast path with the inline one-byte copy sequence.
        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(R: OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        // Resume scanning at the join block, past the rewritten region.
        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    BC.outs() << "BOLT-INFO: specialized " << NumSpecialized
              << " memcpy() call sites for size 1" ;
    if (NumSpecializedDyno)
      BC.outs() << ". The calls were executed " << NumSpecializedDyno
                << " times based on profile." ;
    BC.outs() << '\n';
  }
  return Error::success();
}
1899 | |
1900 | void RemoveNops::runOnFunction(BinaryFunction &BF) { |
1901 | const BinaryContext &BC = BF.getBinaryContext(); |
1902 | for (BinaryBasicBlock &BB : BF) { |
1903 | for (int64_t I = BB.size() - 1; I >= 0; --I) { |
1904 | MCInst &Inst = BB.getInstructionAtIndex(Index: I); |
1905 | if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, Name: "NOP" )) |
1906 | BB.eraseInstructionAtIndex(Index: I); |
1907 | } |
1908 | } |
1909 | } |
1910 | |
1911 | Error RemoveNops::runOnFunctions(BinaryContext &BC) { |
1912 | ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
1913 | runOnFunction(BF); |
1914 | }; |
1915 | |
1916 | ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { |
1917 | return BF.shouldPreserveNops(); |
1918 | }; |
1919 | |
1920 | ParallelUtilities::runOnEachFunction( |
1921 | BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFunction: WorkFun, |
1922 | SkipPredicate: SkipFunc, LogName: "RemoveNops" ); |
1923 | return Error::success(); |
1924 | } |
1925 | |
1926 | } // namespace bolt |
1927 | } // namespace llvm |
1928 | |