| 1 | //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements multiple passes for binary optimization and analysis. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/Support/CommandLine.h"
#include <atomic>
#include <map>
#include <mutex>
#include <numeric>
#include <vector>
| 24 | |
| 25 | #define DEBUG_TYPE "bolt-opts" |
| 26 | |
| 27 | using namespace llvm; |
| 28 | using namespace bolt; |
| 29 | |
/// Return the command-line option name for dyno-stats category \p C by
/// converting its human-readable description to dashed form (spaces become
/// '-'). The result points into a function-local static buffer, which keeps
/// the pointer valid for the lifetime of the process -- required because
/// cl::values() stores the raw pointer without copying it.
static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
  assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
         C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");

  // One cached slot per category; rewritten on each call. Option-table
  // construction is single-threaded, so no synchronization is needed here.
  static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];

  OptNames[C] = bolt::DynoStats::Description(C);

  // Option names cannot contain spaces.
  llvm::replace(OptNames[C], ' ', '-');

  return OptNames[C].c_str();
}
| 42 | |
namespace opts {

// Option categories and shared options defined elsewhere in BOLT.
extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;

extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> EnableBAT;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<bool> UpdateDebugSections;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;

// Sort direction used when printing functions ranked by dyno stats.
enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));

cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));

/// Return true if \p Function originates from one of the sections listed in
/// -hot-text-move-sections.
bool isHotTextMover(const BinaryFunction &Function) {
  for (std::string &SectionName : opts::HotTextMoveSections) {
    if (Function.getOriginSectionName() &&
        *Function.getOriginSectionName() == SectionName)
      return true;
  }

  return false;
}

static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

// Number of top-scoring functions to report in detail after basic block
// reordering; 0 disables the report.
static cl::opt<unsigned>
PrintFuncStat("print-function-statistics",
  cl::desc("print statistics about basic block ordering"),
  cl::init(0), cl::cat(BoltOptCategory));

static cl::opt<bool> PrintLargeFunctions(
    "print-large-functions",
    cl::desc("print functions that could not be overwritten due to excessive "
             "size"),
    cl::init(false), cl::cat(BoltOptCategory));

// One enum value is generated per dyno-stats category; dynoStatsOptName()
// converts each category description into the dashed option spelling.
static cl::list<bolt::DynoStats::Category>
PrintSortedBy("print-sorted-by", cl::CommaSeparated,
              cl::desc("print functions sorted by order of dyno stats"),
              cl::value_desc("key1,key2,key3,..."),
              cl::values(
#define D(name, description, ...)                                              \
  clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),   \
             description),
                  REAL_DYNO_STATS
#undef D
                  clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
                             "sorted by all names")),
              cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintUnknown("print-unknown",
  cl::desc("print names of functions with unknown control flow"),
  cl::cat(BoltCategory), cl::Hidden);

static cl::opt<bool>
PrintUnknownCFG("print-unknown-cfg",
  cl::desc("dump CFG of functions with unknown control flow"),
  cl::cat(BoltCategory), cl::ReallyHidden);

// Please MSVC19 with a forward declaration: otherwise it reports an error about
// an undeclared variable inside a callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    // "cache+" is deprecated: warn and transparently rewrite it to "ext-tsp"
    // at parse time, so the rest of the code never sees LT_OPTIMIZE_CACHE_PLUS.
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
               << " use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
ReportStaleFuncs("report-stale",
  cl::desc("print the list of functions with stale profile"),
  cl::Hidden, cl::cat(BoltOptCategory));

// Modes for the SimplifyConditionalTailCalls (SCTC) pass.
enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
             clEnumValN(SctcPreserveDirection,
                        "preserve",
                        "only perform sctc when branch direction is "
                        "preserved"),
             clEnumValN(SctcHeuristic,
                        "heuristic",
                        "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
StaleThreshold("stale-threshold",
  cl::desc(
      "maximum percentage of stale functions to tolerate (default: 100)"),
  cl::init(100),
  cl::Hidden,
  cl::cat(BoltOptCategory));

// Functions at or below this block count get an exact TSP layout solution;
// larger functions fall back to greedy clustering heuristics.
static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));

// Profile density options, synced with llvm-profgen/ProfileGenerator.cpp
static cl::opt<int> ProfileDensityCutOffHot(
    "profile-density-cutoff-hot", cl::init(990000),
    cl::desc("Total samples cutoff for functions used to calculate "
             "profile density."));

static cl::opt<double> ProfileDensityThreshold(
    "profile-density-threshold", cl::init(60),
    cl::desc("If the profile density is below the given threshold, it "
             "will be suggested to increase the sampling rate."),
    cl::Optional);

} // namespace opts
| 240 | |
| 241 | namespace llvm { |
| 242 | namespace bolt { |
| 243 | |
| 244 | bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const { |
| 245 | return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG && |
| 246 | !BF.isIgnored(); |
| 247 | } |
| 248 | |
| 249 | bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { |
| 250 | return BF.isSimple() && !BF.isIgnored(); |
| 251 | } |
| 252 | |
void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  uint64_t NeedsFixBranches = 0;
  // Pass 1: eliminate empty basic blocks by splicing their predecessors
  // directly onto their unique successor.
  for (BinaryBasicBlock &BB : BF) {
    if (!BB.empty())
      continue;

    // Entry points and landing pads may be referenced externally; keep them
    // even when empty.
    if (BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      // Jump-table edges cannot be retargeted here; stop and keep the block.
      if (Predecessor->hasJumpTable())
        break;

      // Redirecting would create a self-loop on the successor; stop.
      if (Predecessor == Successor)
        break;

      // Carry the original edge's profile counts over to the new edge.
      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
      Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
                                    BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    // Remove the block only if every predecessor was redirected.
    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  // Fold per-function results into pass-wide statistics.
  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}
| 321 | |
| 322 | Error NormalizeCFG::runOnFunctions(BinaryContext &BC) { |
| 323 | ParallelUtilities::runOnEachFunction( |
| 324 | BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, |
| 325 | WorkFunction: [&](BinaryFunction &BF) { runOnFunction(BF); }, |
| 326 | SkipPredicate: [&](const BinaryFunction &BF) { return !shouldOptimize(BF); }, |
| 327 | LogName: "NormalizeCFG" ); |
| 328 | if (NumBlocksRemoved) |
| 329 | BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block" |
| 330 | << (NumBlocksRemoved == 1 ? "" : "s" ) << '\n'; |
| 331 | if (NumDuplicateEdgesMerged) |
| 332 | BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged |
| 333 | << " duplicate CFG edge" |
| 334 | << (NumDuplicateEdgesMerged == 1 ? "" : "s" ) << '\n'; |
| 335 | return Error::success(); |
| 336 | } |
| 337 | |
void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
  BinaryContext &BC = Function.getBinaryContext();
  unsigned Count;
  uint64_t Bytes;
  // Flag blocks not reachable from any entry point as invalid.
  Function.markUnreachableBlocks();
  LLVM_DEBUG({
    for (BinaryBasicBlock &BB : Function) {
      if (!BB.isValid()) {
        dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
               << " in function " << Function << "\n";
        Function.dump();
      }
    }
  });
  // Use a private code emitter: this runs from a parallel worker (see
  // runOnFunctions) and the shared emitter is not safe to use concurrently.
  BinaryContext::IndependentCodeEmitter Emitter =
      BC.createIndependentMCCodeEmitter();
  std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
  // NOTE(review): these counters are updated outside the lock below —
  // presumably declared atomic in the pass; confirm in BinaryPasses.h.
  DeletedBlocks += Count;
  DeletedBytes += Bytes;
  if (Count) {
    // The lock guards the shared Modified set and the output stream.
    auto L = BC.scopeLock();
    Modified.insert(&Function);
    if (opts::Verbosity > 0)
      BC.outs() << "BOLT-INFO: removed " << Count
                << " dead basic block(s) accounting for " << Bytes
                << " bytes in function " << Function << '\n';
  }
}
| 366 | |
| 367 | Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { |
| 368 | ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
| 369 | runOnFunction(Function&: BF); |
| 370 | }; |
| 371 | |
| 372 | ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { |
| 373 | return !shouldOptimize(BF) || BF.getLayout().block_empty(); |
| 374 | }; |
| 375 | |
| 376 | ParallelUtilities::runOnEachFunction( |
| 377 | BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFunction: WorkFun, |
| 378 | SkipPredicate, LogName: "elimininate-unreachable" ); |
| 379 | |
| 380 | if (DeletedBlocks) |
| 381 | BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " |
| 382 | << DeletedBytes << " bytes of code\n" ; |
| 383 | return Error::success(); |
| 384 | } |
| 385 | |
| 386 | bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { |
| 387 | return (BinaryFunctionPass::shouldPrint(BF) && |
| 388 | opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); |
| 389 | } |
| 390 | |
| 391 | bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const { |
| 392 | // Apply execution count threshold |
| 393 | if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) |
| 394 | return false; |
| 395 | |
| 396 | return BinaryFunctionPass::shouldOptimize(BF); |
| 397 | } |
| 398 | |
| 399 | Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) { |
| 400 | if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) |
| 401 | return Error::success(); |
| 402 | |
| 403 | std::atomic_uint64_t ModifiedFuncCount(0); |
| 404 | std::mutex FunctionEditDistanceMutex; |
| 405 | DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance; |
| 406 | |
| 407 | ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
| 408 | SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder; |
| 409 | if (opts::PrintFuncStat > 0) |
| 410 | llvm::copy(Range: BF.getLayout().blocks(), Out: std::back_inserter(x&: OldBlockOrder)); |
| 411 | |
| 412 | const bool LayoutChanged = |
| 413 | modifyFunctionLayout(Function&: BF, Type: opts::ReorderBlocks, MinBranchClusters: opts::MinBranchClusters); |
| 414 | if (LayoutChanged) { |
| 415 | ModifiedFuncCount.fetch_add(i: 1, m: std::memory_order_relaxed); |
| 416 | if (opts::PrintFuncStat > 0) { |
| 417 | const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder); |
| 418 | std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex); |
| 419 | FunctionEditDistance[&BF] = Distance; |
| 420 | } |
| 421 | } |
| 422 | }; |
| 423 | |
| 424 | ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { |
| 425 | return !shouldOptimize(BF); |
| 426 | }; |
| 427 | |
| 428 | ParallelUtilities::runOnEachFunction( |
| 429 | BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFunction: WorkFun, SkipPredicate: SkipFunc, |
| 430 | LogName: "ReorderBasicBlocks" ); |
| 431 | const size_t NumAllProfiledFunctions = |
| 432 | BC.NumProfiledFuncs + BC.NumStaleProfileFuncs; |
| 433 | |
| 434 | BC.outs() << "BOLT-INFO: basic block reordering modified layout of " |
| 435 | << format( |
| 436 | Fmt: "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n" , |
| 437 | Vals: ModifiedFuncCount.load(m: std::memory_order_relaxed), |
| 438 | Vals: 100.0 * ModifiedFuncCount.load(m: std::memory_order_relaxed) / |
| 439 | NumAllProfiledFunctions, |
| 440 | Vals: 100.0 * ModifiedFuncCount.load(m: std::memory_order_relaxed) / |
| 441 | BC.getBinaryFunctions().size()); |
| 442 | |
| 443 | if (opts::PrintFuncStat > 0) { |
| 444 | raw_ostream &OS = BC.outs(); |
| 445 | // Copy all the values into vector in order to sort them |
| 446 | std::map<uint64_t, BinaryFunction &> ScoreMap; |
| 447 | auto &BFs = BC.getBinaryFunctions(); |
| 448 | for (auto It = BFs.begin(); It != BFs.end(); ++It) |
| 449 | ScoreMap.insert(x: std::pair<uint64_t, BinaryFunction &>( |
| 450 | It->second.getFunctionScore(), It->second)); |
| 451 | |
| 452 | OS << "\nBOLT-INFO: Printing Function Statistics:\n\n" ; |
| 453 | OS << " There are " << BFs.size() << " functions in total. \n" ; |
| 454 | OS << " Number of functions being modified: " |
| 455 | << ModifiedFuncCount.load(m: std::memory_order_relaxed) << "\n" ; |
| 456 | OS << " User asks for detailed information on top " |
| 457 | << opts::PrintFuncStat << " functions. (Ranked by function score)" |
| 458 | << "\n\n" ; |
| 459 | uint64_t I = 0; |
| 460 | for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit = |
| 461 | ScoreMap.rbegin(); |
| 462 | Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) { |
| 463 | BinaryFunction &Function = Rit->second; |
| 464 | |
| 465 | OS << " Information for function of top: " << (I + 1) << ": \n" ; |
| 466 | OS << " Function Score is: " << Function.getFunctionScore() |
| 467 | << "\n" ; |
| 468 | OS << " There are " << Function.size() |
| 469 | << " number of blocks in this function.\n" ; |
| 470 | OS << " There are " << Function.getInstructionCount() |
| 471 | << " number of instructions in this function.\n" ; |
| 472 | OS << " The edit distance for this function is: " |
| 473 | << FunctionEditDistance.lookup(Val: &Function) << "\n\n" ; |
| 474 | } |
| 475 | } |
| 476 | return Error::success(); |
| 477 | } |
| 478 | |
bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
                                              LayoutType Type,
                                              bool MinBranchClusters) const {
  // Nothing to do for empty functions or when reordering is disabled.
  if (BF.size() == 0 || Type == LT_NONE)
    return false;

  BinaryFunction::BasicBlockOrderType NewLayout;
  std::unique_ptr<ReorderAlgorithm> Algo;

  // Cannot do optimal layout without profile.
  if (Type != LT_REVERSE && !BF.hasValidProfile())
    return false;

  if (Type == LT_REVERSE) {
    Algo.reset(new ReverseReorderAlgorithm());
  } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
    // Work on optimal solution if problem is small enough
    LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
    Algo.reset(new TSPReorderAlgorithm());
  } else {
    LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");

    // Heuristic layouts are driven by a block-clustering algorithm chosen by
    // the -min-branch-clusters flag.
    std::unique_ptr<ClusterAlgorithm> CAlgo;
    if (MinBranchClusters)
      CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
    else
      CAlgo.reset(new PHGreedyClusterAlgorithm());

    switch (Type) {
    case LT_OPTIMIZE:
      Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_BRANCH:
      Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_CACHE:
      Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_EXT_TSP:
      Algo.reset(new ExtTSPReorderAlgorithm());
      break;

    case LT_OPTIMIZE_SHUFFLE:
      Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
      break;

    default:
      // LT_OPTIMIZE_CACHE_PLUS is rewritten to LT_OPTIMIZE_EXT_TSP by the
      // -reorder-blocks command-line callback, so it never reaches here.
      llvm_unreachable("unexpected layout type");
    }
  }

  Algo->reorderBasicBlocks(BF, NewLayout);

  // Returns true only if the new order actually differs from the current one.
  return BF.getLayout().update(NewLayout);
}
| 537 | |
| 538 | Error FixupBranches::runOnFunctions(BinaryContext &BC) { |
| 539 | for (auto &It : BC.getBinaryFunctions()) { |
| 540 | BinaryFunction &Function = It.second; |
| 541 | if (!BC.shouldEmit(Function) || !Function.isSimple()) |
| 542 | continue; |
| 543 | |
| 544 | Function.fixBranches(); |
| 545 | } |
| 546 | return Error::success(); |
| 547 | } |
| 548 | |
Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
  std::atomic<bool> HasFatal{false};
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    if (!BF.finalizeCFIState()) {
      // In relocation mode every function must be emitted, so a CFI state we
      // cannot fix is fatal. Otherwise demote the function to non-simple and
      // leave its original body in place.
      if (BC.HasRelocations) {
        BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
                  << ". Exiting.\n";
        HasFatal = true;
        return;
      }
      BF.setSimple(false);
      return;
    }

    BF.setFinalized();

    // Update exception handling information.
    BF.updateEHRanges();
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !BC.shouldEmit(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "FinalizeFunctions");
  // Report the failure only after all workers have finished.
  if (HasFatal)
    return createFatalBOLTError("finalize CFI state failure");
  return Error::success();
}
| 580 | |
Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
  // In relocation mode functions can be relocated, so original size imposes
  // no constraint.
  if (BC.HasRelocations)
    return Error::success();

  // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
  // incorrect meta data.
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    uint64_t HotSize, ColdSize;
    std::tie(HotSize, ColdSize) =
        BC.calculateEmittedSize(BF, /*FixBranches=*/false);
    // Account for constant islands: alignment padding plus the islands
    // themselves are appended to the main (hot) fragment.
    uint64_t MainFragmentSize = HotSize;
    if (BF.hasIslandsInfo()) {
      MainFragmentSize +=
          offsetToAlignment(BF.getAddress() + MainFragmentSize,
                            Align(BF.getConstantIslandAlignment()));
      MainFragmentSize += BF.estimateConstantIslandSize();
    }
    if (MainFragmentSize > BF.getMaxSize()) {
      if (opts::PrintLargeFunctions)
        BC.outs() << "BOLT-INFO: " << BF << " size of " << MainFragmentSize
                  << " bytes exceeds allocated space by "
                  << (MainFragmentSize - BF.getMaxSize()) << " bytes\n";
      BF.setSimple(false);
    }
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "CheckLargeFunctions");

  return Error::success();
}
| 617 | |
| 618 | bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { |
| 619 | // Unlike other passes, allow functions in non-CFG state. |
| 620 | return BF.isSimple() && !BF.isIgnored(); |
| 621 | } |
| 622 | |
| 623 | Error LowerAnnotations::runOnFunctions(BinaryContext &BC) { |
| 624 | // Convert GnuArgsSize annotations into CFIs. |
| 625 | for (BinaryFunction *BF : BC.getAllBinaryFunctions()) { |
| 626 | for (FunctionFragment &FF : BF->getLayout().fragments()) { |
| 627 | // Reset at the start of the new fragment. |
| 628 | int64_t CurrentGnuArgsSize = 0; |
| 629 | |
| 630 | for (BinaryBasicBlock *const BB : FF) { |
| 631 | for (auto II = BB->begin(); II != BB->end(); ++II) { |
| 632 | if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(Inst: *II)) |
| 633 | continue; |
| 634 | |
| 635 | const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(Inst: *II); |
| 636 | assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size." ); |
| 637 | if (NewGnuArgsSize == CurrentGnuArgsSize) |
| 638 | continue; |
| 639 | |
| 640 | auto InsertII = BF->addCFIInstruction( |
| 641 | BB, Pos: II, |
| 642 | Inst: MCCFIInstruction::createGnuArgsSize(L: nullptr, Size: NewGnuArgsSize)); |
| 643 | CurrentGnuArgsSize = NewGnuArgsSize; |
| 644 | II = std::next(x: InsertII); |
| 645 | } |
| 646 | } |
| 647 | } |
| 648 | } |
| 649 | return Error::success(); |
| 650 | } |
| 651 | |
// Check for dirty state in MCSymbol objects that might be a consequence
// of running calculateEmittedSize() in parallel, during split functions
// pass. If an inconsistent state is found (symbol already registered or
// already defined), clean it.
Error CleanMCState::runOnFunctions(BinaryContext &BC) {
  MCContext &Ctx = *BC.Ctx;
  // Walk every symbol known to the shared MCContext.
  for (const auto &SymMapEntry : Ctx.getSymbols()) {
    const MCSymbol *S = SymMapEntry.getValue().Symbol;
    if (!S)
      continue;
    if (S->isDefined()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already defined\n");
      // const_cast: the symbol table iterator only hands out const pointers,
      // but we own the context and are merely resetting bookkeeping state.
      const_cast<MCSymbol *>(S)->setUndefined();
    }
    if (S->isRegistered()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already registered\n");
      const_cast<MCSymbol *>(S)->setIsRegistered(false);
    }
    // Variable symbols are only reported in debug output; they are left as-is.
    LLVM_DEBUG(if (S->isVariable()) {
      dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
    });
  }
  return Error::success();
}
| 678 | |
// This peephole fixes jump instructions that jump to another basic
// block with a single jump instruction, e.g.
//
// B0: ...
//     jmp  B1   (or jcc B1)
//
// B1: jmp  B2
//
// ->
//
// B0: ...
//     jmp  B2   (or jcc B2)
//
// Returns the number of double jumps removed. When \p MarkInvalid is set,
// blocks left without predecessors are invalidated for later removal.
static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    // Attempt to retarget predecessor \p Pred of BB directly to \p Succ (or,
    // when BB ends in a tail call, replace Pred's jump with a tail call to
    // \p SuccSym). Returns true if the edge was patched.
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym,
                             std::optional<uint32_t> Offset) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case. In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
          // Carry the original jump's offset annotation over to the new
          // tail call, when present.
          if (Offset) {
            MCInst *TailCall = Pred->getLastNonPseudoInstr();
            assert(TailCall);
            MIB->setOffset(*TailCall, *Offset);
          }
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    // Candidate middle blocks consist of a single non-pseudo instruction and
    // must not be landing pads.
    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    // Snapshot the predecessor list: checkAndPatch() mutates it while we
    // iterate.
    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) &&
            MarkInvalid)
          // Invalidate BB once it is no longer reachable from any predecessor
          // and is not externally referenced.
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}
| 797 | |
| 798 | bool SimplifyConditionalTailCalls::shouldRewriteBranch( |
| 799 | const BinaryBasicBlock *PredBB, const MCInst &CondBranch, |
| 800 | const BinaryBasicBlock *BB, const bool DirectionFlag) { |
| 801 | if (BeenOptimized.count(x: PredBB)) |
| 802 | return false; |
| 803 | |
| 804 | const bool IsForward = BinaryFunction::isForwardBranch(From: PredBB, To: BB); |
| 805 | |
| 806 | if (IsForward) |
| 807 | ++NumOrigForwardBranches; |
| 808 | else |
| 809 | ++NumOrigBackwardBranches; |
| 810 | |
| 811 | if (opts::SctcMode == opts::SctcAlways) |
| 812 | return true; |
| 813 | |
| 814 | if (opts::SctcMode == opts::SctcPreserveDirection) |
| 815 | return IsForward == DirectionFlag; |
| 816 | |
| 817 | const ErrorOr<std::pair<double, double>> Frequency = |
| 818 | PredBB->getBranchStats(Succ: BB); |
| 819 | |
| 820 | // It's ok to rewrite the conditional branch if the new target will be |
| 821 | // a backward branch. |
| 822 | |
| 823 | // If no data available for these branches, then it should be ok to |
| 824 | // do the optimization since it will reduce code size. |
| 825 | if (Frequency.getError()) |
| 826 | return true; |
| 827 | |
| 828 | // TODO: should this use misprediction frequency instead? |
| 829 | const bool Result = (IsForward && Frequency.get().first >= 0.5) || |
| 830 | (!IsForward && Frequency.get().first <= 0.5); |
| 831 | |
| 832 | return Result == DirectionFlag; |
| 833 | } |
| 834 | |
/// Convert "conditional branch to a block containing a single direct tail
/// call" into a conditional tail call on the branch itself, making the
/// intermediate block removable. Returns 1 if any conditional tail call was
/// created in \p BF, 0 otherwise (note: callers treat the result as bool).
uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
  // Need updated indices to correctly detect branch' direction.
  BF.getLayout().updateLayoutIndices();
  BF.markUnreachableBlocks();

  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  MCContext *Ctx = BF.getBinaryContext().Ctx.get();
  // Per-function statistics; folded into the pass-wide counters at the end.
  uint64_t NumLocalCTCCandidates = 0;
  uint64_t NumLocalCTCs = 0;
  uint64_t LocalCTCTakenCount = 0;
  uint64_t LocalCTCExecCount = 0;
  // Predecessors whose branch condition was reversed below; their
  // fall-through edge must be patched with an explicit jump afterwards.
  std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
      NeedsUncondBranch;

  // Will block be deleted by UCE?
  auto isValid = [](const BinaryBasicBlock *BB) {
    return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
  };

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Locate BB with a single direct tail-call instruction.
    if (BB->getNumNonPseudos() != 1)
      continue;

    MCInst *Instr = BB->getFirstNonPseudoInstr();
    if (!MIB->isTailCall(Inst: *Instr) || MIB->isConditionalBranch(Inst: *Instr))
      continue;

    const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(Inst: *Instr);
    if (!CalleeSymbol)
      continue;

    // Detect direction of the possible conditional tail call.
    const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);

    // Iterate through all predecessors.
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(Condition: true);
      if (!CondSucc)
        continue;

      ++NumLocalCTCCandidates;

      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

      // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
      if (!Result) {
        LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n" ;
                   PredBB->dump());
        continue;
      }

      // NOTE(review): Result is necessarily true after the early-continue
      // above, so the first assert below cannot fire.
      assert(Result && "internal error analyzing conditional branch" );
      assert(CondBranch && "conditional branch expected" );

      // Skip dynamic branches for now.
      if (BF.getBinaryContext().MIB->isDynamicBranch(Inst: *CondBranch))
        continue;

      // It's possible that PredBB is also a successor to BB that may have
      // been processed by a previous iteration of the SCTC loop, in which
      // case it may have been marked invalid. We should skip rewriting in
      // this case.
      if (!PredBB->isValid()) {
        assert(PredBB->isSuccessor(BB) &&
               "PredBB should be valid if it is not a successor to BB" );
        continue;
      }

      // We don't want to reverse direction of the branch in new order
      // without further profile analysis.
      const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
      if (!shouldRewriteBranch(PredBB, CondBranch: *CondBranch, BB, DirectionFlag))
        continue;

      // Record this block so that we don't try to optimize it twice.
      BeenOptimized.insert(x: PredBB);

      uint64_t Count = 0;
      if (CondSucc != BB) {
        // BB is reached via the fall-through edge: reverse the condition so
        // the tail call sits on the taken path.
        // Patch the new target address into the conditional branch.
        MIB->reverseBranchCondition(Inst&: *CondBranch, TBB: CalleeSymbol, Ctx);
        // Since we reversed the condition on the branch we need to change
        // the target for the unconditional branch or add a unconditional
        // branch to the old target. This has to be done manually since
        // fixupBranches is not called after SCTC.
        NeedsUncondBranch.emplace_back(args&: PredBB, args&: CondSucc);
        Count = PredBB->getFallthroughBranchInfo().Count;
      } else {
        // Change destination of the conditional branch.
        MIB->replaceBranchTarget(Inst&: *CondBranch, TBB: CalleeSymbol, Ctx);
        Count = PredBB->getTakenBranchInfo().Count;
      }
      const uint64_t CTCTakenFreq =
          Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;

      // Annotate it, so "isCall" returns true for this jcc
      MIB->setConditionalTailCall(Inst&: *CondBranch);
      // Add info about the conditional tail call frequency, otherwise this
      // info will be lost when we delete the associated BranchInfo entry
      auto &CTCAnnotation =
          MIB->getOrCreateAnnotationAs<uint64_t>(Inst&: *CondBranch, Name: "CTCTakenCount" );
      CTCAnnotation = CTCTakenFreq;
      // Preserve Offset annotation, used in BAT.
      // Instr is a direct tail call instruction that was created when CTCs are
      // first expanded, and has the original CTC offset set.
      if (std::optional<uint32_t> Offset = MIB->getOffset(Inst: *Instr))
        MIB->setOffset(Inst&: *CondBranch, Offset: *Offset);

      // Remove the unused successor which may be eliminated later
      // if there are no other users.
      PredBB->removeSuccessor(Succ: BB);
      // Update BB execution count
      if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
        BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
      else if (CTCTakenFreq > BB->getKnownExecutionCount())
        BB->setExecutionCount(0);

      ++NumLocalCTCs;
      LocalCTCTakenCount += CTCTakenFreq;
      LocalCTCExecCount += PredBB->getKnownExecutionCount();
    }

    // Remove the block from CFG if all predecessors were removed.
    BB->markValid(Valid: isValid(BB));
  }

  // Add unconditional branches at the end of BBs to new successors
  // as long as the successor is not a fallthrough.
  for (auto &Entry : NeedsUncondBranch) {
    BinaryBasicBlock *PredBB = Entry.first;
    const BinaryBasicBlock *CondSucc = Entry.second;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // Find the next valid block. Invalid blocks will be deleted
    // so they shouldn't be considered fallthrough targets.
    const BinaryBasicBlock *NextBlock =
        BF.getLayout().getBasicBlockAfter(BB: PredBB, IgnoreSplits: false);
    while (NextBlock && !isValid(NextBlock))
      NextBlock = BF.getLayout().getBasicBlockAfter(BB: NextBlock, IgnoreSplits: false);

    // Get the unconditional successor to this block.
    const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
    assert(PredSucc && "The other branch should be a tail call" );

    const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);

    if (UncondBranch) {
      // An explicit jump already exists: drop it if the target is now the
      // layout fall-through, otherwise retarget it.
      if (HasFallthrough)
        PredBB->eraseInstruction(II: PredBB->findInstruction(Inst: UncondBranch));
      else
        MIB->replaceBranchTarget(Inst&: *UncondBranch, TBB: CondSucc->getLabel(), Ctx);
    } else if (!HasFallthrough) {
      MCInst Branch;
      MIB->createUncondBranch(Inst&: Branch, TBB: CondSucc->getLabel(), Ctx);
      PredBB->addInstruction(Inst: Branch);
    }
  }

  if (NumLocalCTCs > 0) {
    NumDoubleJumps += fixDoubleJumps(Function&: BF, MarkInvalid: true);
    // Clean-up unreachable tail-call blocks.
    const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
    DeletedBlocks += Stats.first;
    DeletedBytes += Stats.second;

    assert(BF.validateCFG());
  }

  LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
                    << " conditional tail calls from a total of "
                    << NumLocalCTCCandidates << " candidates in function " << BF
                    << ". CTCs execution count for this function is "
                    << LocalCTCExecCount << " and CTC taken count is "
                    << LocalCTCTakenCount << "\n" ;);

  // Fold per-function statistics into the pass-wide totals.
  NumTailCallsPatched += NumLocalCTCs;
  NumCandidateTailCalls += NumLocalCTCCandidates;
  CTCExecCount += LocalCTCExecCount;
  CTCTakenCount += LocalCTCTakenCount;

  return NumLocalCTCs > 0;
}
| 1027 | |
| 1028 | Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) { |
| 1029 | if (!BC.isX86()) |
| 1030 | return Error::success(); |
| 1031 | |
| 1032 | for (auto &It : BC.getBinaryFunctions()) { |
| 1033 | BinaryFunction &Function = It.second; |
| 1034 | |
| 1035 | if (!shouldOptimize(BF: Function)) |
| 1036 | continue; |
| 1037 | |
| 1038 | if (fixTailCalls(BF&: Function)) { |
| 1039 | Modified.insert(x: &Function); |
| 1040 | Function.setHasCanonicalCFG(false); |
| 1041 | } |
| 1042 | } |
| 1043 | |
| 1044 | if (NumTailCallsPatched) |
| 1045 | BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched |
| 1046 | << " tail calls (" << NumOrigForwardBranches << " forward)" |
| 1047 | << " tail calls (" << NumOrigBackwardBranches << " backward)" |
| 1048 | << " from a total of " << NumCandidateTailCalls |
| 1049 | << " while removing " << NumDoubleJumps << " double jumps" |
| 1050 | << " and removing " << DeletedBlocks << " basic blocks" |
| 1051 | << " totalling " << DeletedBytes |
| 1052 | << " bytes of code. CTCs total execution count is " |
| 1053 | << CTCExecCount << " and the number of times CTCs are taken is " |
| 1054 | << CTCTakenCount << "\n" ; |
| 1055 | return Error::success(); |
| 1056 | } |
| 1057 | |
| 1058 | uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) { |
| 1059 | uint64_t Count = 0; |
| 1060 | const BinaryContext &BC = Function.getBinaryContext(); |
| 1061 | for (BinaryBasicBlock &BB : Function) { |
| 1062 | for (MCInst &Inst : BB) { |
| 1063 | // Skip shortening instructions with Size annotation. |
| 1064 | if (BC.MIB->getSize(Inst)) |
| 1065 | continue; |
| 1066 | |
| 1067 | MCInst OriginalInst; |
| 1068 | if (opts::Verbosity > 2) |
| 1069 | OriginalInst = Inst; |
| 1070 | |
| 1071 | if (!BC.MIB->shortenInstruction(Inst, STI: *BC.STI)) |
| 1072 | continue; |
| 1073 | |
| 1074 | if (opts::Verbosity > 2) { |
| 1075 | BC.scopeLock(); |
| 1076 | BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO: " ; |
| 1077 | BC.printInstruction(OS&: BC.outs(), Instruction: OriginalInst, Offset: 0, Function: &Function); |
| 1078 | BC.outs() << "BOLT-INFO: to:" ; |
| 1079 | BC.printInstruction(OS&: BC.outs(), Instruction: Inst, Offset: 0, Function: &Function); |
| 1080 | } |
| 1081 | |
| 1082 | ++Count; |
| 1083 | } |
| 1084 | } |
| 1085 | |
| 1086 | return Count; |
| 1087 | } |
| 1088 | |
| 1089 | Error ShortenInstructions::runOnFunctions(BinaryContext &BC) { |
| 1090 | std::atomic<uint64_t> NumShortened{0}; |
| 1091 | if (!BC.isX86()) |
| 1092 | return Error::success(); |
| 1093 | |
| 1094 | ParallelUtilities::runOnEachFunction( |
| 1095 | BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, |
| 1096 | WorkFunction: [&](BinaryFunction &BF) { NumShortened += shortenInstructions(Function&: BF); }, |
| 1097 | SkipPredicate: nullptr, LogName: "ShortenInstructions" ); |
| 1098 | |
| 1099 | if (NumShortened) |
| 1100 | BC.outs() << "BOLT-INFO: " << NumShortened |
| 1101 | << " instructions were shortened\n" ; |
| 1102 | return Error::success(); |
| 1103 | } |
| 1104 | |
| 1105 | void Peepholes::addTailcallTraps(BinaryFunction &Function) { |
| 1106 | MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get(); |
| 1107 | for (BinaryBasicBlock &BB : Function) { |
| 1108 | MCInst *Inst = BB.getLastNonPseudoInstr(); |
| 1109 | if (Inst && MIB->isTailCall(Inst: *Inst) && MIB->isIndirectBranch(Inst: *Inst)) { |
| 1110 | MCInst Trap; |
| 1111 | MIB->createTrap(Inst&: Trap); |
| 1112 | BB.addInstruction(Inst: Trap); |
| 1113 | ++TailCallTraps; |
| 1114 | } |
| 1115 | } |
| 1116 | } |
| 1117 | |
| 1118 | void Peepholes::removeUselessCondBranches(BinaryFunction &Function) { |
| 1119 | for (BinaryBasicBlock &BB : Function) { |
| 1120 | if (BB.succ_size() != 2) |
| 1121 | continue; |
| 1122 | |
| 1123 | BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(Condition: true); |
| 1124 | BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(Condition: false); |
| 1125 | if (CondBB != UncondBB) |
| 1126 | continue; |
| 1127 | |
| 1128 | const MCSymbol *TBB = nullptr; |
| 1129 | const MCSymbol *FBB = nullptr; |
| 1130 | MCInst *CondBranch = nullptr; |
| 1131 | MCInst *UncondBranch = nullptr; |
| 1132 | bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); |
| 1133 | |
| 1134 | // analyzeBranch() can fail due to unusual branch instructions, |
| 1135 | // e.g. jrcxz, or jump tables (indirect jump). |
| 1136 | if (!Result || !CondBranch) |
| 1137 | continue; |
| 1138 | |
| 1139 | BB.removeDuplicateConditionalSuccessor(CondBranch); |
| 1140 | ++NumUselessCondBranches; |
| 1141 | } |
| 1142 | } |
| 1143 | |
| 1144 | Error Peepholes::runOnFunctions(BinaryContext &BC) { |
| 1145 | const char Opts = |
| 1146 | std::accumulate(first: opts::Peepholes.begin(), last: opts::Peepholes.end(), init: 0, |
| 1147 | binary_op: [](const char A, const PeepholeOpts B) { return A | B; }); |
| 1148 | if (Opts == PEEP_NONE) |
| 1149 | return Error::success(); |
| 1150 | |
| 1151 | for (auto &It : BC.getBinaryFunctions()) { |
| 1152 | BinaryFunction &Function = It.second; |
| 1153 | if (shouldOptimize(BF: Function)) { |
| 1154 | if (Opts & PEEP_DOUBLE_JUMPS) |
| 1155 | NumDoubleJumps += fixDoubleJumps(Function, MarkInvalid: false); |
| 1156 | if (Opts & PEEP_TAILCALL_TRAPS) |
| 1157 | addTailcallTraps(Function); |
| 1158 | if (Opts & PEEP_USELESS_BRANCHES) |
| 1159 | removeUselessCondBranches(Function); |
| 1160 | assert(Function.validateCFG()); |
| 1161 | } |
| 1162 | } |
| 1163 | BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps |
| 1164 | << " double jumps patched.\n" |
| 1165 | << "BOLT-INFO: Peephole: " << TailCallTraps |
| 1166 | << " tail call traps inserted.\n" |
| 1167 | << "BOLT-INFO: Peephole: " << NumUselessCondBranches |
| 1168 | << " useless conditional branches removed.\n" ; |
| 1169 | return Error::success(); |
| 1170 | } |
| 1171 | |
/// Replace loads whose address can be computed statically and points into a
/// read-only, relocation-free section with an immediate operand holding the
/// loaded constant. Returns true if any load in \p BF was simplified.
bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  // Per-function counters; merged into the pass-wide totals at the end.
  // "Dynamic" counters weight each load by its block's execution count.
  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address;
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement" );
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement" );

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(args&: DisplSymbol, args&: DisplOffset) =
            MIB->getTargetSymbolInfo(Expr: DispOpI->getExpr());

        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(Name: DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, Target&: TargetAddress)) {
        // Neither PC-relative nor statically evaluatable: leave the load.
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(Address: TargetAddress);
      if (!DataSection || DataSection->isWritable())
        continue;

      // A (dynamic) relocation means the bytes may change at link/load time,
      // so the value cannot be baked into an immediate.
      if (BC.getRelocationAt(Address: TargetAddress) ||
          BC.getDynamicRelocationAt(Address: TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      // The rewrite itself may still fail (e.g. unsupported operand form).
      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}
| 1253 | |
| 1254 | Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) { |
| 1255 | for (auto &It : BC.getBinaryFunctions()) { |
| 1256 | BinaryFunction &Function = It.second; |
| 1257 | if (shouldOptimize(BF: Function) && simplifyRODataLoads(BF&: Function)) |
| 1258 | Modified.insert(x: &Function); |
| 1259 | } |
| 1260 | |
| 1261 | BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " |
| 1262 | << NumLoadsFound << " loads from a statically computed address.\n" |
| 1263 | << "BOLT-INFO: dynamic loads simplified: " |
| 1264 | << NumDynamicLoadsSimplified << "\n" |
| 1265 | << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound |
| 1266 | << "\n" ; |
| 1267 | return Error::success(); |
| 1268 | } |
| 1269 | |
| 1270 | Error AssignSections::runOnFunctions(BinaryContext &BC) { |
| 1271 | for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) { |
| 1272 | if (!Function->isPatch()) { |
| 1273 | Function->setCodeSectionName(BC.getInjectedCodeSectionName()); |
| 1274 | Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName()); |
| 1275 | } |
| 1276 | } |
| 1277 | |
| 1278 | // In non-relocation mode functions have pre-assigned section names. |
| 1279 | if (!BC.HasRelocations) |
| 1280 | return Error::success(); |
| 1281 | |
| 1282 | const bool UseColdSection = |
| 1283 | BC.NumProfiledFuncs > 0 || |
| 1284 | opts::ReorderFunctions == ReorderFunctions::RT_USER; |
| 1285 | for (auto &BFI : BC.getBinaryFunctions()) { |
| 1286 | BinaryFunction &Function = BFI.second; |
| 1287 | if (opts::isHotTextMover(Function)) { |
| 1288 | Function.setCodeSectionName(BC.getHotTextMoverSectionName()); |
| 1289 | Function.setColdCodeSectionName(BC.getHotTextMoverSectionName()); |
| 1290 | continue; |
| 1291 | } |
| 1292 | |
| 1293 | if (!UseColdSection || Function.hasValidIndex()) |
| 1294 | Function.setCodeSectionName(BC.getMainCodeSectionName()); |
| 1295 | else |
| 1296 | Function.setCodeSectionName(BC.getColdCodeSectionName()); |
| 1297 | |
| 1298 | if (Function.isSplit()) |
| 1299 | Function.setColdCodeSectionName(BC.getColdCodeSectionName()); |
| 1300 | } |
| 1301 | return Error::success(); |
| 1302 | } |
| 1303 | |
/// Compute and print a "profile bias" score: the mean and standard deviation
/// of the relative flow imbalance (|outgoing - incoming| / incoming) over
/// basic blocks of all simple functions, plus the worst-biased function.
Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute mean
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    // First pass over blocks: accumulate per-block edge frequencies.
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      auto SuccBIIter = BB.branch_info_begin();
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        // Edges without profile data (or never taken) contribute nothing.
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute score for low frequency blocks, entry or exit blocks
      // (note: operator[] default-inserts 0 for blocks absent from the maps).
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      Mean += fabs(x: Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    // Track the function with the highest per-function average imbalance.
    double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    // Re-use the maps filled during the mean computation above.
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(x: fabs(x: Difference / IncomingMap[&BB]) - FlowImbalanceMean, y: 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(x: FlowImbalanceVar);
  }

  // Report to user
  BC.outs() << format(Fmt: "BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n" ,
                      Vals: (100.0 * FlowImbalanceMean), Vals: (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    BC.outs() << "Worst average bias observed in "
              << WorstBiasFunc->getPrintName() << "\n" ;
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
  return Error::success();
}
| 1397 | |
| 1398 | Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { |
| 1399 | uint64_t NumRegularFunctions = 0; |
| 1400 | uint64_t NumStaleProfileFunctions = 0; |
| 1401 | uint64_t NumAllStaleFunctions = 0; |
| 1402 | uint64_t NumInferredFunctions = 0; |
| 1403 | uint64_t NumNonSimpleProfiledFunctions = 0; |
| 1404 | uint64_t NumUnknownControlFlowFunctions = 0; |
| 1405 | uint64_t TotalSampleCount = 0; |
| 1406 | uint64_t StaleSampleCount = 0; |
| 1407 | uint64_t InferredSampleCount = 0; |
| 1408 | std::vector<const BinaryFunction *> ProfiledFunctions; |
| 1409 | std::vector<std::pair<double, uint64_t>> FuncDensityList; |
| 1410 | const char * = "BOLT-INFO: Functions with stale profile:\n" ; |
| 1411 | for (auto &BFI : BC.getBinaryFunctions()) { |
| 1412 | const BinaryFunction &Function = BFI.second; |
| 1413 | |
| 1414 | // Ignore PLT functions for stats. |
| 1415 | if (Function.isPLTFunction()) |
| 1416 | continue; |
| 1417 | |
| 1418 | // Adjustment for BAT mode: the profile for BOLT split fragments is combined |
| 1419 | // so only count the hot fragment. |
| 1420 | const uint64_t Address = Function.getAddress(); |
| 1421 | bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() && |
| 1422 | BAT && BAT->isBATFunction(Address) && |
| 1423 | !BAT->fetchParentAddress(Address); |
| 1424 | |
| 1425 | ++NumRegularFunctions; |
| 1426 | |
| 1427 | // In BOLTed binaries split functions are non-simple (due to non-relocation |
| 1428 | // mode), but the original function is known to be simple and we have a |
| 1429 | // valid profile for it. |
| 1430 | if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) { |
| 1431 | if (Function.hasProfile()) |
| 1432 | ++NumNonSimpleProfiledFunctions; |
| 1433 | continue; |
| 1434 | } |
| 1435 | |
| 1436 | if (Function.hasUnknownControlFlow()) { |
| 1437 | if (opts::PrintUnknownCFG) |
| 1438 | Function.dump(); |
| 1439 | else if (opts::PrintUnknown) |
| 1440 | BC.errs() << "function with unknown control flow: " << Function << '\n'; |
| 1441 | |
| 1442 | ++NumUnknownControlFlowFunctions; |
| 1443 | } |
| 1444 | |
| 1445 | if (!Function.hasProfile()) |
| 1446 | continue; |
| 1447 | |
| 1448 | uint64_t SampleCount = Function.getRawSampleCount(); |
| 1449 | TotalSampleCount += SampleCount; |
| 1450 | |
| 1451 | if (Function.hasValidProfile()) { |
| 1452 | ProfiledFunctions.push_back(x: &Function); |
| 1453 | if (Function.hasInferredProfile()) { |
| 1454 | ++NumInferredFunctions; |
| 1455 | InferredSampleCount += SampleCount; |
| 1456 | ++NumAllStaleFunctions; |
| 1457 | } |
| 1458 | } else { |
| 1459 | if (opts::ReportStaleFuncs) { |
| 1460 | BC.outs() << StaleFuncsHeader; |
| 1461 | StaleFuncsHeader = "" ; |
| 1462 | BC.outs() << " " << Function << '\n'; |
| 1463 | } |
| 1464 | ++NumStaleProfileFunctions; |
| 1465 | StaleSampleCount += SampleCount; |
| 1466 | ++NumAllStaleFunctions; |
| 1467 | } |
| 1468 | |
| 1469 | if (opts::ShowDensity) { |
| 1470 | uint64_t Size = Function.getSize(); |
| 1471 | // In case of BOLT split functions registered in BAT, executed traces are |
| 1472 | // automatically attributed to the main fragment. Add up function sizes |
| 1473 | // for all fragments. |
| 1474 | if (IsHotParentOfBOLTSplitFunction) |
| 1475 | for (const BinaryFunction *Fragment : Function.getFragments()) |
| 1476 | Size += Fragment->getSize(); |
| 1477 | double Density = (double)1.0 * Function.getSampleCountInBytes() / Size; |
| 1478 | FuncDensityList.emplace_back(args&: Density, args&: SampleCount); |
| 1479 | LLVM_DEBUG(BC.outs() << Function << ": executed bytes " |
| 1480 | << Function.getSampleCountInBytes() << ", size (b) " |
| 1481 | << Size << ", density " << Density |
| 1482 | << ", sample count " << SampleCount << '\n'); |
| 1483 | } |
| 1484 | } |
| 1485 | BC.NumProfiledFuncs = ProfiledFunctions.size(); |
| 1486 | BC.NumStaleProfileFuncs = NumStaleProfileFunctions; |
| 1487 | |
| 1488 | const size_t NumAllProfiledFunctions = |
| 1489 | ProfiledFunctions.size() + NumStaleProfileFunctions; |
| 1490 | BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of " |
| 1491 | << NumRegularFunctions << " functions in the binary (" |
| 1492 | << format(Fmt: "%.1f" , Vals: NumAllProfiledFunctions / |
| 1493 | (float)NumRegularFunctions * 100.0f) |
| 1494 | << "%) have non-empty execution profile\n" ; |
| 1495 | if (NumNonSimpleProfiledFunctions) { |
| 1496 | BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function" |
| 1497 | << (NumNonSimpleProfiledFunctions == 1 ? "" : "s" ) |
| 1498 | << " with profile could not be optimized\n" ; |
| 1499 | } |
| 1500 | if (NumAllStaleFunctions) { |
| 1501 | const float PctStale = |
| 1502 | NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f; |
| 1503 | const float PctStaleFuncsWithEqualBlockCount = |
| 1504 | (float)BC.Stats.NumStaleFuncsWithEqualBlockCount / |
| 1505 | NumAllStaleFunctions * 100.0f; |
| 1506 | const float PctStaleBlocksWithEqualIcount = |
| 1507 | (float)BC.Stats.NumStaleBlocksWithEqualIcount / |
| 1508 | BC.Stats.NumStaleBlocks * 100.0f; |
| 1509 | auto printErrorOrWarning = [&]() { |
| 1510 | if (PctStale > opts::StaleThreshold) |
| 1511 | BC.errs() << "BOLT-ERROR: " ; |
| 1512 | else |
| 1513 | BC.errs() << "BOLT-WARNING: " ; |
| 1514 | }; |
| 1515 | printErrorOrWarning(); |
| 1516 | BC.errs() << NumAllStaleFunctions |
| 1517 | << format(Fmt: " (%.1f%% of all profiled)" , Vals: PctStale) << " function" |
| 1518 | << (NumAllStaleFunctions == 1 ? "" : "s" ) |
| 1519 | << " have invalid (possibly stale) profile." |
| 1520 | " Use -report-stale to see the list.\n" ; |
| 1521 | if (TotalSampleCount > 0) { |
| 1522 | printErrorOrWarning(); |
| 1523 | BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of " |
| 1524 | << TotalSampleCount << " samples in the binary (" |
| 1525 | << format(Fmt: "%.1f" , |
| 1526 | Vals: ((100.0f * (StaleSampleCount + InferredSampleCount)) / |
| 1527 | TotalSampleCount)) |
| 1528 | << "%) belong to functions with invalid" |
| 1529 | " (possibly stale) profile.\n" ; |
| 1530 | } |
| 1531 | BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount |
| 1532 | << " stale function" |
| 1533 | << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s" ) |
| 1534 | << format(Fmt: " (%.1f%% of all stale)" , |
| 1535 | Vals: PctStaleFuncsWithEqualBlockCount) |
| 1536 | << " have matching block count.\n" ; |
| 1537 | BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount |
| 1538 | << " stale block" |
| 1539 | << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s" ) |
| 1540 | << format(Fmt: " (%.1f%% of all stale)" , Vals: PctStaleBlocksWithEqualIcount) |
| 1541 | << " have matching icount.\n" ; |
| 1542 | if (PctStale > opts::StaleThreshold) { |
| 1543 | return createFatalBOLTError( |
| 1544 | S: Twine("BOLT-ERROR: stale functions exceed specified threshold of " ) + |
| 1545 | Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n" )); |
| 1546 | } |
| 1547 | } |
| 1548 | if (NumInferredFunctions) { |
| 1549 | BC.outs() << format( |
| 1550 | Fmt: "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, " |
| 1551 | "%.2f%% of stale) functions responsible for %.2f%% samples" |
| 1552 | " (%zu out of %zu)\n" , |
| 1553 | Vals: NumInferredFunctions, |
| 1554 | Vals: 100.0 * NumInferredFunctions / NumAllProfiledFunctions, |
| 1555 | Vals: 100.0 * NumInferredFunctions / NumAllStaleFunctions, |
| 1556 | Vals: 100.0 * InferredSampleCount / TotalSampleCount, Vals: InferredSampleCount, |
| 1557 | Vals: TotalSampleCount); |
| 1558 | BC.outs() << format( |
| 1559 | Fmt: "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" |
| 1560 | " (%zu out of %zu stale) responsible for %.2f%% samples" |
| 1561 | " (%zu out of %zu stale)\n" , |
| 1562 | Vals: 100.0 * BC.Stats.NumExactMatchedBlocks / BC.Stats.NumStaleBlocks, |
| 1563 | Vals: BC.Stats.NumExactMatchedBlocks, Vals: BC.Stats.NumStaleBlocks, |
| 1564 | Vals: 100.0 * BC.Stats.ExactMatchedSampleCount / BC.Stats.StaleSampleCount, |
| 1565 | Vals: BC.Stats.ExactMatchedSampleCount, Vals: BC.Stats.StaleSampleCount); |
| 1566 | BC.outs() << format( |
| 1567 | Fmt: "BOLT-INFO: inference found an exact pseudo probe match for %.2f%% of " |
| 1568 | "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples" |
| 1569 | " (%zu out of %zu stale)\n" , |
| 1570 | Vals: 100.0 * BC.Stats.NumPseudoProbeExactMatchedBlocks / |
| 1571 | BC.Stats.NumStaleBlocks, |
| 1572 | Vals: BC.Stats.NumPseudoProbeExactMatchedBlocks, Vals: BC.Stats.NumStaleBlocks, |
| 1573 | Vals: 100.0 * BC.Stats.PseudoProbeExactMatchedSampleCount / |
| 1574 | BC.Stats.StaleSampleCount, |
| 1575 | Vals: BC.Stats.PseudoProbeExactMatchedSampleCount, Vals: BC.Stats.StaleSampleCount); |
| 1576 | BC.outs() << format( |
| 1577 | Fmt: "BOLT-INFO: inference found a loose pseudo probe match for %.2f%% of " |
| 1578 | "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples" |
| 1579 | " (%zu out of %zu stale)\n" , |
| 1580 | Vals: 100.0 * BC.Stats.NumPseudoProbeLooseMatchedBlocks / |
| 1581 | BC.Stats.NumStaleBlocks, |
| 1582 | Vals: BC.Stats.NumPseudoProbeLooseMatchedBlocks, Vals: BC.Stats.NumStaleBlocks, |
| 1583 | Vals: 100.0 * BC.Stats.PseudoProbeLooseMatchedSampleCount / |
| 1584 | BC.Stats.StaleSampleCount, |
| 1585 | Vals: BC.Stats.PseudoProbeLooseMatchedSampleCount, Vals: BC.Stats.StaleSampleCount); |
| 1586 | BC.outs() << format( |
| 1587 | Fmt: "BOLT-INFO: inference found a call match for %.2f%% of basic " |
| 1588 | "blocks" |
| 1589 | " (%zu out of %zu stale) responsible for %.2f%% samples" |
| 1590 | " (%zu out of %zu stale)\n" , |
| 1591 | Vals: 100.0 * BC.Stats.NumCallMatchedBlocks / BC.Stats.NumStaleBlocks, |
| 1592 | Vals: BC.Stats.NumCallMatchedBlocks, Vals: BC.Stats.NumStaleBlocks, |
| 1593 | Vals: 100.0 * BC.Stats.CallMatchedSampleCount / BC.Stats.StaleSampleCount, |
| 1594 | Vals: BC.Stats.CallMatchedSampleCount, Vals: BC.Stats.StaleSampleCount); |
| 1595 | BC.outs() << format( |
| 1596 | Fmt: "BOLT-INFO: inference found a loose match for %.2f%% of basic " |
| 1597 | "blocks" |
| 1598 | " (%zu out of %zu stale) responsible for %.2f%% samples" |
| 1599 | " (%zu out of %zu stale)\n" , |
| 1600 | Vals: 100.0 * BC.Stats.NumLooseMatchedBlocks / BC.Stats.NumStaleBlocks, |
| 1601 | Vals: BC.Stats.NumLooseMatchedBlocks, Vals: BC.Stats.NumStaleBlocks, |
| 1602 | Vals: 100.0 * BC.Stats.LooseMatchedSampleCount / BC.Stats.StaleSampleCount, |
| 1603 | Vals: BC.Stats.LooseMatchedSampleCount, Vals: BC.Stats.StaleSampleCount); |
| 1604 | } |
| 1605 | |
| 1606 | if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { |
| 1607 | BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects |
| 1608 | << " objects was ignored\n" ; |
| 1609 | } |
| 1610 | |
| 1611 | if (ProfiledFunctions.size() > 10) { |
| 1612 | if (opts::Verbosity >= 1) { |
| 1613 | BC.outs() << "BOLT-INFO: top called functions are:\n" ; |
| 1614 | llvm::sort(C&: ProfiledFunctions, |
| 1615 | Comp: [](const BinaryFunction *A, const BinaryFunction *B) { |
| 1616 | return B->getExecutionCount() < A->getExecutionCount(); |
| 1617 | }); |
| 1618 | auto SFI = ProfiledFunctions.begin(); |
| 1619 | auto SFIend = ProfiledFunctions.end(); |
| 1620 | for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend; |
| 1621 | ++SFI, ++I) |
| 1622 | BC.outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() |
| 1623 | << '\n'; |
| 1624 | } |
| 1625 | } |
| 1626 | |
| 1627 | if (!opts::PrintSortedBy.empty()) { |
| 1628 | std::vector<BinaryFunction *> Functions; |
| 1629 | std::map<const BinaryFunction *, DynoStats> Stats; |
| 1630 | |
| 1631 | for (auto &BFI : BC.getBinaryFunctions()) { |
| 1632 | BinaryFunction &BF = BFI.second; |
| 1633 | if (shouldOptimize(BF) && BF.hasValidProfile()) { |
| 1634 | Functions.push_back(x: &BF); |
| 1635 | Stats.emplace(args: &BF, args: getDynoStats(BF)); |
| 1636 | } |
| 1637 | } |
| 1638 | |
| 1639 | const bool SortAll = |
| 1640 | llvm::is_contained(Range&: opts::PrintSortedBy, Element: DynoStats::LAST_DYNO_STAT); |
| 1641 | |
| 1642 | const bool Ascending = |
| 1643 | opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; |
| 1644 | |
| 1645 | std::function<bool(const DynoStats &, const DynoStats &)> |
| 1646 | DynoStatsComparator = |
| 1647 | SortAll ? [](const DynoStats &StatsA, |
| 1648 | const DynoStats &StatsB) { return StatsA < StatsB; } |
| 1649 | : [](const DynoStats &StatsA, const DynoStats &StatsB) { |
| 1650 | return StatsA.lessThan(Other: StatsB, Keys: opts::PrintSortedBy); |
| 1651 | }; |
| 1652 | |
| 1653 | llvm::stable_sort(Range&: Functions, |
| 1654 | C: [Ascending, &Stats, DynoStatsComparator]( |
| 1655 | const BinaryFunction *A, const BinaryFunction *B) { |
| 1656 | auto StatsItr = Stats.find(x: A); |
| 1657 | assert(StatsItr != Stats.end()); |
| 1658 | const DynoStats &StatsA = StatsItr->second; |
| 1659 | |
| 1660 | StatsItr = Stats.find(x: B); |
| 1661 | assert(StatsItr != Stats.end()); |
| 1662 | const DynoStats &StatsB = StatsItr->second; |
| 1663 | |
| 1664 | return Ascending ? DynoStatsComparator(StatsA, StatsB) |
| 1665 | : DynoStatsComparator(StatsB, StatsA); |
| 1666 | }); |
| 1667 | |
| 1668 | BC.outs() << "BOLT-INFO: top functions sorted by " ; |
| 1669 | if (SortAll) { |
| 1670 | BC.outs() << "dyno stats" ; |
| 1671 | } else { |
| 1672 | BC.outs() << "(" ; |
| 1673 | bool PrintComma = false; |
| 1674 | for (const DynoStats::Category Category : opts::PrintSortedBy) { |
| 1675 | if (PrintComma) |
| 1676 | BC.outs() << ", " ; |
| 1677 | BC.outs() << DynoStats::Description(C: Category); |
| 1678 | PrintComma = true; |
| 1679 | } |
| 1680 | BC.outs() << ")" ; |
| 1681 | } |
| 1682 | |
| 1683 | BC.outs() << " are:\n" ; |
| 1684 | auto SFI = Functions.begin(); |
| 1685 | for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) { |
| 1686 | const DynoStats Stats = getDynoStats(BF&: **SFI); |
| 1687 | BC.outs() << " " << **SFI; |
| 1688 | if (!SortAll) { |
| 1689 | BC.outs() << " (" ; |
| 1690 | bool PrintComma = false; |
| 1691 | for (const DynoStats::Category Category : opts::PrintSortedBy) { |
| 1692 | if (PrintComma) |
| 1693 | BC.outs() << ", " ; |
| 1694 | BC.outs() << dynoStatsOptName(C: Category) << "=" << Stats[Category]; |
| 1695 | PrintComma = true; |
| 1696 | } |
| 1697 | BC.outs() << ")" ; |
| 1698 | } |
| 1699 | BC.outs() << "\n" ; |
| 1700 | } |
| 1701 | } |
| 1702 | |
| 1703 | if (!BC.TrappedFunctions.empty()) { |
| 1704 | BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function" |
| 1705 | << (BC.TrappedFunctions.size() > 1 ? "s" : "" ) |
| 1706 | << " will trap on entry. Use -trap-avx512=0 to disable" |
| 1707 | " traps." ; |
| 1708 | if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) { |
| 1709 | BC.errs() << '\n'; |
| 1710 | for (const BinaryFunction *Function : BC.TrappedFunctions) |
| 1711 | BC.errs() << " " << *Function << '\n'; |
| 1712 | } else { |
| 1713 | BC.errs() << " Use -v=1 to see the list.\n" ; |
| 1714 | } |
| 1715 | } |
| 1716 | |
| 1717 | // Collect and print information about suboptimal code layout on input. |
| 1718 | if (opts::ReportBadLayout) { |
| 1719 | std::vector<BinaryFunction *> SuboptimalFuncs; |
| 1720 | for (auto &BFI : BC.getBinaryFunctions()) { |
| 1721 | BinaryFunction &BF = BFI.second; |
| 1722 | if (!BF.hasValidProfile()) |
| 1723 | continue; |
| 1724 | |
| 1725 | const uint64_t HotThreshold = |
| 1726 | std::max<uint64_t>(a: BF.getKnownExecutionCount(), b: 1); |
| 1727 | bool HotSeen = false; |
| 1728 | for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) { |
| 1729 | if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) { |
| 1730 | HotSeen = true; |
| 1731 | continue; |
| 1732 | } |
| 1733 | if (HotSeen && BB->getKnownExecutionCount() == 0) { |
| 1734 | SuboptimalFuncs.push_back(x: &BF); |
| 1735 | break; |
| 1736 | } |
| 1737 | } |
| 1738 | } |
| 1739 | |
| 1740 | if (!SuboptimalFuncs.empty()) { |
| 1741 | llvm::sort(C&: SuboptimalFuncs, |
| 1742 | Comp: [](const BinaryFunction *A, const BinaryFunction *B) { |
| 1743 | return A->getKnownExecutionCount() / A->getSize() > |
| 1744 | B->getKnownExecutionCount() / B->getSize(); |
| 1745 | }); |
| 1746 | |
| 1747 | BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size() |
| 1748 | << " functions have " |
| 1749 | "cold code in the middle of hot code. Top functions are:\n" ; |
| 1750 | for (unsigned I = 0; |
| 1751 | I < std::min(a: static_cast<size_t>(opts::ReportBadLayout), |
| 1752 | b: SuboptimalFuncs.size()); |
| 1753 | ++I) |
| 1754 | SuboptimalFuncs[I]->print(OS&: BC.outs()); |
| 1755 | } |
| 1756 | } |
| 1757 | |
| 1758 | if (NumUnknownControlFlowFunctions) { |
| 1759 | BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions |
| 1760 | << " functions have instructions with unknown control flow" ; |
| 1761 | if (!opts::PrintUnknown) |
| 1762 | BC.outs() << ". Use -print-unknown to see the list." ; |
| 1763 | BC.outs() << '\n'; |
| 1764 | } |
| 1765 | |
| 1766 | if (opts::ShowDensity) { |
| 1767 | double Density = 0.0; |
| 1768 | // Sorted by the density in descending order. |
| 1769 | llvm::stable_sort(Range&: FuncDensityList, |
| 1770 | C: [&](const std::pair<double, uint64_t> &A, |
| 1771 | const std::pair<double, uint64_t> &B) { |
| 1772 | if (A.first != B.first) |
| 1773 | return A.first > B.first; |
| 1774 | return A.second < B.second; |
| 1775 | }); |
| 1776 | |
| 1777 | uint64_t AccumulatedSamples = 0; |
| 1778 | uint32_t I = 0; |
| 1779 | assert(opts::ProfileDensityCutOffHot <= 1000000 && |
| 1780 | "The cutoff value is greater than 1000000(100%)" ); |
| 1781 | while (AccumulatedSamples < |
| 1782 | TotalSampleCount * |
| 1783 | static_cast<float>(opts::ProfileDensityCutOffHot) / |
| 1784 | 1000000 && |
| 1785 | I < FuncDensityList.size()) { |
| 1786 | AccumulatedSamples += FuncDensityList[I].second; |
| 1787 | Density = FuncDensityList[I].first; |
| 1788 | I++; |
| 1789 | } |
| 1790 | if (Density == 0.0) { |
| 1791 | BC.errs() << "BOLT-WARNING: the output profile is empty or the " |
| 1792 | "--profile-density-cutoff-hot option is " |
| 1793 | "set too low. Please check your command.\n" ; |
| 1794 | } else if (Density < opts::ProfileDensityThreshold) { |
| 1795 | BC.errs() |
| 1796 | << "BOLT-WARNING: BOLT is estimated to optimize better with " |
| 1797 | << format(Fmt: "%.1f" , Vals: opts::ProfileDensityThreshold / Density) |
| 1798 | << "x more samples. Please consider increasing sampling rate or " |
| 1799 | "profiling for longer duration to get more samples.\n" ; |
| 1800 | } |
| 1801 | |
| 1802 | BC.outs() << "BOLT-INFO: Functions with density >= " |
| 1803 | << format(Fmt: "%.1f" , Vals: Density) << " account for " |
| 1804 | << format(Fmt: "%.2f" , |
| 1805 | Vals: static_cast<double>(opts::ProfileDensityCutOffHot) / |
| 1806 | 10000) |
| 1807 | << "% total sample counts.\n" ; |
| 1808 | } |
| 1809 | return Error::success(); |
| 1810 | } |
| 1811 | |
| 1812 | Error InstructionLowering::runOnFunctions(BinaryContext &BC) { |
| 1813 | for (auto &BFI : BC.getBinaryFunctions()) |
| 1814 | for (BinaryBasicBlock &BB : BFI.second) |
| 1815 | for (MCInst &Instruction : BB) |
| 1816 | BC.MIB->lowerTailCall(Inst&: Instruction); |
| 1817 | return Error::success(); |
| 1818 | } |
| 1819 | |
| 1820 | Error StripRepRet::runOnFunctions(BinaryContext &BC) { |
| 1821 | if (!BC.isX86()) |
| 1822 | return Error::success(); |
| 1823 | |
| 1824 | uint64_t NumPrefixesRemoved = 0; |
| 1825 | uint64_t NumBytesSaved = 0; |
| 1826 | for (auto &BFI : BC.getBinaryFunctions()) { |
| 1827 | for (BinaryBasicBlock &BB : BFI.second) { |
| 1828 | auto LastInstRIter = BB.getLastNonPseudo(); |
| 1829 | if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(Inst: *LastInstRIter) || |
| 1830 | !BC.MIB->deleteREPPrefix(Inst&: *LastInstRIter)) |
| 1831 | continue; |
| 1832 | |
| 1833 | NumPrefixesRemoved += BB.getKnownExecutionCount(); |
| 1834 | ++NumBytesSaved; |
| 1835 | } |
| 1836 | } |
| 1837 | |
| 1838 | if (NumBytesSaved) |
| 1839 | BC.outs() << "BOLT-INFO: removed " << NumBytesSaved |
| 1840 | << " 'repz' prefixes" |
| 1841 | " with estimated execution count of " |
| 1842 | << NumPrefixesRemoved << " times.\n" ; |
| 1843 | return Error::success(); |
| 1844 | } |
| 1845 | |
/// Replace direct calls to memcpy()/memcpy@PLT/_memcpy8 with an inline
/// instruction sequence (x86 only) and report how many calls were inlined.
Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  // Static count of inlined call sites and their combined profile-based
  // (dynamic) execution count.
  uint64_t NumInlined = 0;
  uint64_t NumInlinedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      for (auto II = BB.begin(); II != BB.end(); ++II) {
        MCInst &Inst = *II;

        // Candidates are direct calls with a single symbolic target operand.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(i: 0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" &&
            CalleeSymbol->getName() != "_memcpy8" )
          continue;

        // IsMemcpy8 selects the "_memcpy8" flavor; it is forwarded as the
        // ReturnEnd argument of createInlineMemcpy() — presumably the variant
        // returns the end of the copied region rather than its start (TODO:
        // confirm against MCPlusBuilder::createInlineMemcpy).
        const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8" );
        const bool IsTailCall = BC.MIB->isTailCall(Inst);

        // Swap the call for the inline sequence. replaceInstruction() returns
        // an iterator to the first inserted instruction; advance to the last
        // one so the outer loop continues past the new code.
        const InstructionListType NewCode =
            BC.MIB->createInlineMemcpy(ReturnEnd: IsMemcpy8);
        II = BB.replaceInstruction(II, Replacement: NewCode);
        std::advance(i&: II, n: NewCode.size() - 1);
        // A tail call also acted as the function's return; materialize an
        // explicit return now that the call instruction is gone.
        if (IsTailCall) {
          MCInst Return;
          BC.MIB->createReturn(Inst&: Return);
          II = BB.insertInstruction(At: std::next(x: II), NewInst: std::move(Return));
        }

        ++NumInlined;
        NumInlinedDyno += BB.getKnownExecutionCount();
      }
    }
  }

  if (NumInlined) {
    BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls" ;
    if (NumInlinedDyno)
      BC.outs() << ". The calls were executed " << NumInlinedDyno
                << " times based on profile." ;
    BC.outs() << '\n';
  }
  return Error::success();
}
| 1895 | |
| 1896 | bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const { |
| 1897 | if (!BinaryFunctionPass::shouldOptimize(BF: Function)) |
| 1898 | return false; |
| 1899 | |
| 1900 | for (const std::string &FunctionSpec : Spec) { |
| 1901 | StringRef FunctionName = StringRef(FunctionSpec).split(Separator: ':').first; |
| 1902 | if (Function.hasNameRegex(NameRegex: FunctionName)) |
| 1903 | return true; |
| 1904 | } |
| 1905 | |
| 1906 | return false; |
| 1907 | } |
| 1908 | |
| 1909 | std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize( |
| 1910 | const BinaryFunction &Function) const { |
| 1911 | StringRef SitesString; |
| 1912 | for (const std::string &FunctionSpec : Spec) { |
| 1913 | StringRef FunctionName; |
| 1914 | std::tie(args&: FunctionName, args&: SitesString) = StringRef(FunctionSpec).split(Separator: ':'); |
| 1915 | if (Function.hasNameRegex(NameRegex: FunctionName)) |
| 1916 | break; |
| 1917 | SitesString = "" ; |
| 1918 | } |
| 1919 | |
| 1920 | std::set<size_t> Sites; |
| 1921 | SmallVector<StringRef, 4> SitesVec; |
| 1922 | SitesString.split(A&: SitesVec, Separator: ':'); |
| 1923 | for (StringRef SiteString : SitesVec) { |
| 1924 | if (SiteString.empty()) |
| 1925 | continue; |
| 1926 | size_t Result; |
| 1927 | if (!SiteString.getAsInteger(Radix: 10, Result)) |
| 1928 | Sites.emplace(args&: Result); |
| 1929 | } |
| 1930 | |
| 1931 | return Sites; |
| 1932 | } |
| 1933 | |
/// Specialize selected memcpy() call sites for size 1 (x86 only): each site
/// is rewritten into a compare of the size argument against 1, an inline
/// one-byte copy on the fast path, and the original call otherwise.
Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  // Static count of specialized sites and their combined profile-based
  // (dynamic) execution count.
  uint64_t NumSpecialized = 0;
  uint64_t NumSpecializedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    // An empty set means every memcpy() call site in the function qualifies.
    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(x: N);
    };

    // Snapshot the block list: the loop below splits blocks and appends new
    // ones, which would invalidate iteration over the live layout.
    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        // Candidates are direct calls with a single symbolic target operand.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(i: 0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" )
          continue;

        // Tail calls have no fall-through to rejoin; skip them.
        if (BC.MIB->isTailCall(Inst))
          continue;

        // Sites are numbered from 1 in instruction order, matching the IDs
        // given on the command line via getCallSitesToOptimize().
        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        // Split before the call: OneByteMemcpyBB starts with the call itself
        // and will become the inline one-byte fast path.
        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        // NextBB is the join point after the (specialized) copy. If the call
        // had trailing instructions in its block, split them off; otherwise
        // fall through to the existing successor. Either way, erase the
        // original call instruction from the fast-path block.
        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          NextBB = OneByteMemcpyBB->splitAt(II: OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(II: NextBB->begin());
        } else {
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(II: OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return" );
        }

        // Slow path: a new block holding the original memcpy() call, reached
        // when size != 1. The third integer argument register (ArgNo 2) holds
        // the size operand of memcpy(dest, src, size).
        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(RegNo: BC.MIB->getIntArgRegister(ArgNo: 2), Imm: 1,
                                Target: OneByteMemcpyBB->getLabel(), Ctx: BC.Ctx.get());
        CurBB->addInstructions(R: CmpJCC);
        CurBB->addSuccessor(Succ: MemcpyBB);

        MemcpyBB->addInstruction(Inst: std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(Succ: NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        // Fast path: materialize the inline one-byte copy.
        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(R: OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        // Resume scanning at the join block.
        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    BC.outs() << "BOLT-INFO: specialized " << NumSpecialized
              << " memcpy() call sites for size 1" ;
    if (NumSpecializedDyno)
      BC.outs() << ". The calls were executed " << NumSpecializedDyno
                << " times based on profile." ;
    BC.outs() << '\n';
  }
  return Error::success();
}
| 2030 | |
| 2031 | void RemoveNops::runOnFunction(BinaryFunction &BF) { |
| 2032 | const BinaryContext &BC = BF.getBinaryContext(); |
| 2033 | for (BinaryBasicBlock &BB : BF) { |
| 2034 | for (int64_t I = BB.size() - 1; I >= 0; --I) { |
| 2035 | MCInst &Inst = BB.getInstructionAtIndex(Index: I); |
| 2036 | if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, Name: "NOP" )) |
| 2037 | BB.eraseInstructionAtIndex(Index: I); |
| 2038 | } |
| 2039 | } |
| 2040 | } |
| 2041 | |
| 2042 | Error RemoveNops::runOnFunctions(BinaryContext &BC) { |
| 2043 | ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
| 2044 | runOnFunction(BF); |
| 2045 | }; |
| 2046 | |
| 2047 | ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { |
| 2048 | return BF.shouldPreserveNops(); |
| 2049 | }; |
| 2050 | |
| 2051 | ParallelUtilities::runOnEachFunction( |
| 2052 | BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFunction: WorkFun, |
| 2053 | SkipPredicate: SkipFunc, LogName: "RemoveNops" ); |
| 2054 | return Error::success(); |
| 2055 | } |
| 2056 | |
| 2057 | } // namespace bolt |
| 2058 | } // namespace llvm |
| 2059 | |