//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// BOLT often has to deal with profiles collected from binaries built several
// revisions behind the release. As a result, a certain percentage of functions
// is considered stale and is not optimized. This file implements the ability
// to match a profile to functions that are not 100% binary identical, thus
// increasing the optimization coverage and boosting the performance of
// applications.
//
// The algorithm consists of two phases: matching and inference:
// - At the matching phase, we try to "guess" as many block and jump counts from
// the stale profile as possible. To this end, the content of each basic block
// is hashed and stored in the (yaml) profile. When BOLT optimizes a binary,
// it computes block hashes and identifies the corresponding entries in the
// stale profile. It yields a partial profile for every CFG in the binary.
// - At the inference phase, we employ a network flow-based algorithm (profi) to
// reconstruct "realistic" block and jump counts from the partial profile
// generated at the first stage. In practice, we don't always produce a proper
// profile, but the majority (e.g., >90%) of CFGs get the correct counts.
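//
// Illustrative usage (not part of this file): stale matching is enabled with
// the -infer-stale-profile option defined below, e.g.,
//   llvm-bolt app -o app.bolt --data=profile.yaml --infer-stale-profile
// The exact set of flags depends on how the profile was collected.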
//
//===----------------------------------------------------------------------===//

#include "bolt/Core/HashUtilities.h"
#include "bolt/Profile/YAMLProfileReader.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/MC/MCPseudoProbe.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/xxhash.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"

#include <queue>

using namespace llvm;

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt-prof"

namespace opts {

extern cl::opt<bool> TimeRewrite;
extern cl::OptionCategory BoltOptCategory;

cl::opt<bool>
    InferStaleProfile("infer-stale-profile",
                      cl::desc("Infer counts from stale profile data."),
                      cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingMinMatchedBlock(
    "stale-matching-min-matched-block",
    cl::desc("Percentage threshold of matched basic blocks at which stale "
             "profile inference is executed."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingMaxFuncSize(
    "stale-matching-max-func-size",
    cl::desc("The maximum size of a function to consider for inference."),
    cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));

// Parameters of the profile inference algorithm. The default values are tuned
// on several benchmarks.
static cl::opt<bool> StaleMatchingEvenFlowDistribution(
    "stale-matching-even-flow-distribution",
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."),
    cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<bool> StaleMatchingRebalanceUnknown(
    "stale-matching-rebalance-unknown",
    cl::desc("Evenly re-distribute flow among unknown subgraphs."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<bool> StaleMatchingJoinIslands(
    "stale-matching-join-islands",
    cl::desc("Join isolated components having positive flow."), cl::init(true),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostBlockInc(
    "stale-matching-cost-block-inc",
    cl::desc("The cost of increasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostBlockDec(
    "stale-matching-cost-block-dec",
    cl::desc("The cost of decreasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpInc(
    "stale-matching-cost-jump-inc",
    cl::desc("The cost of increasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpDec(
    "stale-matching-cost-jump-dec",
    cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
    "stale-matching-cost-block-unknown-inc",
    cl::desc("The cost of increasing an unknown block count by one."),
    cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
    "stale-matching-cost-jump-unknown-inc",
    cl::desc("The cost of increasing an unknown jump count by one."),
    cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
    "stale-matching-cost-jump-unknown-ft-inc",
    cl::desc(
        "The cost of increasing an unknown fall-through jump count by one."),
    cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingWithPseudoProbes(
    "stale-matching-with-pseudo-probes",
    cl::desc("Turns on stale matching with block pseudo probes."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
namespace bolt {

/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (e.g., uint16_t or uint8_t).
struct BlendedBlockHash {
private:
  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;
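  // Layout of the combined 64-bit hash, from least to most significant bits:
  // Offset (16) | OpcodeHash (16) | InstrHash (16) | PredHash (8) | SuccHash (8)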

public:
  explicit BlendedBlockHash() {}

  explicit BlendedBlockHash(uint64_t Hash) {
    Offset = Bitfield::get<ValueOffset>(Hash);
    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
    InstrHash = Bitfield::get<ValueInstr>(Hash);
    PredHash = Bitfield::get<ValuePred>(Hash);
    SuccHash = Bitfield::get<ValueSucc>(Hash);
  }

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    uint64_t Hash = 0;
    Bitfield::set<ValueOffset>(Hash, Offset);
    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
    Bitfield::set<ValueInstr>(Hash, InstrHash);
    Bitfield::set<ValuePred>(Hash, PredHash);
    Bitfield::set<ValueSucc>(Hash, SuccHash);
    return Hash;
  }

  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar two blocks are. For identical basic blocks,
  /// the distance is zero.
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
    // Account for NeighborHash
    Dist += SuccHash == BBH.SuccHash ? 0 : 1;
    Dist += PredHash == BBH.PredHash ? 0 : 1;
    Dist <<= 16;
    // Account for InstrHash
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= 16;
    // Account for Offset
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// (Loose) Hashes of the predecessors of the basic block.
  uint8_t PredHash{0};
  /// (Loose) Hashes of the successors of the basic block.
  uint8_t SuccHash{0};
};

/// The object is used to identify and match basic blocks in a BinaryFunction
/// given their hashes computed on a binary built from several revisions behind
/// release.
class StaleMatcher {
public:
  /// Initialize stale matcher.
  void init(const std::vector<FlowBlock *> &Blocks,
            const std::vector<BlendedBlockHash> &Hashes,
            const std::vector<uint64_t> &CallHashes) {
    assert(Blocks.size() == Hashes.size() &&
           Hashes.size() == CallHashes.size() &&
           "incorrect matcher initialization");
    for (size_t I = 0; I < Blocks.size(); I++) {
      FlowBlock *Block = Blocks[I];
      uint16_t OpHash = Hashes[I].OpcodeHash;
      OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
      if (CallHashes[I])
        CallHashToBlocks[CallHashes[I]].push_back(
            std::make_pair(Hashes[I], Block));
    }
  }

  /// Creates a mapping from a pseudo probe to a flow block.
  void mapProbeToBB(const MCDecodedPseudoProbe *Probe, FlowBlock *Block) {
    BBPseudoProbeToBlock[Probe] = Block;
  }

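  /// How a profile basic block was matched to a binary basic block; used for
  /// reporting matching statistics.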
  enum MatchMethod : char {
    MATCH_EXACT = 0,
    MATCH_PROBE_EXACT,
    MATCH_PROBE_LOOSE,
    MATCH_OPCODE,
    MATCH_CALL,
    NO_MATCH
  };

  /// Find the most similar flow block for a profile block given blended hash.
  std::pair<const FlowBlock *, MatchMethod>
  matchBlockStrict(BlendedBlockHash BlendedHash) {
    const auto &[Block, ExactHash] = matchWithOpcodes(BlendedHash);
    if (Block && ExactHash)
      return {Block, MATCH_EXACT};
    return {nullptr, NO_MATCH};
  }

  /// Find the most similar flow block for a profile block given pseudo probes.
  std::pair<const FlowBlock *, MatchMethod> matchBlockProbe(
      const ArrayRef<yaml::bolt::PseudoProbeInfo> PseudoProbes,
      const YAMLProfileReader::InlineTreeNodeMapTy &InlineTreeNodeMap) {
    const auto &[ProbeBlock, ExactProbe] =
        matchWithPseudoProbes(PseudoProbes, InlineTreeNodeMap);
    if (ProbeBlock)
      return {ProbeBlock, ExactProbe ? MATCH_PROBE_EXACT : MATCH_PROBE_LOOSE};
    return {nullptr, NO_MATCH};
  }

  /// Find the most similar flow block for a profile block given its hashes.
  std::pair<const FlowBlock *, MatchMethod>
  matchBlockLoose(BlendedBlockHash BlendedHash, uint64_t CallHash) {
    if (const FlowBlock *CallBlock = matchWithCalls(BlendedHash, CallHash))
      return {CallBlock, MATCH_CALL};
    if (const FlowBlock *OpcodeBlock = matchWithOpcodes(BlendedHash).first)
      return {OpcodeBlock, MATCH_OPCODE};
    return {nullptr, NO_MATCH};
  }

  /// Returns true if the two basic blocks (in the binary and in the profile)
  /// corresponding to the given hashes are matched to each other with a high
  /// confidence.
  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
                                    BlendedBlockHash Hash2) {
    return Hash1.InstrHash == Hash2.InstrHash;
  }

private:
  using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
  std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
  std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;
  DenseMap<const MCDecodedPseudoProbe *, FlowBlock *> BBPseudoProbeToBlock;

  // Uses OpcodeHash to find the most similar block for a given hash.
  std::pair<const FlowBlock *, bool>
  matchWithOpcodes(BlendedBlockHash BlendedHash) const {
    auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
    if (BlockIt == OpHashToBlocks.end())
      return {nullptr, false};
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    BlendedBlockHash BestHash;
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.distance(BlendedHash);
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
        BestHash = Hash;
      }
    }
    return {BestBlock, isHighConfidenceMatch(BestHash, BlendedHash)};
  }

  // Uses CallHash to find the most similar block for a given hash.
  const FlowBlock *matchWithCalls(BlendedBlockHash BlendedHash,
                                  uint64_t CallHash) const {
    if (!CallHash)
      return nullptr;
    auto BlockIt = CallHashToBlocks.find(CallHash);
    if (BlockIt == CallHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.OpcodeHash > BlendedHash.OpcodeHash
                          ? Hash.OpcodeHash - BlendedHash.OpcodeHash
                          : BlendedHash.OpcodeHash - Hash.OpcodeHash;
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }

  /// Matches a profile block with a binary block based on pseudo probes.
  /// Returns the best matching block (or nullptr) and whether the match is
  /// unambiguous.
  std::pair<const FlowBlock *, bool> matchWithPseudoProbes(
      const ArrayRef<yaml::bolt::PseudoProbeInfo> BlockPseudoProbes,
      const YAMLProfileReader::InlineTreeNodeMapTy &InlineTreeNodeMap) const {

    if (!opts::StaleMatchingWithPseudoProbes)
      return {nullptr, false};

    DenseMap<const FlowBlock *, uint32_t> FlowBlockMatchCount;

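    // Resolve an (inline-tree node, probe index) pair from the profile to the
    // flow block containing the corresponding binary probe, if any.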
    auto matchProfileProbeToBlock = [&](uint32_t NodeId,
                                        uint64_t ProbeId) -> const FlowBlock * {
      const MCDecodedPseudoProbeInlineTree *BinaryNode =
          InlineTreeNodeMap.getInlineTreeNode(NodeId);
      if (!BinaryNode)
        return nullptr;
      const MCDecodedPseudoProbe Dummy(0, ProbeId, PseudoProbeType::Block, 0, 0,
                                       nullptr);
      ArrayRef<MCDecodedPseudoProbe> BinaryProbes = BinaryNode->getProbes();
      auto BinaryProbeIt = llvm::lower_bound(
          BinaryProbes, Dummy, [](const auto &LHS, const auto &RHS) {
            return LHS.getIndex() < RHS.getIndex();
          });
      if (BinaryProbeIt == BinaryNode->getProbes().end() ||
          BinaryProbeIt->getIndex() != ProbeId)
        return nullptr;
      auto It = BBPseudoProbeToBlock.find(&*BinaryProbeIt);
      if (It == BBPseudoProbeToBlock.end())
        return nullptr;
      return It->second;
    };

    auto matchPseudoProbeInfo = [&](const yaml::bolt::PseudoProbeInfo
                                        &ProfileProbe,
                                    uint32_t NodeId) {
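      // BlockMask is a 64-bit bitmask of block probes: bit I being set means
      // that the block probe with index I + 1 is present in the profile block.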
      for (uint64_t Index = 0; Index < 64; ++Index)
        if (ProfileProbe.BlockMask & 1ull << Index)
          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, Index + 1)];
      for (const auto &ProfileProbes :
           {ProfileProbe.BlockProbes, ProfileProbe.IndCallProbes,
            ProfileProbe.CallProbes})
        for (uint64_t ProfileProbe : ProfileProbes)
          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, ProfileProbe)];
    };

    for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) {
      if (!ProfileProbe.InlineTreeNodes.empty())
        for (uint32_t ProfileInlineTreeNode : ProfileProbe.InlineTreeNodes)
          matchPseudoProbeInfo(ProfileProbe, ProfileInlineTreeNode);
      else
        matchPseudoProbeInfo(ProfileProbe, ProfileProbe.InlineTreeIndex);
    }
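    // Pick the flow block with the largest number of matched probes. The match
    // is reported as exact only if all matched probes agree on a single block.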
    uint32_t BestMatchCount = 0;
    uint32_t TotalMatchCount = 0;
    const FlowBlock *BestMatchBlock = nullptr;
    for (const auto &[FlowBlock, Count] : FlowBlockMatchCount) {
      TotalMatchCount += Count;
      if (Count < BestMatchCount || (Count == BestMatchCount && BestMatchBlock))
        continue;
      BestMatchBlock = FlowBlock;
      BestMatchCount = Count;
    }
    return {BestMatchBlock, BestMatchCount == TotalMatchCount};
  }
};

void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
  if (size() == 0)
    return;

  assert(hasCFG() && "the function is expected to have CFG");

  std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
  std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
  // Initialize hash components.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    assert(BB->getIndex() == I && "incorrect block index");
    BlendedHashes[I].Offset = BB->getOffset();
    // Hashing complete instructions.
    std::string InstrHashStr = hashBlock(
        BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
    if (HashFunction == HashFunction::StdHash) {
      uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
    } else if (HashFunction == HashFunction::XXH3) {
      uint64_t InstrHash = llvm::xxh3_64bits(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)InstrHash;
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
    // Hashing opcodes.
    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
    if (HashFunction == HashFunction::StdHash) {
      OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
    } else if (HashFunction == HashFunction::XXH3) {
      OpcodeHashes[I] = llvm::xxh3_64bits(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)OpcodeHashes[I];
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
  }

  // Initialize neighbor hash.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    // Append hashes of successors.
    uint64_t Hash = 0;
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].SuccHash = (uint8_t)Hash;
    }

    // Append hashes of predecessors.
    Hash = 0;
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].PredHash = (uint8_t)Hash;
    }
  }

  // Assign hashes.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    BB->setHash(BlendedHashes[I].combine());
  }
}
// TODO: mediate the difference between flow function construction here in BOLT
// and in the compiler by splitting blocks with exception throwing calls at the
// call and adding the landing pad as the successor.
/// Create a wrapper flow function to use with the profile inference algorithm,
/// and initialize its jumps and metadata.
FlowFunction
createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
  FlowFunction Func;

  // Add a special "dummy" source so that there is always a unique entry point.
  FlowBlock EntryBlock;
  EntryBlock.Index = 0;
  Func.Blocks.push_back(EntryBlock);

  // Create FlowBlock for every basic block in the binary function.
  for (const BinaryBasicBlock *BB : BlockOrder) {
    Func.Blocks.emplace_back();
    FlowBlock &Block = Func.Blocks.back();
    Block.Index = Func.Blocks.size() - 1;
    (void)BB;
    assert(Block.Index == BB->getIndex() + 1 &&
           "incorrectly assigned basic block index");
  }

  // Add a special "dummy" sink block so there is always a unique sink.
  FlowBlock SinkBlock;
  SinkBlock.Index = Func.Blocks.size();
  Func.Blocks.push_back(SinkBlock);

  // Create FlowJump for each jump between basic blocks in the binary function.
  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
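  // InDegree tracks the number of incoming jumps per block; blocks left
  // without predecessors get an unlikely edge from the dummy source below.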
  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
    // Collect regular jumps
    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
    // TODO: set jump from exit block to landing pad to Unlikely.
    // If the block is an exit, add a dummy edge from it to the sink block.
    if (UniqueSuccs.empty()) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = Func.Blocks.size() - 1;
      InDegree[Jump.Target]++;
    }

    // Collect jumps to landing pads
    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
  }

  // Add dummy edges to the extra sources. If there are multiple entry blocks,
  // add an unlikely edge from 0 to the subsequent ones. Skips the sink block.
  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
  for (uint64_t I = 1; I < Func.Blocks.size() - 1; I++) {
    const BinaryBasicBlock *BB = BlockOrder[I - 1];
    if (BB->isEntryPoint() || InDegree[I] == 0) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = 0;
      Jump.Target = I;
      if (!BB->isEntryPoint())
        Jump.IsUnlikely = true;
    }
  }

  // Create necessary metadata for the flow function
  for (FlowJump &Jump : Func.Jumps) {
    assert(Jump.Source < Func.Blocks.size());
    Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump);
    assert(Jump.Target < Func.Blocks.size());
    Func.Blocks[Jump.Target].PredJumps.push_back(&Jump);
  }
  return Func;
}

/// Assign initial block/jump weights based on the stale profile data. The goal
/// is to extract as much information from the stale profile as possible. Here
/// we assume that each basic block is specified via a hash value computed from
/// its content and the hashes of the unchanged basic blocks stay the same
/// across different revisions of the binary. Blocks may also have pseudo probe
/// information in the profile and the binary which is used for matching.
/// Whenever there is a count in the profile with the hash corresponding to one
/// of the basic blocks in the binary, the count is "matched" to the block.
/// Similarly, if both the source and the target of a count in the profile are
/// matched to a jump in the binary, the count is recorded in CFG.
size_t matchWeights(
    BinaryContext &BC, const BinaryFunction::BasicBlockOrderType &BlockOrder,
    const yaml::bolt::BinaryFunctionProfile &YamlBF, FlowFunction &Func,
    HashFunction HashFunction, YAMLProfileReader::ProfileLookupMap &IdToYamlBF,
    const BinaryFunction &BF,
    const ArrayRef<YAMLProfileReader::ProbeMatchSpec> ProbeMatchSpecs) {

  assert(Func.Blocks.size() == BlockOrder.size() + 2);

  std::vector<uint64_t> CallHashes;
  std::vector<FlowBlock *> Blocks;
  std::vector<BlendedBlockHash> BlendedHashes;
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I];
    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");

    std::string CallHashStr = hashBlockCalls(BC, *BB);
    if (CallHashStr.empty()) {
      CallHashes.push_back(0);
    } else {
      if (HashFunction == HashFunction::StdHash)
        CallHashes.push_back(std::hash<std::string>{}(CallHashStr));
      else if (HashFunction == HashFunction::XXH3)
        CallHashes.push_back(llvm::xxh3_64bits(CallHashStr));
      else
        llvm_unreachable("Unhandled HashFunction");
    }

    Blocks.push_back(&Func.Blocks[I + 1]);
    BlendedBlockHash BlendedHash(BB->getHash());
    BlendedHashes.push_back(BlendedHash);
    LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                      << Twine::utohexstr(BB->getHash()) << "\n");
  }
  StaleMatcher Matcher;
  // Collects function pseudo probes for use in the StaleMatcher.
  if (opts::StaleMatchingWithPseudoProbes) {
    const MCPseudoProbeDecoder *Decoder = BC.getPseudoProbeDecoder();
    assert(Decoder &&
           "If pseudo probes are in use, pseudo probe decoder should exist");
    const AddressProbesMap &ProbeMap = Decoder->getAddress2ProbesMap();
    const uint64_t FuncAddr = BF.getAddress();
    for (const MCDecodedPseudoProbe &Probe :
         ProbeMap.find(FuncAddr, FuncAddr + BF.getSize()))
      if (const BinaryBasicBlock *BB =
              BF.getBasicBlockContainingOffset(Probe.getAddress() - FuncAddr))
        Matcher.mapProbeToBB(&Probe, Blocks[BB->getIndex()]);
  }
  Matcher.init(Blocks, BlendedHashes, CallHashes);

  using FlowBlockTy =
      std::pair<const FlowBlock *, const yaml::bolt::BinaryBasicBlockProfile *>;
  using ProfileBlockMatchMap = DenseMap<uint32_t, FlowBlockTy>;
  // Binary profile => block index => matched block + its block profile
  DenseMap<const yaml::bolt::BinaryFunctionProfile *, ProfileBlockMatchMap>
      MatchedBlocks;

  // Map of FlowBlock and matching method.
  DenseMap<const FlowBlock *, StaleMatcher::MatchMethod> MatchedFlowBlocks;

  auto addMatchedBlock =
      [&](std::pair<const FlowBlock *, StaleMatcher::MatchMethod> BlockMethod,
          const yaml::bolt::BinaryFunctionProfile &YamlBP,
          const yaml::bolt::BinaryBasicBlockProfile &YamlBB) {
        const auto &[MatchedBlock, Method] = BlockMethod;
        if (!MatchedBlock)
          return;
        // Don't override earlier matches
        if (MatchedFlowBlocks.contains(MatchedBlock))
          return;
        MatchedFlowBlocks.try_emplace(MatchedBlock, Method);
        MatchedBlocks[&YamlBP][YamlBB.Index] = {MatchedBlock, &YamlBB};
      };

  // Match blocks from the profile to the blocks in CFG by strict hash.
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    // Update matching stats.
    ++BC.Stats.NumStaleBlocks;
    BC.Stats.StaleSampleCount += YamlBB.ExecCount;

    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);
    addMatchedBlock(Matcher.matchBlockStrict(YamlHash), YamlBF, YamlBB);
  }
  // Match blocks from the profile to the blocks in CFG by pseudo probes.
  for (const auto &[InlineNodeMap, YamlBP] : ProbeMatchSpecs) {
    for (const yaml::bolt::BinaryBasicBlockProfile &BB : YamlBP.get().Blocks)
      if (!BB.PseudoProbes.empty())
        addMatchedBlock(Matcher.matchBlockProbe(BB.PseudoProbes, InlineNodeMap),
                        YamlBP, BB);
  }
  // Match blocks from the profile to the blocks in CFG with loose methods.
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);

    std::string CallHashStr = hashBlockCalls(IdToYamlBF, YamlBB);
    uint64_t CallHash = 0;
    if (!CallHashStr.empty()) {
      if (HashFunction == HashFunction::StdHash)
        CallHash = std::hash<std::string>{}(CallHashStr);
      else if (HashFunction == HashFunction::XXH3)
        CallHash = llvm::xxh3_64bits(CallHashStr);
      else
        llvm_unreachable("Unhandled HashFunction");
    }
    auto [MatchedBlock, Method] = Matcher.matchBlockLoose(YamlHash, CallHash);
    if (MatchedBlock == nullptr && YamlBB.Index == 0) {
      MatchedBlock = Blocks[0];
      // Report as loose match
      Method = StaleMatcher::MATCH_OPCODE;
    }
    if (!MatchedBlock) {
      LLVM_DEBUG(dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index
                        << ")" << " with hash " << Twine::utohexstr(YamlBB.Hash)
                        << "\n");
      continue;
    }
    addMatchedBlock({MatchedBlock, Method}, YamlBF, YamlBB);
  }

  // Match jumps from the profile to the jumps from CFG
  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
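  // OutWeight/InWeight accumulate the matched jump counts leaving/entering
  // each block; a block's weight is later set to the maximum of the two.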

  for (const auto &[YamlBF, MatchMap] : MatchedBlocks) {
    for (const auto &[YamlBBIdx, FlowBlockProfile] : MatchMap) {
      const auto &[MatchedBlock, YamlBB] = FlowBlockProfile;
      StaleMatcher::MatchMethod Method = MatchedFlowBlocks.lookup(MatchedBlock);
      BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
      LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBBIdx << ")"
                        << " with hash " << Twine::utohexstr(YamlBB->Hash)
                        << " to BB (index = " << MatchedBlock->Index - 1 << ")"
                        << " with hash " << Twine::utohexstr(BinHash.combine())
                        << "\n");
      (void)BinHash;
      uint64_t ExecCount = YamlBB->ExecCount;
      // Update matching stats accounting for the matched block.
      switch (Method) {
      case StaleMatcher::MATCH_EXACT:
        ++BC.Stats.NumExactMatchedBlocks;
        BC.Stats.ExactMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " exact match\n");
        break;
      case StaleMatcher::MATCH_PROBE_EXACT:
        ++BC.Stats.NumPseudoProbeExactMatchedBlocks;
        BC.Stats.PseudoProbeExactMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " exact pseudo probe match\n");
        break;
      case StaleMatcher::MATCH_PROBE_LOOSE:
        ++BC.Stats.NumPseudoProbeLooseMatchedBlocks;
        BC.Stats.PseudoProbeLooseMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " loose pseudo probe match\n");
        break;
      case StaleMatcher::MATCH_CALL:
        ++BC.Stats.NumCallMatchedBlocks;
        BC.Stats.CallMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " call match\n");
        break;
      case StaleMatcher::MATCH_OPCODE:
        ++BC.Stats.NumLooseMatchedBlocks;
        BC.Stats.LooseMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " loose match\n");
        break;
      case StaleMatcher::NO_MATCH:
        LLVM_DEBUG(dbgs() << " no match\n");
      }
    }

    for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF->Blocks) {
      for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
        if (YamlSI.Count == 0)
          continue;

        // Try to find the jump for a given (src, dst) pair from the profile and
        // assign the jump weight based on the profile count
        const uint64_t SrcIndex = YamlBB.Index;
        const uint64_t DstIndex = YamlSI.Index;

        const FlowBlock *MatchedSrcBlock = MatchMap.lookup(SrcIndex).first;
        const FlowBlock *MatchedDstBlock = MatchMap.lookup(DstIndex).first;

        if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
          // Find a jump between the two blocks
          FlowJump *Jump = nullptr;
          for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
            if (SuccJump->Target == MatchedDstBlock->Index) {
              Jump = SuccJump;
              break;
            }
          }
          // Assign the weight, if the corresponding jump is found
          if (Jump != nullptr) {
            Jump->Weight = YamlSI.Count;
            Jump->HasUnknownWeight = false;
          }
        }
        // Assign the weight for the src block, if it is found
        if (MatchedSrcBlock != nullptr)
          OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
        // Assign the weight for the dst block, if it is found
        if (MatchedDstBlock != nullptr)
          InWeight[MatchedDstBlock->Index] += YamlSI.Count;
      }
    }
  }

  // Assign block counts based on in-/out- jumps
  for (FlowBlock &Block : Func.Blocks) {
    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
      assert(Block.HasUnknownWeight && "unmatched block with a positive count");
      continue;
    }
    Block.HasUnknownWeight = false;
    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
  }

  return MatchedBlocks[&YamlBF].size();
}

/// The function finds all blocks that are unreachable from the entry block or
/// do not have a path to a function exit, and marks all such blocks 'cold' so
/// that profi does not send any flow to them.
void preprocessUnreachableBlocks(FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();

  // Start bfs from the source
  std::queue<uint64_t> Queue;
  std::vector<bool> VisitedEntry(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      Queue.push(I);
      VisitedEntry[I] = true;
      break;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
      const uint64_t Dst = Jump->Target;
      if (!VisitedEntry[Dst]) {
        Queue.push(Dst);
        VisitedEntry[Dst] = true;
      }
    }
  }

  // Start bfs from all sinks
  std::vector<bool> VisitedExit(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isExit() && VisitedEntry[I]) {
      Queue.push(I);
      VisitedExit[I] = true;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
      const uint64_t Dst = Jump->Source;
      if (!VisitedExit[Dst]) {
        Queue.push(Dst);
        VisitedExit[Dst] = true;
      }
    }
  }

  // Zero out the weights of blocks that are not on any entry-to-exit path so
  // that no flow is sent to them
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.Weight == 0)
      continue;
    if (!VisitedEntry[I] || !VisitedExit[I]) {
      Block.Weight = 0;
      Block.HasUnknownWeight = true;
      Block.IsUnlikely = true;
      for (FlowJump *Jump : Block.SuccJumps) {
        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
          Jump->Weight = 0;
          Jump->HasUnknownWeight = true;
          Jump->IsUnlikely = true;
        }
      }
    }
  }
}

/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
bool canApplyInference(const FlowFunction &Func,
                       const yaml::bolt::BinaryFunctionProfile &YamlBF,
                       const uint64_t &MatchedBlocks) {
  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
    return false;

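  // Integer-arithmetic form of: the percentage of matched profile blocks is
  // below the StaleMatchingMinMatchedBlock threshold.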
  if (MatchedBlocks * 100 <
      opts::StaleMatchingMinMatchedBlock * YamlBF.Blocks.size())
    return false;

  // Returns false if the artificial sink block has no predecessors meaning
  // there are no exit blocks.
  if (Func.Blocks[Func.Blocks.size() - 1].isEntry())
    return false;

  return true;
}

/// Apply the profile inference algorithm for a given flow function.
void applyInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
  Params.JoinIslands = opts::StaleMatchingJoinIslands;

  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;

  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;

  applyFlowInference(Params, Func);
}

/// Collect inferred counts from the flow function and update annotations in
/// the binary function.
void assignProfile(BinaryFunction &BF,
                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
                   FlowFunction &Func) {
  BinaryContext &BC = BF.getBinaryContext();

  assert(Func.Blocks.size() == BlockOrder.size() + 2);
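  // The flow function has two extra blocks: the artificial source (index 0)
  // and the artificial sink (last index) added by createFlowFunction().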
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    FlowBlock &Block = Func.Blocks[I + 1];
    BinaryBasicBlock *BB = BlockOrder[I];

    // Update block's count
    BB->setExecutionCount(Block.Flow);

    // Update jump counts: (i) clean existing counts and then (ii) set new ones
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *DstBB : BB->successors()) {
      (void)DstBB;
      BI->Count = 0;
      BI->MispredictedCount = 0;
      ++BI;
    }
    for (FlowJump *Jump : Block.SuccJumps) {
      if (Jump->IsUnlikely)
        continue;
      if (Jump->Flow == 0)
        continue;

      // Skips the artificial sink block.
      if (Jump->Target == Func.Blocks.size() - 1)
        continue;
      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
      // Check if the edge corresponds to a regular jump or a landing pad
      if (BB->getSuccessor(SuccBB.getLabel())) {
        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
        BI.Count += Jump->Flow;
      } else {
        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
          LP->setExecutionCount(Jump->Flow);
      }
    }

    // Update call-site annotations
    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
                                     uint64_t Count) {
      if (BC.MIB->hasAnnotation(Instr, Name))
        BC.MIB->removeAnnotation(Instr, Name);
      // Do not add zero-count annotations
      if (Count == 0)
        return;
      BC.MIB->addAnnotation(Instr, Name, Count);
    };

    for (MCInst &Instr : *BB) {
      // Ignore pseudo instructions
      if (BC.MIB->isPseudo(Instr))
        continue;
      // Ignore jump tables
      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
        continue;

      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
            Instr, "CallProfile");
        if (!ICSP.empty()) {
          // Try to evenly distribute the counts among the call sites
          const uint64_t TotalCount = Block.Flow;
          const uint64_t NumSites = ICSP.size();
          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
            IndirectCallProfile &CSP = ICSP[Idx];
            uint64_t CountPerSite = TotalCount / NumSites;
            // When counts cannot be exactly distributed, increase by 1 the
            // counts of the first (TotalCount % NumSites) call sites
            if (Idx < TotalCount % NumSites)
              CountPerSite++;
            CSP.Count = CountPerSite;
          }
        } else {
          ICSP.emplace_back(nullptr, Block.Flow, 0);
        }
      } else if (BC.MIB->getConditionalTailCall(Instr)) {
        // We don't know exactly the number of times the conditional tail call
        // is executed; conservatively, setting it to the count of the block
        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
      } else if (BC.MIB->isCall(Instr)) {
        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
      }
    }
  }

  // Update function's execution count and mark the function inferred.
  BF.setExecutionCount(Func.Blocks[0].Flow);
  BF.setHasInferredProfile(true);
}

bool YAMLProfileReader::inferStaleProfile(
    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF,
    const ArrayRef<ProbeMatchSpec> ProbeMatchSpecs) {

  NamedRegionTimer T("inferStaleProfile", "stale profile inference", "rewrite",
                     "Rewrite passes", opts::TimeRewrite);

  if (!BF.hasCFG())
    return false;

  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                    << "\"" << BF.getPrintName() << "\"\n");

  // Make sure that block hashes are up to date.
  BF.computeBlockHashes(YamlBP.Header.HashFunction);

  const BinaryFunction::BasicBlockOrderType BlockOrder(
      BF.getLayout().block_begin(), BF.getLayout().block_end());

  // Create a wrapper flow function to use with the profile inference algorithm.
  FlowFunction Func = createFlowFunction(BlockOrder);

  // Match as many block/jump counts from the stale profile as possible and
  // track the number of matched blocks.
  size_t MatchedBlocks =
      matchWeights(BF.getBinaryContext(), BlockOrder, YamlBF, Func,
                   YamlBP.Header.HashFunction, IdToYamLBF, BF, ProbeMatchSpecs);

  // Adjust the flow function by marking unreachable blocks Unlikely so that
  // they don't get any counts assigned.
  preprocessUnreachableBlocks(Func);

  // Check if profile inference can be applied for the instance.
  if (!canApplyInference(Func, YamlBF, MatchedBlocks))
    return false;

  // Apply the profile inference algorithm.
  applyInference(Func);

  // Collect inferred counts and update function annotations.
  assignProfile(BF, BlockOrder, Func);

  // As of now, we always mark the binary function as having a "correct"
  // profile. In the future, we may discard the results for instances with poor
  // inference metrics and keep such functions un-optimized.
  return true;
}

} // end namespace bolt
} // end namespace llvm
| 1050 | |