//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// BOLT often has to deal with profiles collected on binaries built from several
// revisions behind release. As a result, a certain percentage of functions is
// considered stale and not optimized. This file implements the ability to match
// a profile to functions that are not 100% binary identical, thus increasing
// the optimization coverage and boosting the performance of applications.
//
// The algorithm consists of two phases: matching and inference:
// - At the matching phase, we try to "guess" as many block and jump counts from
//   the stale profile as possible. To this end, the content of each basic block
//   is hashed and stored in the (yaml) profile. When BOLT optimizes a binary,
//   it computes block hashes and identifies the corresponding entries in the
//   stale profile. It yields a partial profile for every CFG in the binary.
// - At the inference phase, we employ a network flow-based algorithm (profi) to
//   reconstruct "realistic" block and jump counts from the partial profile
//   generated at the first stage. In practice, we don't always produce proper
//   profile data but the majority (e.g., >90%) of CFGs get the correct counts.
//
//===----------------------------------------------------------------------===//

#include "bolt/Core/HashUtilities.h"
#include "bolt/Profile/YAMLProfileReader.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/MC/MCPseudoProbe.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/xxhash.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"

#include <queue>

using namespace llvm;

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt-prof"

namespace opts {

extern cl::opt<bool> TimeRewrite;
extern cl::OptionCategory BoltOptCategory;

cl::opt<bool>
    InferStaleProfile("infer-stale-profile",
                      cl::desc("Infer counts from stale profile data."),
                      cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingMinMatchedBlock(
    "stale-matching-min-matched-block",
    cl::desc("Percentage threshold of matched basic blocks at which stale "
             "profile inference is executed."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingMaxFuncSize(
    "stale-matching-max-func-size",
    cl::desc("The maximum size of a function to consider for inference."),
    cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));

// Parameters of the profile inference algorithm. The default values are tuned
// on several benchmarks.
static cl::opt<bool> StaleMatchingEvenFlowDistribution(
    "stale-matching-even-flow-distribution",
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."),
    cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<bool> StaleMatchingRebalanceUnknown(
    "stale-matching-rebalance-unknown",
    cl::desc("Evenly re-distribute flow among unknown subgraphs."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<bool> StaleMatchingJoinIslands(
    "stale-matching-join-islands",
    cl::desc("Join isolated components having positive flow."), cl::init(true),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostBlockInc(
    "stale-matching-cost-block-inc",
    cl::desc("The cost of increasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostBlockDec(
    "stale-matching-cost-block-dec",
    cl::desc("The cost of decreasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpInc(
    "stale-matching-cost-jump-inc",
    cl::desc("The cost of increasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpDec(
    "stale-matching-cost-jump-dec",
    cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
    "stale-matching-cost-block-unknown-inc",
    cl::desc("The cost of increasing an unknown block count by one."),
    cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
    "stale-matching-cost-jump-unknown-inc",
    cl::desc("The cost of increasing an unknown jump count by one."),
    cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
    "stale-matching-cost-jump-unknown-ft-inc",
    cl::desc(
        "The cost of increasing an unknown fall-through jump count by one."),
    cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingWithPseudoProbes(
    "stale-matching-with-pseudo-probes",
    cl::desc("Turns on stale matching with block pseudo probes."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
namespace bolt {

/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (e.g., uint16_t or uint8_t).
struct BlendedBlockHash {
private:
  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;

public:
  explicit BlendedBlockHash() {}

  explicit BlendedBlockHash(uint64_t Hash) {
    Offset = Bitfield::get<ValueOffset>(Hash);
    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
    InstrHash = Bitfield::get<ValueInstr>(Hash);
    PredHash = Bitfield::get<ValuePred>(Hash);
    SuccHash = Bitfield::get<ValueSucc>(Hash);
  }

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    uint64_t Hash = 0;
    Bitfield::set<ValueOffset>(Hash, Offset);
    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
    Bitfield::set<ValueInstr>(Hash, InstrHash);
    Bitfield::set<ValuePred>(Hash, PredHash);
    Bitfield::set<ValueSucc>(Hash, SuccHash);
    return Hash;
  }

  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar two blocks are. For identical basic blocks,
  /// the distance is zero.
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
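    // The distance is lexicographic: mismatches in the predecessor/successor
    // hashes are the most significant, followed by a mismatch in InstrHash,
    // and finally by the delta between the block offsets.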
    // Account for NeighborHash
    Dist += SuccHash == BBH.SuccHash ? 0 : 1;
    Dist += PredHash == BBH.PredHash ? 0 : 1;
    Dist <<= 16;
    // Account for InstrHash
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= 16;
    // Account for Offset
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// (Loose) Hashes of the predecessors of the basic block.
  uint8_t PredHash{0};
  /// (Loose) Hashes of the successors of the basic block.
  uint8_t SuccHash{0};
};

/// The object is used to identify and match basic blocks in a BinaryFunction
/// given their hashes computed on a binary built from several revisions behind
/// release.
class StaleMatcher {
public:
  /// Initialize stale matcher.
  void init(const std::vector<FlowBlock *> &Blocks,
            const std::vector<BlendedBlockHash> &Hashes,
            const std::vector<uint64_t> &CallHashes) {
    assert(Blocks.size() == Hashes.size() &&
           Hashes.size() == CallHashes.size() &&
           "incorrect matcher initialization");
    for (size_t I = 0; I < Blocks.size(); I++) {
      FlowBlock *Block = Blocks[I];
      uint16_t OpHash = Hashes[I].OpcodeHash;
      OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
      if (CallHashes[I])
        CallHashToBlocks[CallHashes[I]].push_back(
            std::make_pair(Hashes[I], Block));
    }
  }

  /// Creates a mapping from a pseudo probe to a flow block.
  void mapProbeToBB(const MCDecodedPseudoProbe *Probe, FlowBlock *Block) {
    BBPseudoProbeToBlock[Probe] = Block;
  }

  enum MatchMethod : char {
    MATCH_EXACT = 0,
    MATCH_PROBE_EXACT,
    MATCH_PROBE_LOOSE,
    MATCH_OPCODE,
    MATCH_CALL,
    NO_MATCH
  };

  /// Find the most similar flow block for a profile block given blended hash.
  std::pair<const FlowBlock *, MatchMethod>
  matchBlockStrict(BlendedBlockHash BlendedHash) {
    const auto &[Block, ExactHash] = matchWithOpcodes(BlendedHash);
    if (Block && ExactHash)
      return {Block, MATCH_EXACT};
    return {nullptr, NO_MATCH};
  }

  /// Find the most similar flow block for a profile block given pseudo probes.
  std::pair<const FlowBlock *, MatchMethod> matchBlockProbe(
      const ArrayRef<yaml::bolt::PseudoProbeInfo> PseudoProbes,
      const YAMLProfileReader::InlineTreeNodeMapTy &InlineTreeNodeMap) {
    const auto &[ProbeBlock, ExactProbe] =
        matchWithPseudoProbes(PseudoProbes, InlineTreeNodeMap);
    if (ProbeBlock)
      return {ProbeBlock, ExactProbe ? MATCH_PROBE_EXACT : MATCH_PROBE_LOOSE};
    return {nullptr, NO_MATCH};
  }

  /// Find the most similar flow block for a profile block given its hashes.
  std::pair<const FlowBlock *, MatchMethod>
  matchBlockLoose(BlendedBlockHash BlendedHash, uint64_t CallHash) {
    if (const FlowBlock *CallBlock = matchWithCalls(BlendedHash, CallHash))
      return {CallBlock, MATCH_CALL};
    if (const FlowBlock *OpcodeBlock = matchWithOpcodes(BlendedHash).first)
      return {OpcodeBlock, MATCH_OPCODE};
    return {nullptr, NO_MATCH};
  }

  /// Returns true if the two basic blocks (in the binary and in the profile)
  /// corresponding to the given hashes are matched to each other with a high
  /// confidence.
  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
                                    BlendedBlockHash Hash2) {
    return Hash1.InstrHash == Hash2.InstrHash;
  }

private:
  using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
  std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
  std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;
  DenseMap<const MCDecodedPseudoProbe *, FlowBlock *> BBPseudoProbeToBlock;

  // Uses OpcodeHash to find the most similar block for a given hash.
  std::pair<const FlowBlock *, bool>
  matchWithOpcodes(BlendedBlockHash BlendedHash) const {
    auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
    if (BlockIt == OpHashToBlocks.end())
      return {nullptr, false};
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    BlendedBlockHash BestHash;
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.distance(BlendedHash);
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
        BestHash = Hash;
      }
    }
    return {BestBlock, isHighConfidenceMatch(BestHash, BlendedHash)};
  }

  // Uses CallHash to find the most similar block for a given hash.
  const FlowBlock *matchWithCalls(BlendedBlockHash BlendedHash,
                                  uint64_t CallHash) const {
    if (!CallHash)
      return nullptr;
    auto BlockIt = CallHashToBlocks.find(CallHash);
    if (BlockIt == CallHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.OpcodeHash > BlendedHash.OpcodeHash
                          ? Hash.OpcodeHash - BlendedHash.OpcodeHash
                          : BlendedHash.OpcodeHash - Hash.OpcodeHash;
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }

  /// Matches a profile block with a binary block based on pseudo probes.
  /// Returns the best matching block (or nullptr) and whether the match is
  /// unambiguous.
  std::pair<const FlowBlock *, bool> matchWithPseudoProbes(
      const ArrayRef<yaml::bolt::PseudoProbeInfo> BlockPseudoProbes,
      const YAMLProfileReader::InlineTreeNodeMapTy &InlineTreeNodeMap) const {

    if (!opts::StaleMatchingWithPseudoProbes)
      return {nullptr, false};

    DenseMap<const FlowBlock *, uint32_t> FlowBlockMatchCount;

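    // Given a profile inline tree node id and a probe id, find the binary
    // pseudo probe with the same index under the corresponding binary inline
    // tree node and return the flow block it was mapped to (if any).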
    auto matchProfileProbeToBlock = [&](uint32_t NodeId,
                                        uint64_t ProbeId) -> const FlowBlock * {
      const MCDecodedPseudoProbeInlineTree *BinaryNode =
          InlineTreeNodeMap.getInlineTreeNode(NodeId);
      if (!BinaryNode)
        return nullptr;
      const MCDecodedPseudoProbe Dummy(0, ProbeId, PseudoProbeType::Block, 0, 0,
                                       nullptr);
      ArrayRef<MCDecodedPseudoProbe> BinaryProbes = BinaryNode->getProbes();
      auto BinaryProbeIt = llvm::lower_bound(
          BinaryProbes, Dummy, [](const auto &LHS, const auto &RHS) {
            return LHS.getIndex() < RHS.getIndex();
          });
      if (BinaryProbeIt == BinaryNode->getProbes().end() ||
          BinaryProbeIt->getIndex() != ProbeId)
        return nullptr;
      auto It = BBPseudoProbeToBlock.find(&*BinaryProbeIt);
      if (It == BBPseudoProbeToBlock.end())
        return nullptr;
      return It->second;
    };

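    // For each probe listed in the profile entry, cast a "vote" for the flow
    // block its binary counterpart maps to (a null vote if the probe has no
    // binary counterpart).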
    auto matchPseudoProbeInfo = [&](const yaml::bolt::PseudoProbeInfo
                                        &ProfileProbe,
                                    uint32_t NodeId) {
      for (uint64_t Index = 0; Index < 64; ++Index)
        if (ProfileProbe.BlockMask & 1ull << Index)
          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, Index + 1)];
      for (const auto &ProfileProbes :
           {ProfileProbe.BlockProbes, ProfileProbe.IndCallProbes,
            ProfileProbe.CallProbes})
        for (uint64_t ProfileProbe : ProfileProbes)
          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, ProfileProbe)];
    };

    for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) {
      if (!ProfileProbe.InlineTreeNodes.empty())
        for (uint32_t ProfileInlineTreeNode : ProfileProbe.InlineTreeNodes)
          matchPseudoProbeInfo(ProfileProbe, ProfileInlineTreeNode);
      else
        matchPseudoProbeInfo(ProfileProbe, ProfileProbe.InlineTreeIndex);
    }
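    // Pick the block that received the most votes; the match is reported as
    // exact only if every vote went to that block.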
    uint32_t BestMatchCount = 0;
    uint32_t TotalMatchCount = 0;
    const FlowBlock *BestMatchBlock = nullptr;
    for (const auto &[FlowBlock, Count] : FlowBlockMatchCount) {
      TotalMatchCount += Count;
      if (Count < BestMatchCount || (Count == BestMatchCount && BestMatchBlock))
        continue;
      BestMatchBlock = FlowBlock;
      BestMatchCount = Count;
    }
    return {BestMatchBlock, BestMatchCount == TotalMatchCount};
  }
};

void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
  if (size() == 0)
    return;

  assert(hasCFG() && "the function is expected to have CFG");

  std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
  std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
  // Initialize hash components.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    assert(BB->getIndex() == I && "incorrect block index");
    BlendedHashes[I].Offset = BB->getOffset();
    // Hashing complete instructions.
    std::string InstrHashStr = hashBlock(
        BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
    if (HashFunction == HashFunction::StdHash) {
      uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
    } else if (HashFunction == HashFunction::XXH3) {
      uint64_t InstrHash = llvm::xxh3_64bits(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)InstrHash;
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
    // Hashing opcodes.
    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
    if (HashFunction == HashFunction::StdHash) {
      OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
    } else if (HashFunction == HashFunction::XXH3) {
      OpcodeHashes[I] = llvm::xxh3_64bits(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)OpcodeHashes[I];
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
  }

  // Initialize neighbor hash.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    // Append hashes of successors.
    uint64_t Hash = 0;
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].SuccHash = (uint8_t)Hash;
    }

    // Append hashes of predecessors.
    Hash = 0;
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].PredHash = (uint8_t)Hash;
    }
  }

  // Assign hashes.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    BB->setHash(BlendedHashes[I].combine());
  }
}
// TODO: mediate the difference between flow function construction here in BOLT
// and in the compiler by splitting blocks with exception throwing calls at the
// call and adding the landing pad as the successor.
/// Create a wrapper flow function to use with the profile inference algorithm,
/// and initialize its jumps and metadata.
FlowFunction
createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
  FlowFunction Func;

  // Add a special "dummy" source so that there is always a unique entry point.
  FlowBlock EntryBlock;
  EntryBlock.Index = 0;
  Func.Blocks.push_back(EntryBlock);

  // Create FlowBlock for every basic block in the binary function.
  for (const BinaryBasicBlock *BB : BlockOrder) {
    Func.Blocks.emplace_back();
    FlowBlock &Block = Func.Blocks.back();
    Block.Index = Func.Blocks.size() - 1;
    (void)BB;
    assert(Block.Index == BB->getIndex() + 1 &&
           "incorrectly assigned basic block index");
  }

  // Add a special "dummy" sink block so there is always a unique sink.
  FlowBlock SinkBlock;
  SinkBlock.Index = Func.Blocks.size();
  Func.Blocks.push_back(SinkBlock);

  // Create FlowJump for each jump between basic blocks in the binary function.
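  // InDegree tracks the number of incoming jumps per flow block; blocks that
  // end up with zero in-degree later receive an unlikely edge from the dummy
  // source.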
  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
    // Collect regular jumps
    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
    // TODO: set jump from exit block to landing pad to Unlikely.
    // If the block is an exit, add a dummy edge from it to the sink block.
    if (UniqueSuccs.empty()) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = Func.Blocks.size() - 1;
      InDegree[Jump.Target]++;
    }

    // Collect jumps to landing pads
    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
  }

  // Add dummy edges to the extra sources. If there are multiple entry blocks,
  // add an unlikely edge from 0 to the subsequent ones. Skips the sink block.
  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
  for (uint64_t I = 1; I < Func.Blocks.size() - 1; I++) {
    const BinaryBasicBlock *BB = BlockOrder[I - 1];
    if (BB->isEntryPoint() || InDegree[I] == 0) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = 0;
      Jump.Target = I;
      if (!BB->isEntryPoint())
        Jump.IsUnlikely = true;
    }
  }

  // Create necessary metadata for the flow function
  for (FlowJump &Jump : Func.Jumps) {
    assert(Jump.Source < Func.Blocks.size());
    Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump);
    assert(Jump.Target < Func.Blocks.size());
    Func.Blocks[Jump.Target].PredJumps.push_back(&Jump);
  }
  return Func;
}

/// Assign initial block/jump weights based on the stale profile data. The goal
/// is to extract as much information from the stale profile as possible. Here
/// we assume that each basic block is specified via a hash value computed from
/// its content and the hashes of the unchanged basic blocks stay the same
/// across different revisions of the binary. Blocks may also have pseudo probe
/// information in the profile and the binary which is used for matching.
/// Whenever there is a count in the profile with the hash corresponding to one
/// of the basic blocks in the binary, the count is "matched" to the block.
/// Similarly, if both the source and the target of a count in the profile are
/// matched to a jump in the binary, the count is recorded in CFG.
size_t matchWeights(
    BinaryContext &BC, const BinaryFunction::BasicBlockOrderType &BlockOrder,
    const yaml::bolt::BinaryFunctionProfile &YamlBF, FlowFunction &Func,
    HashFunction HashFunction, YAMLProfileReader::ProfileLookupMap &IdToYamlBF,
    const BinaryFunction &BF,
    const ArrayRef<YAMLProfileReader::ProbeMatchSpec> ProbeMatchSpecs) {

  assert(Func.Blocks.size() == BlockOrder.size() + 2);

  std::vector<uint64_t> CallHashes;
  std::vector<FlowBlock *> Blocks;
  std::vector<BlendedBlockHash> BlendedHashes;
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I];
    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");

    std::string CallHashStr = hashBlockCalls(BC, *BB);
    if (CallHashStr.empty()) {
      CallHashes.push_back(0);
    } else {
      if (HashFunction == HashFunction::StdHash)
        CallHashes.push_back(std::hash<std::string>{}(CallHashStr));
      else if (HashFunction == HashFunction::XXH3)
        CallHashes.push_back(llvm::xxh3_64bits(CallHashStr));
      else
        llvm_unreachable("Unhandled HashFunction");
    }

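    // Func.Blocks[0] is the artificial source block, so the binary block with
    // index I corresponds to the flow block with index I + 1.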
    Blocks.push_back(&Func.Blocks[I + 1]);
    BlendedBlockHash BlendedHash(BB->getHash());
    BlendedHashes.push_back(BlendedHash);
    LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                      << Twine::utohexstr(BB->getHash()) << "\n");
  }
  StaleMatcher Matcher;
  // Collects function pseudo probes for use in the StaleMatcher.
  if (opts::StaleMatchingWithPseudoProbes) {
    const MCPseudoProbeDecoder *Decoder = BC.getPseudoProbeDecoder();
    assert(Decoder &&
           "If pseudo probes are in use, pseudo probe decoder should exist");
    const AddressProbesMap &ProbeMap = Decoder->getAddress2ProbesMap();
    const uint64_t FuncAddr = BF.getAddress();
    for (const MCDecodedPseudoProbe &Probe :
         ProbeMap.find(FuncAddr, FuncAddr + BF.getSize()))
      if (const BinaryBasicBlock *BB =
              BF.getBasicBlockContainingOffset(Probe.getAddress() - FuncAddr))
        Matcher.mapProbeToBB(&Probe, Blocks[BB->getIndex()]);
  }
  Matcher.init(Blocks, BlendedHashes, CallHashes);

  using FlowBlockTy =
      std::pair<const FlowBlock *, const yaml::bolt::BinaryBasicBlockProfile *>;
  using ProfileBlockMatchMap = DenseMap<uint32_t, FlowBlockTy>;
  // Binary profile => block index => matched block + its block profile
  DenseMap<const yaml::bolt::BinaryFunctionProfile *, ProfileBlockMatchMap>
      MatchedBlocks;

  // Map of FlowBlock and matching method.
  DenseMap<const FlowBlock *, StaleMatcher::MatchMethod> MatchedFlowBlocks;

  auto addMatchedBlock =
      [&](std::pair<const FlowBlock *, StaleMatcher::MatchMethod> BlockMethod,
          const yaml::bolt::BinaryFunctionProfile &YamlBP,
          const yaml::bolt::BinaryBasicBlockProfile &YamlBB) {
        const auto &[MatchedBlock, Method] = BlockMethod;
        if (!MatchedBlock)
          return;
        // Don't override earlier matches
        if (MatchedFlowBlocks.contains(MatchedBlock))
          return;
        MatchedFlowBlocks.try_emplace(MatchedBlock, Method);
        MatchedBlocks[&YamlBP][YamlBB.Index] = {MatchedBlock, &YamlBB};
      };

  // Match blocks from the profile to the blocks in CFG by strict hash.
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    // Update matching stats.
    ++BC.Stats.NumStaleBlocks;
    BC.Stats.StaleSampleCount += YamlBB.ExecCount;

    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);
    addMatchedBlock(Matcher.matchBlockStrict(YamlHash), YamlBF, YamlBB);
  }
  // Match blocks from the profile to the blocks in CFG by pseudo probes.
  for (const auto &[InlineNodeMap, YamlBP] : ProbeMatchSpecs) {
    for (const yaml::bolt::BinaryBasicBlockProfile &BB : YamlBP.get().Blocks)
      if (!BB.PseudoProbes.empty())
        addMatchedBlock(Matcher.matchBlockProbe(BB.PseudoProbes, InlineNodeMap),
                        YamlBP, BB);
  }
  // Match blocks from the profile to the blocks in CFG with loose methods.
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);

    std::string CallHashStr = hashBlockCalls(IdToYamlBF, YamlBB);
    uint64_t CallHash = 0;
    if (!CallHashStr.empty()) {
      if (HashFunction == HashFunction::StdHash)
        CallHash = std::hash<std::string>{}(CallHashStr);
      else if (HashFunction == HashFunction::XXH3)
        CallHash = llvm::xxh3_64bits(CallHashStr);
      else
        llvm_unreachable("Unhandled HashFunction");
    }
    auto [MatchedBlock, Method] = Matcher.matchBlockLoose(YamlHash, CallHash);
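    // The profile's first block corresponds to the function entry, so fall
    // back to the entry flow block if no hash-based match was found.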
    if (MatchedBlock == nullptr && YamlBB.Index == 0) {
      MatchedBlock = Blocks[0];
      // Report as loose match
      Method = StaleMatcher::MATCH_OPCODE;
    }
    if (!MatchedBlock) {
      LLVM_DEBUG(dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index
                        << ")" << " with hash " << Twine::utohexstr(YamlBB.Hash)
                        << "\n");
      continue;
    }
    addMatchedBlock({MatchedBlock, Method}, YamlBF, YamlBB);
  }

  // Match jumps from the profile to the jumps from CFG
  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
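  // OutWeight/InWeight accumulate the matched successor counts leaving and
  // entering each flow block; the maximum of the two later serves as the
  // block's initial weight.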

  for (const auto &[YamlBF, MatchMap] : MatchedBlocks) {
    for (const auto &[YamlBBIdx, FlowBlockProfile] : MatchMap) {
      const auto &[MatchedBlock, YamlBB] = FlowBlockProfile;
      StaleMatcher::MatchMethod Method = MatchedFlowBlocks.lookup(MatchedBlock);
      BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
      LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBBIdx << ")"
                        << " with hash " << Twine::utohexstr(YamlBB->Hash)
                        << " to BB (index = " << MatchedBlock->Index - 1 << ")"
                        << " with hash " << Twine::utohexstr(BinHash.combine())
                        << "\n");
      (void)BinHash;
      uint64_t ExecCount = YamlBB->ExecCount;
      // Update matching stats accounting for the matched block.
      switch (Method) {
      case StaleMatcher::MATCH_EXACT:
        ++BC.Stats.NumExactMatchedBlocks;
        BC.Stats.ExactMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " exact match\n");
        break;
      case StaleMatcher::MATCH_PROBE_EXACT:
        ++BC.Stats.NumPseudoProbeExactMatchedBlocks;
        BC.Stats.PseudoProbeExactMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " exact pseudo probe match\n");
        break;
      case StaleMatcher::MATCH_PROBE_LOOSE:
        ++BC.Stats.NumPseudoProbeLooseMatchedBlocks;
        BC.Stats.PseudoProbeLooseMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " loose pseudo probe match\n");
        break;
      case StaleMatcher::MATCH_CALL:
        ++BC.Stats.NumCallMatchedBlocks;
        BC.Stats.CallMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " call match\n");
        break;
      case StaleMatcher::MATCH_OPCODE:
        ++BC.Stats.NumLooseMatchedBlocks;
        BC.Stats.LooseMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " loose match\n");
        break;
      case StaleMatcher::NO_MATCH:
        LLVM_DEBUG(dbgs() << " no match\n");
      }
    }

    for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF->Blocks) {
      for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
        if (YamlSI.Count == 0)
          continue;

        // Try to find the jump for a given (src, dst) pair from the profile and
        // assign the jump weight based on the profile count
        const uint64_t SrcIndex = YamlBB.Index;
        const uint64_t DstIndex = YamlSI.Index;

        const FlowBlock *MatchedSrcBlock = MatchMap.lookup(SrcIndex).first;
        const FlowBlock *MatchedDstBlock = MatchMap.lookup(DstIndex).first;

        if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
          // Find a jump between the two blocks
          FlowJump *Jump = nullptr;
          for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
            if (SuccJump->Target == MatchedDstBlock->Index) {
              Jump = SuccJump;
              break;
            }
          }
          // Assign the weight, if the corresponding jump is found
          if (Jump != nullptr) {
            Jump->Weight = YamlSI.Count;
            Jump->HasUnknownWeight = false;
          }
        }
        // Assign the weight for the src block, if it is found
        if (MatchedSrcBlock != nullptr)
          OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
        // Assign the weight for the dst block, if it is found
        if (MatchedDstBlock != nullptr)
          InWeight[MatchedDstBlock->Index] += YamlSI.Count;
      }
    }
  }

  // Assign block counts based on in-/out- jumps
  for (FlowBlock &Block : Func.Blocks) {
    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
      assert(Block.HasUnknownWeight && "unmatched block with a positive count");
      continue;
    }
    Block.HasUnknownWeight = false;
    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
  }

  return MatchedBlocks[&YamlBF].size();
}

/// The function finds all blocks that are (i) reachable from the Entry block
/// and (ii) do not have a path to an exit, and marks all such blocks 'cold'
/// so that profi does not send any flow to such blocks.
void preprocessUnreachableBlocks(FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();

  // Start bfs from the source
  std::queue<uint64_t> Queue;
  std::vector<bool> VisitedEntry(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      Queue.push(I);
      VisitedEntry[I] = true;
      break;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
      const uint64_t Dst = Jump->Target;
      if (!VisitedEntry[Dst]) {
        Queue.push(Dst);
        VisitedEntry[Dst] = true;
      }
    }
  }

  // Start bfs from all sinks
  std::vector<bool> VisitedExit(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isExit() && VisitedEntry[I]) {
      Queue.push(I);
      VisitedExit[I] = true;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
      const uint64_t Dst = Jump->Source;
      if (!VisitedExit[Dst]) {
        Queue.push(Dst);
        VisitedExit[Dst] = true;
      }
    }
  }

  // Zero out the weight of blocks that are not on a path from an entry to an
  // exit so that no flow is sent through them.
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.Weight == 0)
      continue;
    if (!VisitedEntry[I] || !VisitedExit[I]) {
      Block.Weight = 0;
      Block.HasUnknownWeight = true;
      Block.IsUnlikely = true;
      for (FlowJump *Jump : Block.SuccJumps) {
        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
          Jump->Weight = 0;
          Jump->HasUnknownWeight = true;
          Jump->IsUnlikely = true;
        }
      }
    }
  }
}

/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
bool canApplyInference(const FlowFunction &Func,
                       const yaml::bolt::BinaryFunctionProfile &YamlBF,
                       const uint64_t &MatchedBlocks) {
  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
    return false;

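  // Proceed only if at least StaleMatchingMinMatchedBlock percent of the
  // profile blocks have been matched.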
  if (MatchedBlocks * 100 <
      opts::StaleMatchingMinMatchedBlock * YamlBF.Blocks.size())
    return false;

  // Returns false if the artificial sink block has no predecessors meaning
  // there are no exit blocks.
  if (Func.Blocks[Func.Blocks.size() - 1].isEntry())
    return false;

  return true;
}

/// Apply the profile inference algorithm for a given flow function.
void applyInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
  Params.JoinIslands = opts::StaleMatchingJoinIslands;

  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;

  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;

  applyFlowInference(Params, Func);
}

/// Collect inferred counts from the flow function and update annotations in
/// the binary function.
void assignProfile(BinaryFunction &BF,
                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
                   FlowFunction &Func) {
  BinaryContext &BC = BF.getBinaryContext();

  assert(Func.Blocks.size() == BlockOrder.size() + 2);
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    FlowBlock &Block = Func.Blocks[I + 1];
    BinaryBasicBlock *BB = BlockOrder[I];

    // Update block's count
    BB->setExecutionCount(Block.Flow);

    // Update jump counts: (i) clean existing counts and then (ii) set new ones
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *DstBB : BB->successors()) {
      (void)DstBB;
      BI->Count = 0;
      BI->MispredictedCount = 0;
      ++BI;
    }
    for (FlowJump *Jump : Block.SuccJumps) {
      if (Jump->IsUnlikely)
        continue;
      if (Jump->Flow == 0)
        continue;

      // Skips the artificial sink block.
      if (Jump->Target == Func.Blocks.size() - 1)
        continue;
      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
      // Check if the edge corresponds to a regular jump or a landing pad
      if (BB->getSuccessor(SuccBB.getLabel())) {
        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
        BI.Count += Jump->Flow;
      } else {
        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
          LP->setExecutionCount(Jump->Flow);
      }
    }

    // Update call-site annotations
    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
                                     uint64_t Count) {
      if (BC.MIB->hasAnnotation(Instr, Name))
        BC.MIB->removeAnnotation(Instr, Name);
      // Do not add zero-count annotations
      if (Count == 0)
        return;
      BC.MIB->addAnnotation(Instr, Name, Count);
    };

    for (MCInst &Instr : *BB) {
      // Ignore pseudo instructions
      if (BC.MIB->isPseudo(Instr))
        continue;
      // Ignore jump tables
      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
        continue;

      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
            Instr, "CallProfile");
        if (!ICSP.empty()) {
          // Try to evenly distribute the counts among the call sites
          const uint64_t TotalCount = Block.Flow;
          const uint64_t NumSites = ICSP.size();
          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
            IndirectCallProfile &CSP = ICSP[Idx];
            uint64_t CountPerSite = TotalCount / NumSites;
            // When counts cannot be exactly distributed, increase by 1 the
            // counts of the first (TotalCount % NumSites) call sites
            if (Idx < TotalCount % NumSites)
              CountPerSite++;
            CSP.Count = CountPerSite;
          }
        } else {
          ICSP.emplace_back(nullptr, Block.Flow, 0);
        }
      } else if (BC.MIB->getConditionalTailCall(Instr)) {
        // We don't know exactly the number of times the conditional tail call
        // is executed; conservatively, set it to the count of the block.
        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
      } else if (BC.MIB->isCall(Instr)) {
        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
      }
    }
  }

  // Update function's execution count and mark the function inferred.
  BF.setExecutionCount(Func.Blocks[0].Flow);
  BF.setHasInferredProfile(true);
}

bool YAMLProfileReader::inferStaleProfile(
    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF,
    const ArrayRef<ProbeMatchSpec> ProbeMatchSpecs) {

  NamedRegionTimer T("inferStaleProfile", "stale profile inference", "rewrite",
                     "Rewrite passes", opts::TimeRewrite);

  if (!BF.hasCFG())
    return false;

  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                    << "\"" << BF.getPrintName() << "\"\n");

  // Make sure that block hashes are up to date.
  BF.computeBlockHashes(YamlBP.Header.HashFunction);

  const BinaryFunction::BasicBlockOrderType BlockOrder(
      BF.getLayout().block_begin(), BF.getLayout().block_end());

  // Tracks the number of matched blocks.

  // Create a wrapper flow function to use with the profile inference algorithm.
  FlowFunction Func = createFlowFunction(BlockOrder);

  // Match as many block/jump counts from the stale profile as possible
  size_t MatchedBlocks =
      matchWeights(BF.getBinaryContext(), BlockOrder, YamlBF, Func,
                   YamlBP.Header.HashFunction, IdToYamLBF, BF, ProbeMatchSpecs);

  // Adjust the flow function by marking unreachable blocks Unlikely so that
  // they don't get any counts assigned.
  preprocessUnreachableBlocks(Func);

  // Check if profile inference can be applied for the instance.
  if (!canApplyInference(Func, YamlBF, MatchedBlocks))
    return false;

  // Apply the profile inference algorithm.
  applyInference(Func);

  // Collect inferred counts and update function annotations.
  assignProfile(BF, BlockOrder, Func);

  // As of now, we always mark the binary function having "correct" profile.
  // In the future, we may discard the results for instances with poor inference
  // metrics and keep such functions un-optimized.
  return true;
}

} // end namespace bolt
} // end namespace llvm