//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// BOLT often has to deal with profiles collected from binaries built several
// revisions behind the release. As a result, a certain percentage of functions
// is considered stale and not optimized. This file implements the ability to
// match profile data to functions that are not 100% binary identical, thus
// increasing optimization coverage and boosting application performance.
//
// The algorithm consists of two phases: matching and inference:
// - In the matching phase, we try to "guess" as many block and jump counts
//   from the stale profile as possible. To this end, the content of each basic
//   block is hashed and stored in the (YAML) profile. When BOLT optimizes a
//   binary, it computes block hashes and identifies the corresponding entries
//   in the stale profile. This yields a partial profile for every CFG in the
//   binary.
// - In the inference phase, we employ a network flow-based algorithm (profi)
//   to reconstruct "realistic" block and jump counts from the partial profile
//   generated in the first stage. In practice, we don't always reconstruct the
//   profile perfectly, but the majority (e.g., >90%) of CFGs get correct
//   counts.
//
//===----------------------------------------------------------------------===//

#include "bolt/Core/HashUtilities.h"
#include "bolt/Profile/YAMLProfileReader.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/xxhash.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"

#include <queue>

using namespace llvm;

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt-prof"

namespace opts {

extern cl::OptionCategory BoltOptCategory;

cl::opt<bool>
    InferStaleProfile("infer-stale-profile",
                      cl::desc("Infer counts from stale profile data."),
                      cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingMaxFuncSize(
    "stale-matching-max-func-size",
    cl::desc("The maximum size of a function to consider for inference."),
    cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));

// Parameters of the profile inference algorithm. The default values are tuned
// on several benchmarks.
cl::opt<bool> StaleMatchingEvenFlowDistribution(
    "stale-matching-even-flow-distribution",
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."),
    cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingRebalanceUnknown(
    "stale-matching-rebalance-unknown",
    cl::desc("Evenly re-distribute flow among unknown subgraphs."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingJoinIslands(
    "stale-matching-join-islands",
    cl::desc("Join isolated components having positive flow."), cl::init(true),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockInc(
    "stale-matching-cost-block-inc",
    cl::desc("The cost of increasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockDec(
    "stale-matching-cost-block-dec",
    cl::desc("The cost of decreasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpInc(
    "stale-matching-cost-jump-inc",
    cl::desc("The cost of increasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpDec(
    "stale-matching-cost-jump-dec",
    cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
    "stale-matching-cost-block-unknown-inc",
    cl::desc("The cost of increasing an unknown block count by one."),
    cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
    "stale-matching-cost-jump-unknown-inc",
    cl::desc("The cost of increasing an unknown jump count by one."),
    cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
    "stale-matching-cost-jump-unknown-ft-inc",
    cl::desc(
        "The cost of increasing an unknown fall-through jump count by one."),
    cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
namespace bolt {

/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (e.g., uint16_t or uint8_t).
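///
/// As an illustration, the packed 64-bit value mirrors the Bitfield elements
/// declared below and is laid out as follows, from the least significant bits
/// up:
///
/// \code
///   bits  0..15  Offset
///   bits 16..31  OpcodeHash
///   bits 32..47  InstrHash
///   bits 48..55  PredHash
///   bits 56..63  SuccHash
/// \endcode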
struct BlendedBlockHash {
private:
  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;

public:
  explicit BlendedBlockHash() {}

  explicit BlendedBlockHash(uint64_t Hash) {
    Offset = Bitfield::get<ValueOffset>(Hash);
    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
    InstrHash = Bitfield::get<ValueInstr>(Hash);
    PredHash = Bitfield::get<ValuePred>(Hash);
    SuccHash = Bitfield::get<ValueSucc>(Hash);
  }

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    uint64_t Hash = 0;
    Bitfield::set<ValueOffset>(Hash, Offset);
    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
    Bitfield::set<ValueInstr>(Hash, InstrHash);
    Bitfield::set<ValuePred>(Hash, PredHash);
    Bitfield::set<ValueSucc>(Hash, SuccHash);
    return Hash;
  }

  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar two blocks are. For identical basic blocks,
  /// the distance is zero.
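  ///
  /// For example, following the weighting in the implementation below, two
  /// hashes that differ only in SuccHash are at distance 1 << 32, while two
  /// hashes that differ only in Offset by Delta bytes are at distance Delta,
  /// so neighbor and instruction mismatches dominate offset drift.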
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
    // Account for NeighborHash.
    Dist += SuccHash == BBH.SuccHash ? 0 : 1;
    Dist += PredHash == BBH.PredHash ? 0 : 1;
    Dist <<= 16;
    // Account for InstrHash.
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= 16;
    // Account for Offset.
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// (Loose) Hashes of the predecessors of the basic block.
  uint8_t PredHash{0};
  /// (Loose) Hashes of the successors of the basic block.
  uint8_t SuccHash{0};
};

/// The object is used to identify and match basic blocks in a BinaryFunction
/// given their hashes computed on a binary built from several revisions behind
/// release.
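///
/// A minimal usage sketch, mirroring how the matcher is driven in
/// matchWeightsByHashes() below (Blocks, BlendedHashes, and YamlBB are the
/// local names used there):
///
/// \code
///   StaleMatcher Matcher;
///   Matcher.init(Blocks, BlendedHashes);
///   const FlowBlock *Matched =
///       Matcher.matchBlock(BlendedBlockHash(YamlBB.Hash));
/// \endcode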
class StaleMatcher {
public:
  /// Initialize stale matcher.
  void init(const std::vector<FlowBlock *> &Blocks,
            const std::vector<BlendedBlockHash> &Hashes) {
    assert(Blocks.size() == Hashes.size() &&
           "incorrect matcher initialization");
    for (size_t I = 0; I < Blocks.size(); I++) {
      FlowBlock *Block = Blocks[I];
      uint16_t OpHash = Hashes[I].OpcodeHash;
      OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
    }
  }

  /// Find the most similar block for a given hash.
  const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const {
    auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
    if (BlockIt == OpHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.distance(BlendedHash);
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }

  /// Returns true if the two basic blocks (in the binary and in the profile)
  /// corresponding to the given hashes are matched to each other with a high
  /// confidence.
  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
                                    BlendedBlockHash Hash2) {
    return Hash1.InstrHash == Hash2.InstrHash;
  }

private:
  using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
  std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
};

void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
  if (size() == 0)
    return;

  assert(hasCFG() && "the function is expected to have CFG");

  std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
  std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
  // Initialize hash components.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    assert(BB->getIndex() == I && "incorrect block index");
    BlendedHashes[I].Offset = BB->getOffset();
    // Hashing complete instructions.
    std::string InstrHashStr = hashBlock(
        BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
    if (HashFunction == HashFunction::StdHash) {
      uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
    } else if (HashFunction == HashFunction::XXH3) {
      uint64_t InstrHash = llvm::xxh3_64bits(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)InstrHash;
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
    // Hashing opcodes.
    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
    if (HashFunction == HashFunction::StdHash) {
      OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
    } else if (HashFunction == HashFunction::XXH3) {
      OpcodeHashes[I] = llvm::xxh3_64bits(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)OpcodeHashes[I];
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
  }

  // Initialize neighbor hash.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    // Append hashes of successors.
    uint64_t Hash = 0;
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].SuccHash = (uint8_t)Hash;
    }

    // Append hashes of predecessors.
    Hash = 0;
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].PredHash = (uint8_t)Hash;
    }
  }

  // Assign hashes.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    BB->setHash(BlendedHashes[I].combine());
  }
}

/// Create a wrapper flow function to use with the profile inference algorithm,
/// and initialize its jumps and metadata.
FlowFunction
createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
  FlowFunction Func;

  // Add a special "dummy" source so that there is always a unique entry point.
  // Because of the extra source, for all other blocks in FlowFunction it holds
  // that Block.Index == BB->getIndex() + 1.
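  // For instance, a binary function with blocks BB0, BB1, BB2 (in layout
  // order) maps to FlowFunction blocks with indices 0 (dummy source), 1, 2, 3.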
  FlowBlock EntryBlock;
  EntryBlock.Index = 0;
  Func.Blocks.push_back(EntryBlock);

  // Create FlowBlock for every basic block in the binary function.
  for (const BinaryBasicBlock *BB : BlockOrder) {
    Func.Blocks.emplace_back();
    FlowBlock &Block = Func.Blocks.back();
    Block.Index = Func.Blocks.size() - 1;
    (void)BB;
    assert(Block.Index == BB->getIndex() + 1 &&
           "incorrectly assigned basic block index");
  }

  // Create FlowJump for each jump between basic blocks in the binary function.
  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
    // Collect regular jumps.
    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Ignore parallel edges.
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
    // Collect jumps to landing pads.
    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
      // Ignore parallel edges.
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
  }

  // Add dummy edges to the extra sources. If there are multiple entry blocks,
  // add an unlikely edge from 0 to the subsequent ones.
  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
  for (uint64_t I = 1; I < Func.Blocks.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I - 1];
    if (BB->isEntryPoint() || InDegree[I] == 0) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = 0;
      Jump.Target = I;
      if (!BB->isEntryPoint())
        Jump.IsUnlikely = true;
    }
  }

  // Create necessary metadata for the flow function.
  for (FlowJump &Jump : Func.Jumps) {
    Func.Blocks.at(Jump.Source).SuccJumps.push_back(&Jump);
    Func.Blocks.at(Jump.Target).PredJumps.push_back(&Jump);
  }
  return Func;
}

/// Assign initial block/jump weights based on the stale profile data. The goal
/// is to extract as much information from the stale profile as possible. Here
/// we assume that each basic block is specified via a hash value computed from
/// its content, and that the hashes of unchanged basic blocks stay the same
/// across different revisions of the binary.
/// Whenever there is a count in the profile with a hash corresponding to one
/// of the basic blocks in the binary, the count is "matched" to the block.
/// Similarly, if both the source and the target of a count in the profile are
/// matched to a jump in the binary, the count is recorded in the CFG.
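///
/// As an illustration, if profile block b2 matches binary block 3 and profile
/// block b5 matches binary block 4, then a profile edge b2 -> b5 with count
/// 100 sets the weight of the CFG jump 3 -> 4 to 100 and contributes 100 to
/// the out-weight of block 3 and the in-weight of block 4.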
void matchWeightsByHashes(BinaryContext &BC,
                          const BinaryFunction::BasicBlockOrderType &BlockOrder,
                          const yaml::bolt::BinaryFunctionProfile &YamlBF,
                          FlowFunction &Func) {
  assert(Func.Blocks.size() == BlockOrder.size() + 1);

  std::vector<FlowBlock *> Blocks;
  std::vector<BlendedBlockHash> BlendedHashes;
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I];
    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
    Blocks.push_back(&Func.Blocks[I + 1]);
    BlendedBlockHash BlendedHash(BB->getHash());
    BlendedHashes.push_back(BlendedHash);
    LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                      << Twine::utohexstr(BB->getHash()) << "\n");
  }
  StaleMatcher Matcher;
  Matcher.init(Blocks, BlendedHashes);

  // Index in yaml profile => corresponding (matched) block.
  DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
  // Match blocks from the profile to the blocks in the CFG.
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);
    const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash);
    // Always match the entry block.
    if (MatchedBlock == nullptr && YamlBB.Index == 0)
      MatchedBlock = Blocks[0];
    if (MatchedBlock != nullptr) {
      const BinaryBasicBlock *BB = BlockOrder[MatchedBlock->Index - 1];
      MatchedBlocks[YamlBB.Index] = MatchedBlock;
      BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
      LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")"
                        << " with hash " << Twine::utohexstr(YamlBB.Hash)
                        << " to BB (index = " << MatchedBlock->Index - 1 << ")"
                        << " with hash " << Twine::utohexstr(BinHash.combine())
                        << "\n");
      // Update matching stats accounting for the matched block.
      if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
        ++BC.Stats.NumMatchedBlocks;
        BC.Stats.MatchedSampleCount += YamlBB.ExecCount;
        LLVM_DEBUG(dbgs() << "  exact match\n");
      } else {
        LLVM_DEBUG(dbgs() << "  loose match\n");
      }
      if (YamlBB.NumInstructions == BB->size())
        ++BC.Stats.NumStaleBlocksWithEqualIcount;
    } else {
      LLVM_DEBUG(
          dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")"
                 << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n");
    }

    // Update matching stats.
    ++BC.Stats.NumStaleBlocks;
    BC.Stats.StaleSampleCount += YamlBB.ExecCount;
  }

  // Match jumps from the profile to the jumps from the CFG.
  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
      if (YamlSI.Count == 0)
        continue;

      // Try to find the jump for a given (src, dst) pair from the profile and
      // assign the jump weight based on the profile count.
      const uint64_t SrcIndex = YamlBB.Index;
      const uint64_t DstIndex = YamlSI.Index;

      const FlowBlock *MatchedSrcBlock = MatchedBlocks.lookup(SrcIndex);
      const FlowBlock *MatchedDstBlock = MatchedBlocks.lookup(DstIndex);

      if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
        // Find a jump between the two blocks.
        FlowJump *Jump = nullptr;
        for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
          if (SuccJump->Target == MatchedDstBlock->Index) {
            Jump = SuccJump;
            break;
          }
        }
        // Assign the weight if the corresponding jump is found.
        if (Jump != nullptr) {
          Jump->Weight = YamlSI.Count;
          Jump->HasUnknownWeight = false;
        }
      }
      // Assign the weight for the src block, if it is found.
      if (MatchedSrcBlock != nullptr)
        OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
      // Assign the weight for the dst block, if it is found.
      if (MatchedDstBlock != nullptr)
        InWeight[MatchedDstBlock->Index] += YamlSI.Count;
    }
  }

  // Assign block counts based on in-/out-jumps.
  for (FlowBlock &Block : Func.Blocks) {
    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
      assert(Block.HasUnknownWeight && "unmatched block with a positive count");
      continue;
    }
    Block.HasUnknownWeight = false;
    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
  }
}

/// The function finds all blocks that are (i) reachable from the Entry block
/// and (ii) do not have a path to an exit, and marks all such blocks 'cold'
/// so that profi does not send any flow to such blocks.
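///
/// For example, a block that was matched with a positive count but is only
/// reachable from the entry without any path to an exit has its weight reset
/// to zero and is marked IsUnlikely, together with its self-loop jumps.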
void preprocessUnreachableBlocks(FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();

  // Start BFS from the source.
  std::queue<uint64_t> Queue;
  std::vector<bool> VisitedEntry(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      Queue.push(I);
      VisitedEntry[I] = true;
      break;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
      const uint64_t Dst = Jump->Target;
      if (!VisitedEntry[Dst]) {
        Queue.push(Dst);
        VisitedEntry[Dst] = true;
      }
    }
  }

  // Start BFS from all sinks.
  std::vector<bool> VisitedExit(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isExit() && VisitedEntry[I]) {
      Queue.push(I);
      VisitedExit[I] = true;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
      const uint64_t Dst = Jump->Source;
      if (!VisitedExit[Dst]) {
        Queue.push(Dst);
        VisitedExit[Dst] = true;
      }
    }
  }

  // Make all such blocks have zero weight so that flow is not sent to them.
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.Weight == 0)
      continue;
    if (!VisitedEntry[I] || !VisitedExit[I]) {
      Block.Weight = 0;
      Block.HasUnknownWeight = true;
      Block.IsUnlikely = true;
      for (FlowJump *Jump : Block.SuccJumps) {
        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
          Jump->Weight = 0;
          Jump->HasUnknownWeight = true;
          Jump->IsUnlikely = true;
        }
      }
    }
  }
}

/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
bool canApplyInference(const FlowFunction &Func) {
  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
    return false;

  bool HasExitBlocks = llvm::any_of(
      Func.Blocks, [&](const FlowBlock &Block) { return Block.isExit(); });
  if (!HasExitBlocks)
    return false;

  return true;
}

/// Apply the profile inference algorithm for a given flow function.
void applyInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
  Params.JoinIslands = opts::StaleMatchingJoinIslands;

  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;

  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;

  applyFlowInference(Params, Func);
}

/// Collect inferred counts from the flow function and update annotations in
/// the binary function.
void assignProfile(BinaryFunction &BF,
                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
                   FlowFunction &Func) {
  BinaryContext &BC = BF.getBinaryContext();

  assert(Func.Blocks.size() == BlockOrder.size() + 1);
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    FlowBlock &Block = Func.Blocks[I + 1];
    BinaryBasicBlock *BB = BlockOrder[I];

    // Update the block's count.
    BB->setExecutionCount(Block.Flow);

    // Update jump counts: (i) clean existing counts, then (ii) set new ones.
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *DstBB : BB->successors()) {
      (void)DstBB;
      BI->Count = 0;
      BI->MispredictedCount = 0;
      ++BI;
    }
    for (FlowJump *Jump : Block.SuccJumps) {
      if (Jump->IsUnlikely)
        continue;
      if (Jump->Flow == 0)
        continue;

      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
      // Check if the edge corresponds to a regular jump or a landing pad.
      if (BB->getSuccessor(SuccBB.getLabel())) {
        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
        BI.Count += Jump->Flow;
      } else {
        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
          LP->setExecutionCount(Jump->Flow);
      }
    }

    // Update call-site annotations.
    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
                                     uint64_t Count) {
      if (BC.MIB->hasAnnotation(Instr, Name))
        BC.MIB->removeAnnotation(Instr, Name);
      // Do not add zero-count annotations.
      if (Count == 0)
        return;
      BC.MIB->addAnnotation(Instr, Name, Count);
    };

    for (MCInst &Instr : *BB) {
      // Ignore pseudo instructions.
      if (BC.MIB->isPseudo(Instr))
        continue;
      // Ignore jump tables.
      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
        continue;

      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
            Instr, "CallProfile");
        if (!ICSP.empty()) {
          // Try to evenly distribute the counts among the call sites.
          const uint64_t TotalCount = Block.Flow;
          const uint64_t NumSites = ICSP.size();
          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
            IndirectCallProfile &CSP = ICSP[Idx];
            uint64_t CountPerSite = TotalCount / NumSites;
            // When counts cannot be exactly distributed, increase by 1 the
            // counts of the first (TotalCount % NumSites) call sites.
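            // For example, with TotalCount = 10 and NumSites = 3, the call
            // sites receive counts 4, 3, and 3, respectively.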
            if (Idx < TotalCount % NumSites)
              CountPerSite++;
            CSP.Count = CountPerSite;
          }
        } else {
          ICSP.emplace_back(nullptr, Block.Flow, 0);
        }
      } else if (BC.MIB->getConditionalTailCall(Instr)) {
        // We don't know exactly how many times the conditional tail call is
        // executed; conservatively, set it to the count of the block.
        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
      } else if (BC.MIB->isCall(Instr)) {
        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
      }
    }
  }

  // Update the function's execution count and mark the function inferred.
  BF.setExecutionCount(Func.Blocks[0].Flow);
  BF.setHasInferredProfile(true);
}

bool YAMLProfileReader::inferStaleProfile(
    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
  if (!BF.hasCFG())
    return false;

  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                    << "\"" << BF.getPrintName() << "\"\n");

  // Make sure that block hashes are up to date.
  BF.computeBlockHashes(YamlBP.Header.HashFunction);

  const BinaryFunction::BasicBlockOrderType BlockOrder(
      BF.getLayout().block_begin(), BF.getLayout().block_end());

  // Create a wrapper flow function to use with the profile inference
  // algorithm.
  FlowFunction Func = createFlowFunction(BlockOrder);

  // Match as many block/jump counts from the stale profile as possible.
  matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func);

  // Adjust the flow function by marking unreachable blocks Unlikely so that
  // they don't get any counts assigned.
  preprocessUnreachableBlocks(Func);

  // Check if profile inference can be applied for the instance.
  if (!canApplyInference(Func))
    return false;

  // Apply the profile inference algorithm.
  applyInference(Func);

  // Collect inferred counts and update function annotations.
  assignProfile(BF, BlockOrder, Func);

  // As of now, we always mark the binary function as having a "correct"
  // profile. In the future, we may discard the results for instances with
  // poor inference metrics and keep such functions un-optimized.
  return true;
}

} // end namespace bolt
} // end namespace llvm