//===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the CacheMetrics class and functions for showing metrics
// of cache lines.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>
#include <unordered_map>
#include <utility>
#include <vector>

19 | using namespace llvm; |
20 | using namespace bolt; |
21 | |
22 | namespace { |
23 | |
/// The following constants are used to estimate the number of i-TLB cache
/// misses for a given code layout. Empirically the values result in high
/// correlations between the estimations and the perf measurements.
/// The constants do not affect the code layout algorithms.
constexpr unsigned ITLBPageSize = 4096;
constexpr unsigned ITLBEntries = 16;
30 | |
31 | /// Initialize and return a position map for binary basic blocks |
32 | void ( |
33 | const std::vector<BinaryFunction *> &BinaryFunctions, |
34 | std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr, |
35 | std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) { |
36 | |
37 | for (BinaryFunction *BF : BinaryFunctions) { |
38 | const BinaryContext &BC = BF->getBinaryContext(); |
39 | for (BinaryBasicBlock &BB : *BF) { |
40 | if (BF->isSimple() || BC.HasRelocations) { |
41 | // Use addresses/sizes as in the output binary |
42 | BBAddr[&BB] = BB.getOutputAddressRange().first; |
43 | BBSize[&BB] = BB.getOutputSize(); |
44 | } else { |
45 | // Output ranges should match the input if the body hasn't changed |
46 | BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress(); |
47 | BBSize[&BB] = BB.getOriginalSize(); |
48 | } |
49 | } |
50 | } |
51 | } |
52 | |
53 | /// Calculate TSP metric, which quantifies the number of fallthrough jumps in |
54 | /// the ordering of basic blocks. The method returns a pair |
55 | /// (the number of fallthrough branches, the total number of branches) |
56 | std::pair<uint64_t, uint64_t> |
57 | calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions, |
58 | const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr, |
59 | const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) { |
60 | uint64_t Score = 0; |
61 | uint64_t JumpCount = 0; |
62 | for (BinaryFunction *BF : BinaryFunctions) { |
63 | if (!BF->hasProfile()) |
64 | continue; |
65 | for (BinaryBasicBlock *SrcBB : BF->getLayout().blocks()) { |
66 | auto BI = SrcBB->branch_info_begin(); |
67 | for (BinaryBasicBlock *DstBB : SrcBB->successors()) { |
68 | if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) { |
69 | JumpCount += BI->Count; |
70 | if (BBAddr.at(k: SrcBB) + BBSize.at(k: SrcBB) == BBAddr.at(k: DstBB)) |
71 | Score += BI->Count; |
72 | } |
73 | ++BI; |
74 | } |
75 | } |
76 | } |
77 | return std::make_pair(x&: Score, y&: JumpCount); |
78 | } |
79 | |
80 | using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>; |
81 | |
82 | /// Build a simplified version of the call graph: For every function, keep |
83 | /// its callers and the frequencies of the calls |
84 | std::unordered_map<const BinaryFunction *, Predecessors> |
85 | (const std::vector<BinaryFunction *> &BinaryFunctions) { |
86 | std::unordered_map<const BinaryFunction *, Predecessors> Calls; |
87 | |
88 | for (BinaryFunction *SrcFunction : BinaryFunctions) { |
89 | const BinaryContext &BC = SrcFunction->getBinaryContext(); |
90 | for (const BinaryBasicBlock *BB : SrcFunction->getLayout().blocks()) { |
91 | // Find call instructions and extract target symbols from each one |
92 | for (const MCInst &Inst : *BB) { |
93 | if (!BC.MIB->isCall(Inst)) |
94 | continue; |
95 | |
96 | // Call info |
97 | const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst); |
98 | uint64_t Count = BB->getKnownExecutionCount(); |
99 | // Ignore calls w/o information |
100 | if (DstSym == nullptr || Count == 0) |
101 | continue; |
102 | |
103 | const BinaryFunction *DstFunction = BC.getFunctionForSymbol(Symbol: DstSym); |
104 | // Ignore recursive calls |
105 | if (DstFunction == nullptr || DstFunction->getLayout().block_empty() || |
106 | DstFunction == SrcFunction) |
107 | continue; |
108 | |
109 | // Record the call |
110 | Calls[DstFunction].emplace_back(args&: SrcFunction, args&: Count); |
111 | } |
112 | } |
113 | } |
114 | return Calls; |
115 | } |
116 | |
117 | /// Compute expected hit ratio of the i-TLB cache (optimized by HFSortPlus alg). |
118 | /// Given an assignment of functions to the i-TLB pages), we divide all |
119 | /// functions calls into two categories: |
120 | /// - 'short' ones that have a caller-callee distance less than a page; |
121 | /// - 'long' ones where the distance exceeds a page. |
122 | /// The short calls are likely to result in a i-TLB cache hit. For the long |
123 | /// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how |
124 | /// often the page is accessed). Assuming that functions are sent to the i-TLB |
125 | /// cache in a random order, the probability that a page is present in the cache |
126 | /// is proportional to the number of samples corresponding to the functions on |
127 | /// the page. The following procedure detects short and long calls, and |
128 | /// estimates the expected number of cache misses for the long ones. |
129 | double expectedCacheHitRatio( |
130 | const std::vector<BinaryFunction *> &BinaryFunctions, |
131 | const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr, |
132 | const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) { |
133 | std::unordered_map<const BinaryFunction *, Predecessors> Calls = |
134 | extractFunctionCalls(BinaryFunctions); |
135 | // Compute 'hotness' of the functions |
136 | double TotalSamples = 0; |
137 | std::unordered_map<BinaryFunction *, double> FunctionSamples; |
138 | for (BinaryFunction *BF : BinaryFunctions) { |
139 | double Samples = 0; |
140 | for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) |
141 | Samples += Pair.second; |
142 | Samples = std::max(a: Samples, b: (double)BF->getKnownExecutionCount()); |
143 | FunctionSamples[BF] = Samples; |
144 | TotalSamples += Samples; |
145 | } |
146 | |
147 | // Compute 'hotness' of the pages |
148 | std::unordered_map<uint64_t, double> PageSamples; |
149 | for (BinaryFunction *BF : BinaryFunctions) { |
150 | if (BF->getLayout().block_empty()) |
151 | continue; |
152 | const uint64_t Page = |
153 | BBAddr.at(k: BF->getLayout().block_front()) / ITLBPageSize; |
154 | PageSamples[Page] += FunctionSamples.at(k: BF); |
155 | } |
156 | |
157 | // Computing the expected number of misses for every function |
158 | double Misses = 0; |
159 | for (BinaryFunction *BF : BinaryFunctions) { |
160 | // Skip the function if it has no samples |
161 | if (BF->getLayout().block_empty() || FunctionSamples.at(k: BF) == 0.0) |
162 | continue; |
163 | double Samples = FunctionSamples.at(k: BF); |
164 | const uint64_t Page = |
165 | BBAddr.at(k: BF->getLayout().block_front()) / ITLBPageSize; |
166 | // The probability that the page is not present in the cache |
167 | const double MissProb = |
168 | pow(x: 1.0 - PageSamples[Page] / TotalSamples, y: ITLBEntries); |
169 | |
170 | // Processing all callers of the function |
171 | for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) { |
172 | BinaryFunction *SrcFunction = Pair.first; |
173 | const uint64_t SrcPage = |
174 | BBAddr.at(k: SrcFunction->getLayout().block_front()) / ITLBPageSize; |
175 | // Is this a 'long' or a 'short' call? |
176 | if (Page != SrcPage) { |
177 | // This is a miss |
178 | Misses += MissProb * Pair.second; |
179 | } |
180 | Samples -= Pair.second; |
181 | } |
182 | assert(Samples >= 0.0 && "Function samples computed incorrectly" ); |
183 | // The remaining samples likely come from the jitted code |
184 | Misses += Samples * MissProb; |
185 | } |
186 | |
187 | return 100.0 * (1.0 - Misses / TotalSamples); |
188 | } |
189 | |
190 | } // namespace |
191 | |
192 | void CacheMetrics::printAll(raw_ostream &OS, |
193 | const std::vector<BinaryFunction *> &BFs) { |
194 | // Stats related to hot-cold code splitting |
195 | size_t NumFunctions = 0; |
196 | size_t NumProfiledFunctions = 0; |
197 | size_t NumHotFunctions = 0; |
198 | size_t NumBlocks = 0; |
199 | size_t NumHotBlocks = 0; |
200 | |
201 | size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max(); |
202 | size_t TotalCodeMaxAddr = 0; |
203 | size_t HotCodeMinAddr = std::numeric_limits<size_t>::max(); |
204 | size_t HotCodeMaxAddr = 0; |
205 | |
206 | for (BinaryFunction *BF : BFs) { |
207 | NumFunctions++; |
208 | if (BF->hasProfile()) |
209 | NumProfiledFunctions++; |
210 | if (BF->hasValidIndex()) |
211 | NumHotFunctions++; |
212 | for (const BinaryBasicBlock &BB : *BF) { |
213 | NumBlocks++; |
214 | size_t BBAddrMin = BB.getOutputAddressRange().first; |
215 | size_t BBAddrMax = BB.getOutputAddressRange().second; |
216 | TotalCodeMinAddr = std::min(a: TotalCodeMinAddr, b: BBAddrMin); |
217 | TotalCodeMaxAddr = std::max(a: TotalCodeMaxAddr, b: BBAddrMax); |
218 | if (BF->hasValidIndex() && !BB.isCold()) { |
219 | NumHotBlocks++; |
220 | HotCodeMinAddr = std::min(a: HotCodeMinAddr, b: BBAddrMin); |
221 | HotCodeMaxAddr = std::max(a: HotCodeMaxAddr, b: BBAddrMax); |
222 | } |
223 | } |
224 | } |
225 | |
226 | OS << format(Fmt: " There are %zu functions;" , Vals: NumFunctions) |
227 | << format(Fmt: " %zu (%.2lf%%) are in the hot section," , Vals: NumHotFunctions, |
228 | Vals: 100.0 * NumHotFunctions / NumFunctions) |
229 | << format(Fmt: " %zu (%.2lf%%) have profile\n" , Vals: NumProfiledFunctions, |
230 | Vals: 100.0 * NumProfiledFunctions / NumFunctions); |
231 | OS << format(Fmt: " There are %zu basic blocks;" , Vals: NumBlocks) |
232 | << format(Fmt: " %zu (%.2lf%%) are in the hot section\n" , Vals: NumHotBlocks, |
233 | Vals: 100.0 * NumHotBlocks / NumBlocks); |
234 | |
235 | assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses" ); |
236 | size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr; |
237 | size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr; |
238 | |
239 | size_t HugePage2MB = 2 << 20; |
240 | OS << format(Fmt: " Hot code takes %.2lf%% of binary (%zu bytes out of %zu, " |
241 | "%.2lf huge pages)\n" , |
242 | Vals: 100.0 * HotCodeSize / TotalCodeSize, Vals: HotCodeSize, Vals: TotalCodeSize, |
243 | Vals: double(HotCodeSize) / HugePage2MB); |
244 | |
245 | // Stats related to expected cache performance |
246 | std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr; |
247 | std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize; |
248 | extractBasicBlockInfo(BinaryFunctions: BFs, BBAddr, BBSize); |
249 | |
250 | OS << " Expected i-TLB cache hit ratio: " |
251 | << format(Fmt: "%.2lf%%\n" , Vals: expectedCacheHitRatio(BinaryFunctions: BFs, BBAddr, BBSize)); |
252 | |
253 | auto Stats = calcTSPScore(BinaryFunctions: BFs, BBAddr, BBSize); |
254 | OS << " TSP score: " |
255 | << format(Fmt: "%.2lf%% (%zu out of %zu)\n" , |
256 | Vals: 100.0 * Stats.first / std::max<uint64_t>(a: Stats.second, b: 1), |
257 | Vals: Stats.first, Vals: Stats.second); |
258 | } |
259 | |