//===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the CacheMetrics class and functions for showing metrics
// of cache lines.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/Support/Format.h"
#include <algorithm>
#include <cmath>
#include <limits>
#include <unordered_map>

using namespace llvm;
using namespace bolt;

namespace {

/// The following constants are used to estimate the number of i-TLB cache
/// misses for a given code layout. Empirically the values result in high
/// correlations between the estimations and the perf measurements.
/// The constants do not affect the code layout algorithms.
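/// Together they model an i-TLB holding 16 entries for 4 KiB pages, i.e. a
/// total reach of 16 * 4096 = 64 KiB of code.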
constexpr unsigned ITLBPageSize = 4096;
constexpr unsigned ITLBEntries = 16;

/// Initialize the address and size maps for binary basic blocks
void extractBasicBlockInfo(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  for (BinaryFunction *BF : BinaryFunctions) {
    const BinaryContext &BC = BF->getBinaryContext();
    for (BinaryBasicBlock &BB : *BF) {
      if (BF->isSimple() || BC.HasRelocations) {
        // Use addresses/sizes as in the output binary
        BBAddr[&BB] = BB.getOutputAddressRange().first;
        BBSize[&BB] = BB.getOutputSize();
      } else {
        // Output ranges should match the input if the body hasn't changed
        BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress();
        BBSize[&BB] = BB.getOriginalSize();
      }
    }
  }
}

/// Calculate TSP metric, which quantifies the number of fallthrough jumps in
/// the ordering of basic blocks. The method returns a pair
/// (the number of fallthrough branches, the total number of branches)
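/// For example, if block A (address 0x100, size 0x20) is immediately followed
/// by block B (address 0x120) in the layout, a branch A->B taken 100 times
/// adds 100 to both counters; a branch to a non-adjacent block adds 100 only
/// to the total. (Addresses here are illustrative.)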
std::pair<uint64_t, uint64_t>
calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
  uint64_t Score = 0;
  uint64_t JumpCount = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (!BF->hasProfile())
      continue;
    for (BinaryBasicBlock *SrcBB : BF->getLayout().blocks()) {
      auto BI = SrcBB->branch_info_begin();
      for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
        if (SrcBB != DstBB &&
            BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
          JumpCount += BI->Count;

          auto BBAddrIt = BBAddr.find(SrcBB);
          assert(BBAddrIt != BBAddr.end());
          uint64_t SrcBBAddr = BBAddrIt->second;

          auto BBSizeIt = BBSize.find(SrcBB);
          assert(BBSizeIt != BBSize.end());
          uint64_t SrcBBSize = BBSizeIt->second;

          BBAddrIt = BBAddr.find(DstBB);
          assert(BBAddrIt != BBAddr.end());
          uint64_t DstBBAddr = BBAddrIt->second;

          if (SrcBBAddr + SrcBBSize == DstBBAddr)
            Score += BI->Count;
        }
        ++BI;
      }
    }
  }
  return std::make_pair(Score, JumpCount);
}

using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;

/// Build a simplified version of the call graph: For every function, keep
/// its callers and the frequencies of the calls
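/// The result maps each callee to its (caller, frequency) pairs; e.g., with
/// hypothetical functions, Calls[Foo] == {{Main, 500}, {Bar, 10}} means Main
/// calls Foo 500 times and Bar calls Foo 10 times.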
std::unordered_map<const BinaryFunction *, Predecessors>
extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
  std::unordered_map<const BinaryFunction *, Predecessors> Calls;

  for (BinaryFunction *SrcFunction : BinaryFunctions) {
    const BinaryContext &BC = SrcFunction->getBinaryContext();
    for (const BinaryBasicBlock *BB : SrcFunction->getLayout().blocks()) {
      // Find call instructions and extract target symbols from each one
      for (const MCInst &Inst : *BB) {
        if (!BC.MIB->isCall(Inst))
          continue;

        // Call info
        const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
        uint64_t Count = BB->getKnownExecutionCount();
        // Ignore calls w/o information
        if (DstSym == nullptr || Count == 0)
          continue;

        const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
        // Ignore recursive calls
        if (DstFunction == nullptr || DstFunction->getLayout().block_empty() ||
            DstFunction == SrcFunction)
          continue;

        // Record the call
        Calls[DstFunction].emplace_back(SrcFunction, Count);
      }
    }
  }
  return Calls;
}

/// Compute the expected hit ratio of the i-TLB cache (optimized by the
/// HFSortPlus algorithm).
/// Given an assignment of functions to i-TLB pages, we divide all function
/// calls into two categories:
/// - 'short' ones that have a caller-callee distance less than a page;
/// - 'long' ones where the distance exceeds a page.
/// The short calls are likely to result in an i-TLB cache hit. For the long
/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
/// often the page is accessed). Assuming that functions are sent to the i-TLB
/// cache in a random order, the probability that a page is present in the
/// cache is proportional to the number of samples corresponding to the
/// functions on the page. The following procedure detects short and long
/// calls, and estimates the expected number of cache misses for the long ones.
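/// For instance, a page that accounts for 10% of all samples is absent from a
/// 16-entry i-TLB with probability (1 - 0.1)^16 ~= 0.185, so about 18.5% of
/// the long calls landing on that page are counted as misses.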
double expectedCacheHitRatio(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
  std::unordered_map<const BinaryFunction *, Predecessors> Calls =
      extractFunctionCalls(BinaryFunctions);
  // Compute 'hotness' of the functions
  double TotalSamples = 0;
  std::unordered_map<BinaryFunction *, double> FunctionSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    double Samples = 0;
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
      Samples += Pair.second;
    Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
    FunctionSamples[BF] = Samples;
    TotalSamples += Samples;
  }

  // Compute 'hotness' of the pages
  std::unordered_map<uint64_t, double> PageSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (BF->getLayout().block_empty())
      continue;
    auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
    assert(BBAddrIt != BBAddr.end());
    const uint64_t Page = BBAddrIt->second / ITLBPageSize;

    auto FunctionSamplesIt = FunctionSamples.find(BF);
    assert(FunctionSamplesIt != FunctionSamples.end());
    PageSamples[Page] += FunctionSamplesIt->second;
  }

  // Computing the expected number of misses for every function
  double Misses = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    // Skip the function if it has no samples
    auto FunctionSamplesIt = FunctionSamples.find(BF);
    assert(FunctionSamplesIt != FunctionSamples.end());
    double Samples = FunctionSamplesIt->second;
    if (BF->getLayout().block_empty() || Samples == 0.0)
      continue;

    auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
    assert(BBAddrIt != BBAddr.end());
    const uint64_t Page = BBAddrIt->second / ITLBPageSize;
    // The probability that the page is not present in the cache
    const double MissProb =
        pow(1.0 - PageSamples[Page] / TotalSamples, ITLBEntries);

    // Processing all callers of the function
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
      BinaryFunction *SrcFunction = Pair.first;

      BBAddrIt = BBAddr.find(SrcFunction->getLayout().block_front());
      assert(BBAddrIt != BBAddr.end());
      const uint64_t SrcPage = BBAddrIt->second / ITLBPageSize;
      // Is this a 'long' or a 'short' call?
      if (Page != SrcPage) {
        // This is a miss
        Misses += MissProb * Pair.second;
      }
      Samples -= Pair.second;
    }
    assert(Samples >= 0.0 && "Function samples computed incorrectly");
    // The remaining samples likely come from the jitted code
    Misses += Samples * MissProb;
  }

  return 100.0 * (1.0 - Misses / TotalSamples);
}

} // namespace

void CacheMetrics::printAll(raw_ostream &OS,
                            const std::vector<BinaryFunction *> &BFs) {
  // Stats related to hot-cold code splitting
  size_t NumFunctions = 0;
  size_t NumProfiledFunctions = 0;
  size_t NumHotFunctions = 0;
  size_t NumBlocks = 0;
  size_t NumHotBlocks = 0;

  size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t TotalCodeMaxAddr = 0;
  size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t HotCodeMaxAddr = 0;

  for (BinaryFunction *BF : BFs) {
    NumFunctions++;
    if (BF->hasProfile())
      NumProfiledFunctions++;
    if (BF->hasValidIndex())
      NumHotFunctions++;
    for (const BinaryBasicBlock &BB : *BF) {
      NumBlocks++;
      size_t BBAddrMin = BB.getOutputAddressRange().first;
      size_t BBAddrMax = BB.getOutputAddressRange().second;
      TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
      TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
      if (BF->hasValidIndex() && !BB.isCold()) {
        NumHotBlocks++;
        HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
        HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
      }
    }
  }

  OS << format(" There are %zu functions;", NumFunctions)
     << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
               100.0 * NumHotFunctions / NumFunctions)
     << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
               100.0 * NumProfiledFunctions / NumFunctions);
  OS << format(" There are %zu basic blocks;", NumBlocks)
     << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
               100.0 * NumHotBlocks / NumBlocks);

  assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
  size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
  size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;

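  // 2 << 20 bytes == 2 MiB, the typical x86-64 huge-page size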
  size_t HugePage2MB = 2 << 20;
  OS << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
               "%.2lf huge pages)\n",
               100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize,
               double(HotCodeSize) / HugePage2MB);

  // Stats related to expected cache performance
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
  extractBasicBlockInfo(BFs, BBAddr, BBSize);

  OS << " Expected i-TLB cache hit ratio: "
     << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));

  auto Stats = calcTSPScore(BFs, BBAddr, BBSize);
  OS << " TSP score: "
     << format("%.2lf%% (%zu out of %zu)\n",
               100.0 * Stats.first / std::max<uint64_t>(Stats.second, 1),
               Stats.first, Stats.second);
}
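// Illustrative call site (hypothetical; the actual invocation lives in the
// BOLT driver after function reordering):
//   CacheMetrics::printAll(llvm::outs(), SortedFunctions);
// where SortedFunctions holds the functions in their final layout order.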