1//===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the CacheMetrics class and functions for showing metrics
10// of cache lines.
11//
12//===----------------------------------------------------------------------===//
13
14#include "bolt/Passes/CacheMetrics.h"
15#include "bolt/Core/BinaryBasicBlock.h"
16#include "bolt/Core/BinaryFunction.h"
17#include <unordered_map>
18
19using namespace llvm;
20using namespace bolt;
21
22namespace {
23
/// The following constants are used to estimate the number of i-TLB cache
/// misses for a given code layout. Empirically the values result in high
/// correlations between the estimations and the perf measurements.
/// The constants do not affect the code layout algorithms.
// Size (in bytes) of one i-TLB page used when bucketing functions into pages.
constexpr unsigned ITLBPageSize = 4096;
// Modeled number of i-TLB entries; used as the exponent in the miss-probability
// estimate in expectedCacheHitRatio().
constexpr unsigned ITLBEntries = 16;
30
31/// Initialize and return a position map for binary basic blocks
32void extractBasicBlockInfo(
33 const std::vector<BinaryFunction *> &BinaryFunctions,
34 std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
35 std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
36
37 for (BinaryFunction *BF : BinaryFunctions) {
38 const BinaryContext &BC = BF->getBinaryContext();
39 for (BinaryBasicBlock &BB : *BF) {
40 if (BF->isSimple() || BC.HasRelocations) {
41 // Use addresses/sizes as in the output binary
42 BBAddr[&BB] = BB.getOutputAddressRange().first;
43 BBSize[&BB] = BB.getOutputSize();
44 } else {
45 // Output ranges should match the input if the body hasn't changed
46 BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress();
47 BBSize[&BB] = BB.getOriginalSize();
48 }
49 }
50 }
51}
52
53/// Calculate TSP metric, which quantifies the number of fallthrough jumps in
54/// the ordering of basic blocks. The method returns a pair
55/// (the number of fallthrough branches, the total number of branches)
56std::pair<uint64_t, uint64_t>
57calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
58 const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
59 const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
60 uint64_t Score = 0;
61 uint64_t JumpCount = 0;
62 for (BinaryFunction *BF : BinaryFunctions) {
63 if (!BF->hasProfile())
64 continue;
65 for (BinaryBasicBlock *SrcBB : BF->getLayout().blocks()) {
66 auto BI = SrcBB->branch_info_begin();
67 for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
68 if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
69 JumpCount += BI->Count;
70 if (BBAddr.at(k: SrcBB) + BBSize.at(k: SrcBB) == BBAddr.at(k: DstBB))
71 Score += BI->Count;
72 }
73 ++BI;
74 }
75 }
76 }
77 return std::make_pair(x&: Score, y&: JumpCount);
78}
79
/// For a callee, the list of (caller function, call count) pairs.
using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;
81
82/// Build a simplified version of the call graph: For every function, keep
83/// its callers and the frequencies of the calls
84std::unordered_map<const BinaryFunction *, Predecessors>
85extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
86 std::unordered_map<const BinaryFunction *, Predecessors> Calls;
87
88 for (BinaryFunction *SrcFunction : BinaryFunctions) {
89 const BinaryContext &BC = SrcFunction->getBinaryContext();
90 for (const BinaryBasicBlock *BB : SrcFunction->getLayout().blocks()) {
91 // Find call instructions and extract target symbols from each one
92 for (const MCInst &Inst : *BB) {
93 if (!BC.MIB->isCall(Inst))
94 continue;
95
96 // Call info
97 const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
98 uint64_t Count = BB->getKnownExecutionCount();
99 // Ignore calls w/o information
100 if (DstSym == nullptr || Count == 0)
101 continue;
102
103 const BinaryFunction *DstFunction = BC.getFunctionForSymbol(Symbol: DstSym);
104 // Ignore recursive calls
105 if (DstFunction == nullptr || DstFunction->getLayout().block_empty() ||
106 DstFunction == SrcFunction)
107 continue;
108
109 // Record the call
110 Calls[DstFunction].emplace_back(args&: SrcFunction, args&: Count);
111 }
112 }
113 }
114 return Calls;
115}
116
117/// Compute expected hit ratio of the i-TLB cache (optimized by HFSortPlus alg).
118/// Given an assignment of functions to the i-TLB pages), we divide all
119/// functions calls into two categories:
120/// - 'short' ones that have a caller-callee distance less than a page;
121/// - 'long' ones where the distance exceeds a page.
122/// The short calls are likely to result in a i-TLB cache hit. For the long
123/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
124/// often the page is accessed). Assuming that functions are sent to the i-TLB
125/// cache in a random order, the probability that a page is present in the cache
126/// is proportional to the number of samples corresponding to the functions on
127/// the page. The following procedure detects short and long calls, and
128/// estimates the expected number of cache misses for the long ones.
129double expectedCacheHitRatio(
130 const std::vector<BinaryFunction *> &BinaryFunctions,
131 const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
132 const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
133 std::unordered_map<const BinaryFunction *, Predecessors> Calls =
134 extractFunctionCalls(BinaryFunctions);
135 // Compute 'hotness' of the functions
136 double TotalSamples = 0;
137 std::unordered_map<BinaryFunction *, double> FunctionSamples;
138 for (BinaryFunction *BF : BinaryFunctions) {
139 double Samples = 0;
140 for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
141 Samples += Pair.second;
142 Samples = std::max(a: Samples, b: (double)BF->getKnownExecutionCount());
143 FunctionSamples[BF] = Samples;
144 TotalSamples += Samples;
145 }
146
147 // Compute 'hotness' of the pages
148 std::unordered_map<uint64_t, double> PageSamples;
149 for (BinaryFunction *BF : BinaryFunctions) {
150 if (BF->getLayout().block_empty())
151 continue;
152 const uint64_t Page =
153 BBAddr.at(k: BF->getLayout().block_front()) / ITLBPageSize;
154 PageSamples[Page] += FunctionSamples.at(k: BF);
155 }
156
157 // Computing the expected number of misses for every function
158 double Misses = 0;
159 for (BinaryFunction *BF : BinaryFunctions) {
160 // Skip the function if it has no samples
161 if (BF->getLayout().block_empty() || FunctionSamples.at(k: BF) == 0.0)
162 continue;
163 double Samples = FunctionSamples.at(k: BF);
164 const uint64_t Page =
165 BBAddr.at(k: BF->getLayout().block_front()) / ITLBPageSize;
166 // The probability that the page is not present in the cache
167 const double MissProb =
168 pow(x: 1.0 - PageSamples[Page] / TotalSamples, y: ITLBEntries);
169
170 // Processing all callers of the function
171 for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
172 BinaryFunction *SrcFunction = Pair.first;
173 const uint64_t SrcPage =
174 BBAddr.at(k: SrcFunction->getLayout().block_front()) / ITLBPageSize;
175 // Is this a 'long' or a 'short' call?
176 if (Page != SrcPage) {
177 // This is a miss
178 Misses += MissProb * Pair.second;
179 }
180 Samples -= Pair.second;
181 }
182 assert(Samples >= 0.0 && "Function samples computed incorrectly");
183 // The remaining samples likely come from the jitted code
184 Misses += Samples * MissProb;
185 }
186
187 return 100.0 * (1.0 - Misses / TotalSamples);
188}
189
190} // namespace
191
192void CacheMetrics::printAll(raw_ostream &OS,
193 const std::vector<BinaryFunction *> &BFs) {
194 // Stats related to hot-cold code splitting
195 size_t NumFunctions = 0;
196 size_t NumProfiledFunctions = 0;
197 size_t NumHotFunctions = 0;
198 size_t NumBlocks = 0;
199 size_t NumHotBlocks = 0;
200
201 size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
202 size_t TotalCodeMaxAddr = 0;
203 size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
204 size_t HotCodeMaxAddr = 0;
205
206 for (BinaryFunction *BF : BFs) {
207 NumFunctions++;
208 if (BF->hasProfile())
209 NumProfiledFunctions++;
210 if (BF->hasValidIndex())
211 NumHotFunctions++;
212 for (const BinaryBasicBlock &BB : *BF) {
213 NumBlocks++;
214 size_t BBAddrMin = BB.getOutputAddressRange().first;
215 size_t BBAddrMax = BB.getOutputAddressRange().second;
216 TotalCodeMinAddr = std::min(a: TotalCodeMinAddr, b: BBAddrMin);
217 TotalCodeMaxAddr = std::max(a: TotalCodeMaxAddr, b: BBAddrMax);
218 if (BF->hasValidIndex() && !BB.isCold()) {
219 NumHotBlocks++;
220 HotCodeMinAddr = std::min(a: HotCodeMinAddr, b: BBAddrMin);
221 HotCodeMaxAddr = std::max(a: HotCodeMaxAddr, b: BBAddrMax);
222 }
223 }
224 }
225
226 OS << format(Fmt: " There are %zu functions;", Vals: NumFunctions)
227 << format(Fmt: " %zu (%.2lf%%) are in the hot section,", Vals: NumHotFunctions,
228 Vals: 100.0 * NumHotFunctions / NumFunctions)
229 << format(Fmt: " %zu (%.2lf%%) have profile\n", Vals: NumProfiledFunctions,
230 Vals: 100.0 * NumProfiledFunctions / NumFunctions);
231 OS << format(Fmt: " There are %zu basic blocks;", Vals: NumBlocks)
232 << format(Fmt: " %zu (%.2lf%%) are in the hot section\n", Vals: NumHotBlocks,
233 Vals: 100.0 * NumHotBlocks / NumBlocks);
234
235 assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
236 size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
237 size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;
238
239 size_t HugePage2MB = 2 << 20;
240 OS << format(Fmt: " Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
241 "%.2lf huge pages)\n",
242 Vals: 100.0 * HotCodeSize / TotalCodeSize, Vals: HotCodeSize, Vals: TotalCodeSize,
243 Vals: double(HotCodeSize) / HugePage2MB);
244
245 // Stats related to expected cache performance
246 std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
247 std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
248 extractBasicBlockInfo(BinaryFunctions: BFs, BBAddr, BBSize);
249
250 OS << " Expected i-TLB cache hit ratio: "
251 << format(Fmt: "%.2lf%%\n", Vals: expectedCacheHitRatio(BinaryFunctions: BFs, BBAddr, BBSize));
252
253 auto Stats = calcTSPScore(BinaryFunctions: BFs, BBAddr, BBSize);
254 OS << " TSP score: "
255 << format(Fmt: "%.2lf%% (%zu out of %zu)\n",
256 Vals: 100.0 * Stats.first / std::max<uint64_t>(a: Stats.second, b: 1),
257 Vals: Stats.first, Vals: Stats.second);
258}
259

source code of bolt/lib/Passes/CacheMetrics.cpp