//===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the CacheMetrics class and functions for showing metrics
// of cache lines.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include <unordered_map>

using namespace llvm;
using namespace bolt;

namespace {

/// The following constants are used to estimate the number of i-TLB cache
/// misses for a given code layout. Empirically the values result in high
/// correlations between the estimations and the perf measurements.
/// The constants do not affect the code layout algorithms.
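/// ITLBPageSize corresponds to a standard 4KB instruction page; ITLBEntries is
/// a rough approximation of the capacity of a small i-TLB.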
constexpr unsigned ITLBPageSize = 4096;
constexpr unsigned ITLBEntries = 16;

/// Initialize the address and size maps for binary basic blocks
void extractBasicBlockInfo(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  for (BinaryFunction *BF : BinaryFunctions) {
    const BinaryContext &BC = BF->getBinaryContext();
    for (BinaryBasicBlock &BB : *BF) {
      if (BF->isSimple() || BC.HasRelocations) {
        // Use addresses/sizes as in the output binary
        BBAddr[&BB] = BB.getOutputAddressRange().first;
        BBSize[&BB] = BB.getOutputSize();
      } else {
        // Output ranges should match the input if the body hasn't changed
        BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress();
        BBSize[&BB] = BB.getOriginalSize();
      }
    }
  }
}

/// Calculate TSP metric, which quantifies the number of fallthrough jumps in
/// the ordering of basic blocks. The method returns a pair
/// (the number of fallthrough branches, the total number of branches)
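/// A layout in which every profiled jump falls through would make the two
/// numbers equal.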
std::pair<uint64_t, uint64_t>
calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
  uint64_t Score = 0;
  uint64_t JumpCount = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (!BF->hasProfile())
      continue;
    for (BinaryBasicBlock *SrcBB : BF->getLayout().blocks()) {
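      // Branch info entries parallel the successor list; BI is advanced in
      // lockstep with the successor loop below.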
      auto BI = SrcBB->branch_info_begin();
      for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
        if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
          JumpCount += BI->Count;

          auto BBAddrIt = BBAddr.find(SrcBB);
          assert(BBAddrIt != BBAddr.end());
          uint64_t SrcBBAddr = BBAddrIt->second;

          auto BBSizeIt = BBSize.find(SrcBB);
          assert(BBSizeIt != BBSize.end());
          uint64_t SrcBBSize = BBSizeIt->second;

          BBAddrIt = BBAddr.find(DstBB);
          assert(BBAddrIt != BBAddr.end());
          uint64_t DstBBAddr = BBAddrIt->second;

          if (SrcBBAddr + SrcBBSize == DstBBAddr)
            Score += BI->Count;
        }
        ++BI;
      }
    }
  }
  return std::make_pair(Score, JumpCount);
}

using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;

/// Build a simplified version of the call graph: For every function, keep
/// its callers and the frequencies of the calls
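/// The resulting map is keyed by the callee; each entry lists
/// (caller, call count) pairs.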
std::unordered_map<const BinaryFunction *, Predecessors>
extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
  std::unordered_map<const BinaryFunction *, Predecessors> Calls;

  for (BinaryFunction *SrcFunction : BinaryFunctions) {
    const BinaryContext &BC = SrcFunction->getBinaryContext();
    for (const BinaryBasicBlock *BB : SrcFunction->getLayout().blocks()) {
      // Find call instructions and extract target symbols from each one
      for (const MCInst &Inst : *BB) {
        if (!BC.MIB->isCall(Inst))
          continue;

        // Call info
        const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
        uint64_t Count = BB->getKnownExecutionCount();
        // Ignore calls w/o information
        if (DstSym == nullptr || Count == 0)
          continue;

        const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
        // Ignore calls to unknown or empty functions, and recursive calls
        if (DstFunction == nullptr || DstFunction->getLayout().block_empty() ||
            DstFunction == SrcFunction)
          continue;

        // Record the call
        Calls[DstFunction].emplace_back(SrcFunction, Count);
      }
    }
  }
  return Calls;
}

/// Compute expected hit ratio of the i-TLB cache (optimized by HFSortPlus alg).
/// Given an assignment of functions to the i-TLB pages, we divide all
/// function calls into two categories:
/// - 'short' ones that have a caller-callee distance less than a page;
/// - 'long' ones where the distance exceeds a page.
/// The short calls are likely to result in an i-TLB cache hit. For the long
/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
/// often the page is accessed). Assuming that functions are sent to the i-TLB
/// cache in a random order, the probability that a page is present in the cache
/// is proportional to the number of samples corresponding to the functions on
/// the page. The following procedure detects short and long calls, and
/// estimates the expected number of cache misses for the long ones.
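/// Concretely, a 'long' call into page P is counted as a miss with probability
/// (1 - PageSamples[P] / TotalSamples) ^ ITLBEntries.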
double expectedCacheHitRatio(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
  std::unordered_map<const BinaryFunction *, Predecessors> Calls =
      extractFunctionCalls(BinaryFunctions);
  // Compute 'hotness' of the functions
  double TotalSamples = 0;
  std::unordered_map<BinaryFunction *, double> FunctionSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    double Samples = 0;
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
      Samples += Pair.second;
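    // A function is at least as hot as its own execution count, even if the
    // profile recorded fewer incoming calls.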
    Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
    FunctionSamples[BF] = Samples;
    TotalSamples += Samples;
  }

  // Compute 'hotness' of the pages
  std::unordered_map<uint64_t, double> PageSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (BF->getLayout().block_empty())
      continue;
    auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
    assert(BBAddrIt != BBAddr.end());
    const uint64_t Page = BBAddrIt->second / ITLBPageSize;

    auto FunctionSamplesIt = FunctionSamples.find(BF);
    assert(FunctionSamplesIt != FunctionSamples.end());
    PageSamples[Page] += FunctionSamplesIt->second;
  }

  // Computing the expected number of misses for every function
  double Misses = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    // Skip the function if it has no samples
    auto FunctionSamplesIt = FunctionSamples.find(BF);
    assert(FunctionSamplesIt != FunctionSamples.end());
    double Samples = FunctionSamplesIt->second;
    if (BF->getLayout().block_empty() || Samples == 0.0)
      continue;

    auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
    assert(BBAddrIt != BBAddr.end());
    const uint64_t Page = BBAddrIt->second / ITLBPageSize;
    // The probability that the page is not present in the cache
    const double MissProb =
        pow(1.0 - PageSamples[Page] / TotalSamples, ITLBEntries);

    // Processing all callers of the function
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
      BinaryFunction *SrcFunction = Pair.first;

      BBAddrIt = BBAddr.find(SrcFunction->getLayout().block_front());
      assert(BBAddrIt != BBAddr.end());
      const uint64_t SrcPage = BBAddrIt->second / ITLBPageSize;
      // Is this a 'long' or a 'short' call?
      if (Page != SrcPage) {
        // This is a miss
        Misses += MissProb * Pair.second;
      }
      Samples -= Pair.second;
    }
    assert(Samples >= 0.0 && "Function samples computed incorrectly");
    // The remaining samples likely come from the jitted code
    Misses += Samples * MissProb;
  }

  return 100.0 * (1.0 - Misses / TotalSamples);
}

} // namespace

void CacheMetrics::printAll(raw_ostream &OS,
                            const std::vector<BinaryFunction *> &BFs) {
  // Stats related to hot-cold code splitting
  size_t NumFunctions = 0;
  size_t NumProfiledFunctions = 0;
  size_t NumHotFunctions = 0;
  size_t NumBlocks = 0;
  size_t NumHotBlocks = 0;

  size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t TotalCodeMaxAddr = 0;
  size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t HotCodeMaxAddr = 0;

  for (BinaryFunction *BF : BFs) {
    NumFunctions++;
    if (BF->hasProfile())
      NumProfiledFunctions++;
    if (BF->hasValidIndex())
      NumHotFunctions++;
    for (const BinaryBasicBlock &BB : *BF) {
      NumBlocks++;
      size_t BBAddrMin = BB.getOutputAddressRange().first;
      size_t BBAddrMax = BB.getOutputAddressRange().second;
      TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
      TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
      if (BF->hasValidIndex() && !BB.isCold()) {
        NumHotBlocks++;
        HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
        HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
      }
    }
  }

  OS << format(" There are %zu functions;", NumFunctions)
     << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
               100.0 * NumHotFunctions / NumFunctions)
     << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
               100.0 * NumProfiledFunctions / NumFunctions);
  OS << format(" There are %zu basic blocks;", NumBlocks)
     << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
               100.0 * NumHotBlocks / NumBlocks);

  assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
  size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
  size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;

  size_t HugePage2MB = 2 << 20;
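  // The huge-page figure shows how many 2MB pages the hot code would span.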
  OS << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
               "%.2lf huge pages)\n",
               100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize,
               double(HotCodeSize) / HugePage2MB);

  // Stats related to expected cache performance
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
  extractBasicBlockInfo(BFs, BBAddr, BBSize);

  OS << " Expected i-TLB cache hit ratio: "
     << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));

  auto Stats = calcTSPScore(BFs, BBAddr, BBSize);
  OS << " TSP score: "
     << format("%.2lf%% (%zu out of %zu)\n",
               100.0 * Stats.first / std::max<uint64_t>(Stats.second, 1),
               Stats.first, Stats.second);
}