1//===- bolt/Passes/IndirectCallPromotion.cpp ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the IndirectCallPromotion class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "bolt/Passes/IndirectCallPromotion.h"
14#include "bolt/Passes/BinaryFunctionCallGraph.h"
15#include "bolt/Passes/DataflowInfoManager.h"
16#include "bolt/Passes/Inliner.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/Support/CommandLine.h"
19#include <iterator>
20
21#define DEBUG_TYPE "ICP"
22#define DEBUG_VERBOSE(Level, X) \
23 if (opts::Verbosity >= (Level)) { \
24 X; \
25 }
26
27using namespace llvm;
28using namespace bolt;
29
30namespace opts {
31
32extern cl::OptionCategory BoltOptCategory;
33
34extern cl::opt<IndirectCallPromotionType> ICP;
35extern cl::opt<unsigned> Verbosity;
36extern cl::opt<unsigned> ExecutionCountThreshold;
37
38static cl::opt<unsigned> ICPJTRemainingPercentThreshold(
39 "icp-jt-remaining-percent-threshold",
40 cl::desc("The percentage threshold against remaining unpromoted indirect "
41 "call count for the promotion for jump tables"),
42 cl::init(Val: 30), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
43
44static cl::opt<unsigned> ICPJTTotalPercentThreshold(
45 "icp-jt-total-percent-threshold",
46 cl::desc(
47 "The percentage threshold against total count for the promotion for "
48 "jump tables"),
49 cl::init(Val: 5), cl::Hidden, cl::cat(BoltOptCategory));
50
51static cl::opt<unsigned> ICPCallsRemainingPercentThreshold(
52 "icp-calls-remaining-percent-threshold",
53 cl::desc("The percentage threshold against remaining unpromoted indirect "
54 "call count for the promotion for calls"),
55 cl::init(Val: 50), cl::Hidden, cl::cat(BoltOptCategory));
56
57static cl::opt<unsigned> ICPCallsTotalPercentThreshold(
58 "icp-calls-total-percent-threshold",
59 cl::desc(
60 "The percentage threshold against total count for the promotion for "
61 "calls"),
62 cl::init(Val: 30), cl::Hidden, cl::cat(BoltOptCategory));
63
64static cl::opt<unsigned> ICPMispredictThreshold(
65 "indirect-call-promotion-mispredict-threshold",
66 cl::desc("misprediction threshold for skipping ICP on an "
67 "indirect call"),
68 cl::init(Val: 0), cl::cat(BoltOptCategory));
69
70static cl::alias ICPMispredictThresholdAlias(
71 "icp-mp-threshold",
72 cl::desc("alias for --indirect-call-promotion-mispredict-threshold"),
73 cl::aliasopt(ICPMispredictThreshold));
74
75static cl::opt<bool> ICPUseMispredicts(
76 "indirect-call-promotion-use-mispredicts",
77 cl::desc("use misprediction frequency for determining whether or not ICP "
78 "should be applied at a callsite. The "
79 "-indirect-call-promotion-mispredict-threshold value will be used "
80 "by this heuristic"),
81 cl::cat(BoltOptCategory));
82
83static cl::alias ICPUseMispredictsAlias(
84 "icp-use-mp",
85 cl::desc("alias for --indirect-call-promotion-use-mispredicts"),
86 cl::aliasopt(ICPUseMispredicts));
87
88static cl::opt<unsigned>
89 ICPTopN("indirect-call-promotion-topn",
90 cl::desc("limit number of targets to consider when doing indirect "
91 "call promotion. 0 = no limit"),
92 cl::init(Val: 3), cl::cat(BoltOptCategory));
93
94static cl::alias
95 ICPTopNAlias("icp-topn",
96 cl::desc("alias for --indirect-call-promotion-topn"),
97 cl::aliasopt(ICPTopN));
98
99static cl::opt<unsigned> ICPCallsTopN(
100 "indirect-call-promotion-calls-topn",
101 cl::desc("limit number of targets to consider when doing indirect "
102 "call promotion on calls. 0 = no limit"),
103 cl::init(Val: 0), cl::cat(BoltOptCategory));
104
105static cl::alias ICPCallsTopNAlias(
106 "icp-calls-topn",
107 cl::desc("alias for --indirect-call-promotion-calls-topn"),
108 cl::aliasopt(ICPCallsTopN));
109
110static cl::opt<unsigned> ICPJumpTablesTopN(
111 "indirect-call-promotion-jump-tables-topn",
112 cl::desc("limit number of targets to consider when doing indirect "
113 "call promotion on jump tables. 0 = no limit"),
114 cl::init(Val: 0), cl::cat(BoltOptCategory));
115
116static cl::alias ICPJumpTablesTopNAlias(
117 "icp-jt-topn",
118 cl::desc("alias for --indirect-call-promotion-jump-tables-topn"),
119 cl::aliasopt(ICPJumpTablesTopN));
120
121static cl::opt<bool> EliminateLoads(
122 "icp-eliminate-loads",
123 cl::desc("enable load elimination using memory profiling data when "
124 "performing ICP"),
125 cl::init(Val: true), cl::cat(BoltOptCategory));
126
127static cl::opt<unsigned> ICPTopCallsites(
128 "icp-top-callsites",
129 cl::desc("optimize hottest calls until at least this percentage of all "
130 "indirect calls frequency is covered. 0 = all callsites"),
131 cl::init(Val: 99), cl::Hidden, cl::cat(BoltOptCategory));
132
133static cl::list<std::string>
134 ICPFuncsList("icp-funcs", cl::CommaSeparated,
135 cl::desc("list of functions to enable ICP for"),
136 cl::value_desc("func1,func2,func3,..."), cl::Hidden,
137 cl::cat(BoltOptCategory));
138
139static cl::opt<bool>
140 ICPOldCodeSequence("icp-old-code-sequence",
141 cl::desc("use old code sequence for promoted calls"),
142 cl::Hidden, cl::cat(BoltOptCategory));
143
144static cl::opt<bool> ICPJumpTablesByTarget(
145 "icp-jump-tables-targets",
146 cl::desc(
147 "for jump tables, optimize indirect jmp targets instead of indices"),
148 cl::Hidden, cl::cat(BoltOptCategory));
149
150static cl::alias
151 ICPJumpTablesByTargetAlias("icp-jt-targets",
152 cl::desc("alias for --icp-jump-tables-targets"),
153 cl::aliasopt(ICPJumpTablesByTarget));
154
155static cl::opt<bool> ICPPeelForInline(
156 "icp-inline", cl::desc("only promote call targets eligible for inlining"),
157 cl::Hidden, cl::cat(BoltOptCategory));
158
159} // namespace opts
160
#ifndef NDEBUG
/// Sanity-check CFG profile data after ICP. For every "simple" function,
/// a successor edge carrying a positive recorded branch count must connect
/// two blocks that both have non-zero known execution counts; otherwise a
/// BOLT-WARNING is printed for the function. Returns true iff all functions
/// pass the check.
static bool verifyProfile(std::map<uint64_t, BinaryFunction> &BFs) {
  bool IsValid = true;
  for (auto &BFI : BFs) {
    BinaryFunction &BF = BFI.second;
    if (!BF.isSimple())
      continue;
    for (const BinaryBasicBlock &BB : BF) {
      // Branch-info entries run parallel to the successor list; BI is
      // advanced in lockstep at the bottom of the successor loop.
      auto BI = BB.branch_info_begin();
      for (BinaryBasicBlock *SuccBB : BB.successors()) {
        if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
          // A counted edge out of (or into) a never-executed block is
          // inconsistent profile data.
          if (BB.getKnownExecutionCount() == 0 ||
              SuccBB->getKnownExecutionCount() == 0) {
            BF.getBinaryContext().errs()
                << "BOLT-WARNING: profile verification failed after ICP for "
                   "function "
                << BF << '\n';
            IsValid = false;
          }
        }
        ++BI;
      }
    }
  }
  return IsValid;
}
#endif
188
189namespace llvm {
190namespace bolt {
191
192IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
193 const IndirectCallProfile &ICP)
194 : From(BF.getSymbol()), To(ICP.Offset), Mispreds(ICP.Mispreds),
195 Branches(ICP.Count) {
196 if (ICP.Symbol) {
197 To.Sym = ICP.Symbol;
198 To.Addr = 0;
199 }
200}
201
202void IndirectCallPromotion::printDecision(
203 llvm::raw_ostream &OS,
204 std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
205 uint64_t TotalCount = 0;
206 uint64_t TotalMispreds = 0;
207 for (const Callsite &S : Targets) {
208 TotalCount += S.Branches;
209 TotalMispreds += S.Mispreds;
210 }
211 if (!TotalCount)
212 TotalCount = 1;
213 if (!TotalMispreds)
214 TotalMispreds = 1;
215
216 OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
217 << " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
218 << "\n";
219
220 size_t I = 0;
221 for (const Callsite &S : Targets) {
222 OS << "Count = " << S.Branches << ", "
223 << format(Fmt: "%.1f", Vals: (100.0 * S.Branches) / TotalCount) << ", "
224 << "Mispreds = " << S.Mispreds << ", "
225 << format(Fmt: "%.1f", Vals: (100.0 * S.Mispreds) / TotalMispreds);
226 if (I < N)
227 OS << " * to be optimized *";
228 if (!S.JTIndices.empty()) {
229 OS << " Indices:";
230 for (const uint64_t Idx : S.JTIndices)
231 OS << " " << Idx;
232 }
233 OS << "\n";
234 I += S.JTIndices.empty() ? 1 : S.JTIndices.size();
235 }
236}
237
238// Get list of targets for a given call sorted by most frequently
239// called first.
240std::vector<IndirectCallPromotion::Callsite>
241IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
242 const MCInst &Inst) const {
243 BinaryFunction &BF = *BB.getFunction();
244 const BinaryContext &BC = BF.getBinaryContext();
245 std::vector<Callsite> Targets;
246
247 if (const JumpTable *JT = BF.getJumpTable(Inst)) {
248 // Don't support PIC jump tables for now
249 if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC)
250 return Targets;
251 const Location From(BF.getSymbol());
252 const std::pair<size_t, size_t> Range =
253 JT->getEntriesForAddress(Addr: BC.MIB->getJumpTable(Inst));
254 assert(JT->Counts.empty() || JT->Counts.size() >= Range.second);
255 JumpTable::JumpInfo DefaultJI;
256 const JumpTable::JumpInfo *JI =
257 JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first];
258 const size_t JIAdj = JT->Counts.empty() ? 0 : 1;
259 assert(JT->Type == JumpTable::JTT_PIC ||
260 JT->EntrySize == BC.AsmInfo->getCodePointerSize());
261 for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) {
262 MCSymbol *Entry = JT->Entries[I];
263 const BinaryBasicBlock *ToBB = BF.getBasicBlockForLabel(Label: Entry);
264 assert(ToBB || Entry == BF.getFunctionEndLabel() ||
265 Entry == BF.getFunctionEndLabel(FragmentNum::cold()));
266 if (Entry == BF.getFunctionEndLabel() ||
267 Entry == BF.getFunctionEndLabel(Fragment: FragmentNum::cold()))
268 continue;
269 const Location To(Entry);
270 const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(Succ: *ToBB);
271 Targets.emplace_back(args: From, args: To, args: BI.MispredictedCount, args: BI.Count,
272 args: I - Range.first);
273 }
274
275 // Sort by symbol then addr.
276 llvm::sort(C&: Targets, Comp: [](const Callsite &A, const Callsite &B) {
277 if (A.To.Sym && B.To.Sym)
278 return A.To.Sym < B.To.Sym;
279 else if (A.To.Sym && !B.To.Sym)
280 return true;
281 else if (!A.To.Sym && B.To.Sym)
282 return false;
283 else
284 return A.To.Addr < B.To.Addr;
285 });
286
287 // Targets may contain multiple entries to the same target, but using
288 // different indices. Their profile will report the same number of branches
289 // for different indices if the target is the same. That's because we don't
290 // profile the index value, but only the target via LBR.
291 auto First = Targets.begin();
292 auto Last = Targets.end();
293 auto Result = First;
294 while (++First != Last) {
295 Callsite &A = *Result;
296 const Callsite &B = *First;
297 if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym)
298 A.JTIndices.insert(position: A.JTIndices.end(), first: B.JTIndices.begin(),
299 last: B.JTIndices.end());
300 else
301 *(++Result) = *First;
302 }
303 ++Result;
304
305 LLVM_DEBUG(if (Targets.end() - Result > 0) {
306 dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result)
307 << " duplicate targets removed\n";
308 });
309
310 Targets.erase(first: Result, last: Targets.end());
311 } else {
312 // Don't try to optimize PC relative indirect calls.
313 if (Inst.getOperand(i: 0).isReg() &&
314 Inst.getOperand(i: 0).getReg() == BC.MRI->getProgramCounter())
315 return Targets;
316
317 const auto ICSP = BC.MIB->tryGetAnnotationAs<IndirectCallSiteProfile>(
318 Inst, Name: "CallProfile");
319 if (ICSP) {
320 for (const IndirectCallProfile &CSP : ICSP.get()) {
321 Callsite Site(BF, CSP);
322 if (Site.isValid())
323 Targets.emplace_back(args: std::move(Site));
324 }
325 }
326 }
327
328 // Sort by target count, number of indices in case of jump table, and
329 // mispredicts. We prioritize targets with high count, small number of indices
330 // and high mispredicts. Break ties by selecting targets with lower addresses.
331 llvm::stable_sort(Range&: Targets, C: [](const Callsite &A, const Callsite &B) {
332 if (A.Branches != B.Branches)
333 return A.Branches > B.Branches;
334 if (A.JTIndices.size() != B.JTIndices.size())
335 return A.JTIndices.size() < B.JTIndices.size();
336 if (A.Mispreds != B.Mispreds)
337 return A.Mispreds > B.Mispreds;
338 return A.To.Addr < B.To.Addr;
339 });
340
341 // Remove non-symbol targets
342 llvm::erase_if(C&: Targets, P: [](const Callsite &CS) { return !CS.To.Sym; });
343
344 LLVM_DEBUG(if (BF.getJumpTable(Inst)) {
345 uint64_t TotalCount = 0;
346 uint64_t TotalMispreds = 0;
347 for (const Callsite &S : Targets) {
348 TotalCount += S.Branches;
349 TotalMispreds += S.Mispreds;
350 }
351 if (!TotalCount)
352 TotalCount = 1;
353 if (!TotalMispreds)
354 TotalMispreds = 1;
355
356 dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size()
357 << ", Count = " << TotalCount << ", Mispreds = " << TotalMispreds
358 << "\n";
359
360 size_t I = 0;
361 for (const Callsite &S : Targets) {
362 dbgs() << "Count[" << I << "] = " << S.Branches << ", "
363 << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
364 << "Mispreds[" << I << "] = " << S.Mispreds << ", "
365 << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds) << "\n";
366 ++I;
367 }
368 });
369
370 return Targets;
371}
372
373IndirectCallPromotion::JumpTableInfoType
374IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB,
375 MCInst &CallInst,
376 MCInst *&TargetFetchInst,
377 const JumpTable *JT) const {
378 assert(JT && "Can't get jump table addrs for non-jump tables.");
379
380 BinaryFunction &Function = *BB.getFunction();
381 BinaryContext &BC = Function.getBinaryContext();
382
383 if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
384 return JumpTableInfoType();
385
386 JumpTableInfoType HotTargets;
387 MCInst *MemLocInstr;
388 MCInst *PCRelBaseOut;
389 unsigned BaseReg, IndexReg;
390 int64_t DispValue;
391 const MCExpr *DispExpr;
392 MutableArrayRef<MCInst> Insts(&BB.front(), &CallInst);
393 const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
394 Instruction&: CallInst, Begin: Insts.begin(), End: Insts.end(), PtrSize: BC.AsmInfo->getCodePointerSize(),
395 MemLocInstr, BaseRegNum&: BaseReg, IndexRegNum&: IndexReg, DispValue, DispExpr, PCRelBaseOut);
396
397 assert(MemLocInstr && "There should always be a load for jump tables");
398 if (!MemLocInstr)
399 return JumpTableInfoType();
400
401 LLVM_DEBUG({
402 dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
403 << "jump table in " << Function << " at @ "
404 << (&CallInst - &BB.front()) << "\n"
405 << "BOLT-INFO: ICP target fetch instructions:\n";
406 BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
407 if (MemLocInstr != &CallInst)
408 BC.printInstruction(dbgs(), CallInst, 0, &Function);
409 });
410
411 DEBUG_VERBOSE(1, {
412 dbgs() << "Jmp info: Type = " << (unsigned)Type << ", "
413 << "BaseReg = " << BC.MRI->getName(BaseReg) << ", "
414 << "IndexReg = " << BC.MRI->getName(IndexReg) << ", "
415 << "DispValue = " << Twine::utohexstr(DispValue) << ", "
416 << "DispExpr = " << DispExpr << ", "
417 << "MemLocInstr = ";
418 BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
419 dbgs() << "\n";
420 });
421
422 ++TotalIndexBasedCandidates;
423
424 auto ErrorOrMemAccessProfile =
425 BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(Inst&: *MemLocInstr,
426 Name: "MemoryAccessProfile");
427 if (!ErrorOrMemAccessProfile) {
428 DEBUG_VERBOSE(1, dbgs()
429 << "BOLT-INFO: ICP no memory profiling data found\n");
430 return JumpTableInfoType();
431 }
432 MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();
433
434 uint64_t ArrayStart;
435 if (DispExpr) {
436 ErrorOr<uint64_t> DispValueOrError =
437 BC.getSymbolValue(Symbol: *BC.MIB->getTargetSymbol(Expr: DispExpr));
438 assert(DispValueOrError && "global symbol needs a value");
439 ArrayStart = *DispValueOrError;
440 } else {
441 ArrayStart = static_cast<uint64_t>(DispValue);
442 }
443
444 if (BaseReg == BC.MRI->getProgramCounter())
445 ArrayStart += Function.getAddress() + MemAccessProfile.NextInstrOffset;
446
447 // This is a map of [symbol] -> [count, index] and is used to combine indices
448 // into the jump table since there may be multiple addresses that all have the
449 // same entry.
450 std::map<MCSymbol *, std::pair<uint64_t, uint64_t>> HotTargetMap;
451 const std::pair<size_t, size_t> Range = JT->getEntriesForAddress(Addr: ArrayStart);
452
453 for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
454 size_t Index;
455 // Mem data occasionally includes nullprs, ignore them.
456 if (!AccessInfo.MemoryObject && !AccessInfo.Offset)
457 continue;
458
459 if (AccessInfo.Offset % JT->EntrySize != 0) // ignore bogus data
460 return JumpTableInfoType();
461
462 if (AccessInfo.MemoryObject) {
463 // Deal with bad/stale data
464 if (!AccessInfo.MemoryObject->getName().starts_with(
465 Prefix: "JUMP_TABLE/" + Function.getOneName().str()))
466 return JumpTableInfoType();
467 Index =
468 (AccessInfo.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize;
469 } else {
470 Index = (AccessInfo.Offset - ArrayStart) / JT->EntrySize;
471 }
472
473 // If Index is out of range it probably means the memory profiling data is
474 // wrong for this instruction, bail out.
475 if (Index >= Range.second) {
476 LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
477 << ", " << Range.second << "\n");
478 return JumpTableInfoType();
479 }
480
481 // Make sure the hot index points at a legal label corresponding to a BB,
482 // e.g. not the end of function (unreachable) label.
483 if (!Function.getBasicBlockForLabel(Label: JT->Entries[Index + Range.first])) {
484 LLVM_DEBUG({
485 dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus "
486 << "label " << JT->Entries[Index + Range.first]->getName()
487 << " in jump table:\n";
488 JT->print(dbgs());
489 dbgs() << "HotTargetMap:\n";
490 for (std::pair<MCSymbol *const, std::pair<uint64_t, uint64_t>> &HT :
491 HotTargetMap)
492 dbgs() << "BOLT-INFO: " << HT.first->getName()
493 << " = (count=" << HT.second.first
494 << ", index=" << HT.second.second << ")\n";
495 });
496 return JumpTableInfoType();
497 }
498
499 std::pair<uint64_t, uint64_t> &HotTarget =
500 HotTargetMap[JT->Entries[Index + Range.first]];
501 HotTarget.first += AccessInfo.Count;
502 HotTarget.second = Index;
503 }
504
505 llvm::copy(Range: llvm::make_second_range(c&: HotTargetMap),
506 Out: std::back_inserter(x&: HotTargets));
507
508 // Sort with highest counts first.
509 llvm::sort(C: reverse(C&: HotTargets));
510
511 LLVM_DEBUG({
512 dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
513 for (const std::pair<uint64_t, uint64_t> &Target : HotTargets)
514 dbgs() << "BOLT-INFO: Idx = " << Target.second << ", "
515 << "Count = " << Target.first << "\n";
516 });
517
518 BC.MIB->getOrCreateAnnotationAs<uint16_t>(Inst&: CallInst, Name: "JTIndexReg") = IndexReg;
519
520 TargetFetchInst = MemLocInstr;
521
522 return HotTargets;
523}
524
525IndirectCallPromotion::SymTargetsType
526IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
527 size_t &N, BinaryBasicBlock &BB,
528 MCInst &CallInst,
529 MCInst *&TargetFetchInst) const {
530 const BinaryContext &BC = BB.getFunction()->getBinaryContext();
531 const JumpTable *JT = BB.getFunction()->getJumpTable(Inst: CallInst);
532 SymTargetsType SymTargets;
533
534 if (!JT) {
535 for (size_t I = 0; I < N; ++I) {
536 assert(Targets[I].To.Sym && "All ICP targets must be to known symbols");
537 assert(Targets[I].JTIndices.empty() &&
538 "Can't have jump table indices for non-jump tables");
539 SymTargets.emplace_back(args&: Targets[I].To.Sym, args: 0);
540 }
541 return SymTargets;
542 }
543
544 // Use memory profile to select hot targets.
545 JumpTableInfoType HotTargets =
546 maybeGetHotJumpTableTargets(BB, CallInst, TargetFetchInst, JT);
547
548 auto findTargetsIndex = [&](uint64_t JTIndex) {
549 for (size_t I = 0; I < Targets.size(); ++I)
550 if (llvm::is_contained(Range&: Targets[I].JTIndices, Element: JTIndex))
551 return I;
552 LLVM_DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
553 << " table entry in " << *BB.getFunction() << "\n");
554 llvm_unreachable("Hot indices must be referred to by at least one "
555 "callsite");
556 };
557
558 if (!HotTargets.empty()) {
559 if (opts::Verbosity >= 1)
560 for (size_t I = 0; I < HotTargets.size(); ++I)
561 BC.outs() << "BOLT-INFO: HotTarget[" << I << "] = ("
562 << HotTargets[I].first << ", " << HotTargets[I].second
563 << ")\n";
564
565 // Recompute hottest targets, now discriminating which index is hot
566 // NOTE: This is a tradeoff. On one hand, we get index information. On the
567 // other hand, info coming from the memory profile is much less accurate
568 // than LBRs. So we may actually end up working with more coarse
569 // profile granularity in exchange for information about indices.
570 std::vector<Callsite> NewTargets;
571 std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
572 uint64_t TotalMemAccesses = 0;
573 for (size_t I = 0; I < HotTargets.size(); ++I) {
574 const uint64_t TargetIndex = findTargetsIndex(HotTargets[I].second);
575 ++IndicesPerTarget[Targets[TargetIndex].To.Sym];
576 TotalMemAccesses += HotTargets[I].first;
577 }
578 uint64_t RemainingMemAccesses = TotalMemAccesses;
579 const size_t TopN =
580 opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : opts::ICPTopN;
581 size_t I = 0;
582 for (; I < HotTargets.size(); ++I) {
583 const uint64_t MemAccesses = HotTargets[I].first;
584 if (100 * MemAccesses <
585 TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
586 break;
587 if (100 * MemAccesses <
588 RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
589 break;
590 if (TopN && I >= TopN)
591 break;
592 RemainingMemAccesses -= MemAccesses;
593
594 const uint64_t JTIndex = HotTargets[I].second;
595 Callsite &Target = Targets[findTargetsIndex(JTIndex)];
596
597 NewTargets.push_back(x: Target);
598 std::vector<uint64_t>({JTIndex}).swap(x&: NewTargets.back().JTIndices);
599 llvm::erase(C&: Target.JTIndices, V: JTIndex);
600
601 // Keep fixCFG counts sane if more indices use this same target later
602 assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
603 NewTargets.back().Branches =
604 Target.Branches / IndicesPerTarget[Target.To.Sym];
605 NewTargets.back().Mispreds =
606 Target.Mispreds / IndicesPerTarget[Target.To.Sym];
607 assert(Target.Branches >= NewTargets.back().Branches);
608 assert(Target.Mispreds >= NewTargets.back().Mispreds);
609 Target.Branches -= NewTargets.back().Branches;
610 Target.Mispreds -= NewTargets.back().Mispreds;
611 }
612 llvm::copy(Range&: Targets, Out: std::back_inserter(x&: NewTargets));
613 std::swap(x&: NewTargets, y&: Targets);
614 N = I;
615
616 if (N == 0 && opts::Verbosity >= 1) {
617 BC.outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
618 << BB.getName() << ": failed to meet thresholds after memory "
619 << "profile data was loaded.\n";
620 return SymTargets;
621 }
622 }
623
624 for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
625 Callsite &Target = Targets[TgtIdx];
626 assert(Target.To.Sym && "All ICP targets must be to known symbols");
627 assert(!Target.JTIndices.empty() && "Jump tables must have indices");
628 for (uint64_t Idx : Target.JTIndices) {
629 SymTargets.emplace_back(args&: Target.To.Sym, args&: Idx);
630 ++I;
631 }
632 }
633
634 return SymTargets;
635}
636
637IndirectCallPromotion::MethodInfoType IndirectCallPromotion::maybeGetVtableSyms(
638 BinaryBasicBlock &BB, MCInst &Inst,
639 const SymTargetsType &SymTargets) const {
640 BinaryFunction &Function = *BB.getFunction();
641 BinaryContext &BC = Function.getBinaryContext();
642 std::vector<std::pair<MCSymbol *, uint64_t>> VtableSyms;
643 std::vector<MCInst *> MethodFetchInsns;
644 unsigned VtableReg, MethodReg;
645 uint64_t MethodOffset;
646
647 assert(!Function.getJumpTable(Inst) &&
648 "Can't get vtable addrs for jump tables.");
649
650 if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
651 return MethodInfoType();
652
653 MutableArrayRef<MCInst> Insts(&BB.front(), &Inst + 1);
654 if (!BC.MIB->analyzeVirtualMethodCall(Begin: Insts.begin(), End: Insts.end(),
655 MethodFetchInsns, VtableRegNum&: VtableReg, BaseRegNum&: MethodReg,
656 MethodOffset)) {
657 DEBUG_VERBOSE(
658 1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in "
659 << Function << " at @ " << (&Inst - &BB.front()) << "\n");
660 return MethodInfoType();
661 }
662
663 ++TotalMethodLoadEliminationCandidates;
664
665 DEBUG_VERBOSE(1, {
666 dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function
667 << " at @ " << (&Inst - &BB.front()) << "\n";
668 dbgs() << "BOLT-INFO: ICP method fetch instructions:\n";
669 for (MCInst *Inst : MethodFetchInsns)
670 BC.printInstruction(dbgs(), *Inst, 0, &Function);
671
672 if (MethodFetchInsns.back() != &Inst)
673 BC.printInstruction(dbgs(), Inst, 0, &Function);
674 });
675
676 // Try to get value profiling data for the method load instruction.
677 auto ErrorOrMemAccessProfile =
678 BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(Inst&: *MethodFetchInsns.back(),
679 Name: "MemoryAccessProfile");
680 if (!ErrorOrMemAccessProfile) {
681 DEBUG_VERBOSE(1, dbgs()
682 << "BOLT-INFO: ICP no memory profiling data found\n");
683 return MethodInfoType();
684 }
685 MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();
686
687 // Find the vtable that each method belongs to.
688 std::map<const MCSymbol *, uint64_t> MethodToVtable;
689
690 for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
691 uint64_t Address = AccessInfo.Offset;
692 if (AccessInfo.MemoryObject)
693 Address += AccessInfo.MemoryObject->getAddress();
694
695 // Ignore bogus data.
696 if (!Address)
697 continue;
698
699 const uint64_t VtableBase = Address - MethodOffset;
700
701 DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = "
702 << Twine::utohexstr(VtableBase) << "+"
703 << MethodOffset << "/" << AccessInfo.Count << "\n");
704
705 if (ErrorOr<uint64_t> MethodAddr = BC.getPointerAtAddress(Address)) {
706 BinaryData *MethodBD = BC.getBinaryDataAtAddress(Address: MethodAddr.get());
707 if (!MethodBD) // skip unknown methods
708 continue;
709 MCSymbol *MethodSym = MethodBD->getSymbol();
710 MethodToVtable[MethodSym] = VtableBase;
711 DEBUG_VERBOSE(1, {
712 const BinaryFunction *Method = BC.getFunctionForSymbol(MethodSym);
713 dbgs() << "BOLT-INFO: ICP found method = "
714 << Twine::utohexstr(MethodAddr.get()) << "/"
715 << (Method ? Method->getPrintName() : "") << "\n";
716 });
717 }
718 }
719
720 // Find the vtable for each target symbol.
721 for (size_t I = 0; I < SymTargets.size(); ++I) {
722 auto Itr = MethodToVtable.find(x: SymTargets[I].first);
723 if (Itr != MethodToVtable.end()) {
724 if (BinaryData *BD = BC.getBinaryDataContainingAddress(Address: Itr->second)) {
725 const uint64_t Addend = Itr->second - BD->getAddress();
726 VtableSyms.emplace_back(args: BD->getSymbol(), args: Addend);
727 continue;
728 }
729 }
730 // Give up if we can't find the vtable for a method.
731 DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for "
732 << SymTargets[I].first->getName() << "\n");
733 return MethodInfoType();
734 }
735
736 // Make sure the vtable reg is not clobbered by the argument passing code
737 if (VtableReg != MethodReg) {
738 for (MCInst *CurInst = MethodFetchInsns.front(); CurInst < &Inst;
739 ++CurInst) {
740 const MCInstrDesc &InstrInfo = BC.MII->get(Opcode: CurInst->getOpcode());
741 if (InstrInfo.hasDefOfPhysReg(MI: *CurInst, Reg: VtableReg, RI: *BC.MRI))
742 return MethodInfoType();
743 }
744 }
745
746 return MethodInfoType(VtableSyms, MethodFetchInsns);
747}
748
749std::vector<std::unique_ptr<BinaryBasicBlock>>
750IndirectCallPromotion::rewriteCall(
751 BinaryBasicBlock &IndCallBlock, const MCInst &CallInst,
752 MCPlusBuilder::BlocksVectorTy &&ICPcode,
753 const std::vector<MCInst *> &MethodFetchInsns) const {
754 BinaryFunction &Function = *IndCallBlock.getFunction();
755 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
756
757 // Create new basic blocks with correct code in each one first.
758 std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
759 const bool IsTailCallOrJT =
760 (MIB->isTailCall(Inst: CallInst) || Function.getJumpTable(Inst: CallInst));
761
762 // If we are tracking the indirect call/jump address, propagate the address to
763 // the ICP code.
764 const std::optional<uint32_t> IndirectInstrOffset = MIB->getOffset(Inst: CallInst);
765 if (IndirectInstrOffset) {
766 for (auto &[Symbol, Instructions] : ICPcode)
767 for (MCInst &Inst : Instructions)
768 MIB->setOffset(Inst, Offset: *IndirectInstrOffset);
769 }
770
771 // Move instructions from the tail of the original call block
772 // to the merge block.
773
774 // Remember any pseudo instructions following a tail call. These
775 // must be preserved and moved to the original block.
776 InstructionListType TailInsts;
777 const MCInst *TailInst = &CallInst;
778 if (IsTailCallOrJT)
779 while (TailInst + 1 < &(*IndCallBlock.end()) &&
780 MIB->isPseudo(Inst: *(TailInst + 1)))
781 TailInsts.push_back(x: *++TailInst);
782
783 InstructionListType MovedInst = IndCallBlock.splitInstructions(Inst: &CallInst);
784 // Link new BBs to the original input offset of the indirect call site or its
785 // containing BB, so we can map samples recorded in new BBs back to the
786 // original BB seen in the input binary (if using BAT).
787 const uint32_t OrigOffset = IndirectInstrOffset
788 ? *IndirectInstrOffset
789 : IndCallBlock.getInputOffset();
790
791 IndCallBlock.eraseInstructions(Begin: MethodFetchInsns.begin(),
792 End: MethodFetchInsns.end());
793 if (IndCallBlock.empty() ||
794 (!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst))
795 IndCallBlock.addInstructions(Begin: ICPcode.front().second.begin(),
796 End: ICPcode.front().second.end());
797 else
798 IndCallBlock.replaceInstruction(II: std::prev(x: IndCallBlock.end()),
799 Replacement: ICPcode.front().second);
800 IndCallBlock.addInstructions(Begin: TailInsts.begin(), End: TailInsts.end());
801
802 for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) {
803 MCSymbol *&Sym = Itr->first;
804 InstructionListType &Insts = Itr->second;
805 assert(Sym);
806 std::unique_ptr<BinaryBasicBlock> TBB = Function.createBasicBlock(Label: Sym);
807 TBB->setOffset(OrigOffset);
808 for (MCInst &Inst : Insts) // sanitize new instructions.
809 if (MIB->isCall(Inst))
810 MIB->removeAnnotation(Inst, Name: "CallProfile");
811 TBB->addInstructions(Begin: Insts.begin(), End: Insts.end());
812 NewBBs.emplace_back(args: std::move(TBB));
813 }
814
815 // Move tail of instructions from after the original call to
816 // the merge block.
817 if (!IsTailCallOrJT)
818 NewBBs.back()->addInstructions(Begin: MovedInst.begin(), End: MovedInst.end());
819
820 return NewBBs;
821}
822
// Rewire the CFG around \p IndCallBlock after rewriteCall() has inserted the
// promoted-code blocks in \p NewBBs, and redistribute the profile counts from
// \p Targets over the original block, the new comparison/call blocks, and
// (for non-tail calls) a merge block that receives the tail of the original
// block. Returns the merge block, or nullptr for jump tables and tail calls
// which have no merge block.
BinaryBasicBlock *
IndirectCallPromotion::fixCFG(BinaryBasicBlock &IndCallBlock,
                              const bool IsTailCall, const bool IsJumpTable,
                              IndirectCallPromotion::BasicBlocksVector &&NewBBs,
                              const std::vector<Callsite> &Targets) const {
  BinaryFunction &Function = *IndCallBlock.getFunction();
  using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo;
  BinaryBasicBlock *MergeBlock = nullptr;

  // Scale indirect call counts to the execution count of the original
  // basic block containing the indirect call.
  uint64_t TotalCount = IndCallBlock.getKnownExecutionCount();
  uint64_t TotalIndirectBranches = 0;
  for (const Callsite &Target : Targets)
    TotalIndirectBranches += Target.Branches;
  if (TotalIndirectBranches == 0)
    TotalIndirectBranches = 1; // avoid division by zero in the scaling below
  // BBI holds the raw per-target branch info (split across a target's
  // jump-table indices); ScaledBBI holds the same counts rescaled to the
  // block's execution count.
  BinaryBasicBlock::BranchInfoType BBI;
  BinaryBasicBlock::BranchInfoType ScaledBBI;
  for (const Callsite &Target : Targets) {
    // A target reached through multiple jump-table indices contributes one
    // entry per index, with its counts divided (rounding up) among them.
    const size_t NumEntries =
        std::max(a: static_cast<std::size_t>(1UL), b: Target.JTIndices.size());
    for (size_t I = 0; I < NumEntries; ++I) {
      BBI.push_back(
          Elt: BinaryBranchInfo{.Count: (Target.Branches + NumEntries - 1) / NumEntries,
                            .MispredictedCount: (Target.Mispreds + NumEntries - 1) / NumEntries});
      ScaledBBI.push_back(
          Elt: BinaryBranchInfo{.Count: uint64_t(TotalCount * Target.Branches /
                                     (NumEntries * TotalIndirectBranches)),
                            .MispredictedCount: uint64_t(TotalCount * Target.Mispreds /
                                     (NumEntries * TotalIndirectBranches))});
    }
  }

  if (IsJumpTable) {
    // The last new block contains the remaining (unpromoted) indirect jump,
    // so it inherits all of the original block's successors.
    BinaryBasicBlock *NewIndCallBlock = NewBBs.back().get();
    IndCallBlock.moveAllSuccessorsTo(New: NewIndCallBlock);

    // One target symbol per promoted comparison, duplicated per jump-table
    // index, in the same order the comparisons were emitted.
    std::vector<MCSymbol *> SymTargets;
    for (const Callsite &Target : Targets) {
      const size_t NumEntries =
          std::max(a: static_cast<std::size_t>(1UL), b: Target.JTIndices.size());
      for (size_t I = 0; I < NumEntries; ++I)
        SymTargets.push_back(x: Target.To.Sym);
    }
    assert(SymTargets.size() > NewBBs.size() - 1 &&
           "There must be a target symbol associated with each new BB.");

    // Chain the comparison blocks: block I-1 (or the original block for
    // I == 0) branches to its promoted target when taken and falls through
    // to the next comparison block otherwise.
    for (uint64_t I = 0; I < NewBBs.size(); ++I) {
      BinaryBasicBlock *SourceBB = I ? NewBBs[I - 1].get() : &IndCallBlock;
      SourceBB->setExecutionCount(TotalCount);

      BinaryBasicBlock *TargetBB =
          Function.getBasicBlockForLabel(Label: SymTargets[I]);
      SourceBB->addSuccessor(Succ: TargetBB, BI: ScaledBBI[I]); // taken

      TotalCount -= ScaledBBI[I].Count;
      SourceBB->addSuccessor(Succ: NewBBs[I].get(), Count: TotalCount); // fall-through

      // Update branch info for the indirect jump: subtract the promoted
      // counts from the edge it still owns, saturating at zero.
      BinaryBasicBlock::BinaryBranchInfo &BranchInfo =
          NewIndCallBlock->getBranchInfo(Succ: *TargetBB);
      if (BranchInfo.Count > BBI[I].Count)
        BranchInfo.Count -= BBI[I].Count;
      else
        BranchInfo.Count = 0;

      if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount)
        BranchInfo.MispredictedCount -= BBI[I].MispredictedCount;
      else
        BranchInfo.MispredictedCount = 0;
    }
  } else {
    // For calls, NewBBs alternates promoted-call blocks (even indices) and
    // comparison blocks (odd indices), with a trailing merge block appended
    // by rewriteCall() for non-tail calls.
    assert(NewBBs.size() >= 2);
    assert(NewBBs.size() % 2 == 1 || IndCallBlock.succ_empty());
    assert(NewBBs.size() % 2 == 1 || IsTailCall);

    // Consume the next scaled entry and deduct it from the running
    // fall-through count.
    auto ScaledBI = ScaledBBI.begin();
    auto updateCurrentBranchInfo = [&] {
      assert(ScaledBI != ScaledBBI.end());
      TotalCount -= ScaledBI->Count;
      ++ScaledBI;
    };

    if (!IsTailCall) {
      MergeBlock = NewBBs.back().get();
      IndCallBlock.moveAllSuccessorsTo(New: MergeBlock);
    }

    // Fix up successors and execution counts.
    updateCurrentBranchInfo();
    IndCallBlock.addSuccessor(Succ: NewBBs[1].get(), Count: TotalCount);
    IndCallBlock.addSuccessor(Succ: NewBBs[0].get(), BI: ScaledBBI[0]);

    const size_t Adj = IsTailCall ? 1 : 2;
    for (size_t I = 0; I < NewBBs.size() - Adj; ++I) {
      assert(TotalCount <= IndCallBlock.getExecutionCount() ||
             TotalCount <= uint64_t(TotalIndirectBranches));
      uint64_t ExecCount = ScaledBBI[(I + 1) / 2].Count;
      if (I % 2 == 0) {
        // Promoted-call block: continues to the merge block if there is one.
        if (MergeBlock)
          NewBBs[I]->addSuccessor(Succ: MergeBlock, Count: ScaledBBI[(I + 1) / 2].Count);
      } else {
        // Comparison block: falls through to the next comparison block and
        // branches to the next promoted-call block when the target matches.
        assert(I + 2 < NewBBs.size());
        updateCurrentBranchInfo();
        NewBBs[I]->addSuccessor(Succ: NewBBs[I + 2].get(), Count: TotalCount);
        NewBBs[I]->addSuccessor(Succ: NewBBs[I + 1].get(), BI: ScaledBBI[(I + 1) / 2]);
        ExecCount += TotalCount;
      }
      NewBBs[I]->setExecutionCount(ExecCount);
    }

    if (MergeBlock) {
      // Arrange for the MergeBlock to be the fallthrough for the first
      // promoted call block.
      std::unique_ptr<BinaryBasicBlock> MBPtr;
      std::swap(x&: MBPtr, y&: NewBBs.back());
      NewBBs.pop_back();
      NewBBs.emplace(position: NewBBs.begin() + 1, args: std::move(MBPtr));
      // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
      NewBBs.back()->addSuccessor(Succ: MergeBlock, Count: TotalCount); // uncond branch
    }
  }

  // Update the execution count.
  NewBBs.back()->setExecutionCount(TotalCount);

  // Update BB and BB layout.
  Function.insertBasicBlocks(Start: &IndCallBlock, NewBBs: std::move(NewBBs));
  assert(Function.validateCFG());

  return MergeBlock;
}
956
// Decide whether the indirect callsite \p Inst in \p BB is profitable to
// promote, based on the profile in \p Targets and the various ICP thresholds.
// Returns the number of top targets to promote (possibly counting individual
// jump-table indices), or zero to skip the callsite. Also accumulates the
// pass-wide TotalNumFrequent{Calls,Jmps} statistics as a side effect.
size_t IndirectCallPromotion::canPromoteCallsite(
    const BinaryBasicBlock &BB, const MCInst &Inst,
    const std::vector<Callsite> &Targets, uint64_t NumCalls) {
  BinaryFunction *BF = BB.getFunction();
  const BinaryContext &BC = BF->getBinaryContext();

  // Cold blocks are never worth promoting.
  if (BB.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return 0;

  const bool IsJumpTable = BF->getJumpTable(Inst);

  // Fold the branch counts of the first N targets into the pass-wide
  // frequent-call/jump statistics.
  auto computeStats = [&](size_t N) {
    for (size_t I = 0; I < N; ++I)
      if (IsJumpTable)
        TotalNumFrequentJmps += Targets[I].Branches;
      else
        TotalNumFrequentCalls += Targets[I].Branches;
  };

  // If we have no targets (or no calls), skip this callsite.
  if (Targets.empty() || !NumCalls) {
    if (opts::Verbosity >= 1) {
      const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
      BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
                << " in " << BB.getName() << ", calls = " << NumCalls
                << ", targets empty or NumCalls == 0.\n";
    }
    return 0;
  }

  // Allow a jump-table- or call-specific override of the generic top-N
  // setting; zero means "use the generic value".
  size_t TopN = opts::ICPTopN;
  if (IsJumpTable)
    TopN = opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : TopN;
  else
    TopN = opts::ICPCallsTopN ? opts::ICPCallsTopN : TopN;

  // TopN == 0 disables the cap: consider all targets.
  const size_t TrialN = TopN ? std::min(a: TopN, b: Targets.size()) : Targets.size();

  // In top-callsites mode only callsites pre-marked with the "DoICP"
  // annotation (see runOnFunctions) qualify.
  if (opts::ICPTopCallsites && !BC.MIB->hasAnnotation(Inst, Name: "DoICP"))
    return 0;

  // Pick the top N targets.
  uint64_t TotalMispredictsTopN = 0;
  size_t N = 0;

  if (opts::ICPUseMispredicts &&
      (!IsJumpTable || opts::ICPJumpTablesByTarget)) {
    // Count total number of mispredictions for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
    double Threshold = double(opts::ICPMispredictThreshold);
    for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) {
      Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls;
      TotalMispredictsTopN += Targets[I].Mispreds;
    }
    computeStats(N);

    // Compute the misprediction frequency of the top N call targets. If this
    // frequency is greater than the threshold, we should try ICP on this
    // callsite.
    const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls;
    if (TopNFrequency == 0 || TopNFrequency < opts::ICPMispredictThreshold) {
      if (opts::Verbosity >= 1) {
        const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
        BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
                  << " in " << BB.getName() << ", calls = " << NumCalls
                  << ", top N mis. frequency " << format(Fmt: "%.1f", Vals: TopNFrequency)
                  << "% < " << opts::ICPMispredictThreshold << "%\n";
      }
      return 0;
    }
  } else {
    size_t MaxTargets = 0;

    // Count total number of calls for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
    const unsigned TotalThreshold = IsJumpTable
                                        ? opts::ICPJTTotalPercentThreshold
                                        : opts::ICPCallsTotalPercentThreshold;
    const unsigned RemainingThreshold =
        IsJumpTable ? opts::ICPJTRemainingPercentThreshold
                    : opts::ICPCallsRemainingPercentThreshold;
    uint64_t NumRemainingCalls = NumCalls;
    for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
      // Stop once a target falls below either percentage threshold
      // (relative to the total count or to the still-unpromoted count).
      if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
        break;
      if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
        break;
      // Each jump-table index consumes one of the TrialN promotion slots.
      if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
          TrialN)
        break;
      TotalMispredictsTopN += Targets[I].Mispreds;
      NumRemainingCalls -= Targets[I].Branches;
      N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
    }
    computeStats(MaxTargets);

    // Don't check misprediction frequency for jump tables -- we don't really
    // care as long as we are saving loads from the jump table.
    if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
      // Compute the misprediction frequency of the top N call targets. If
      // this frequency is less than the threshold, we should skip ICP at
      // this callsite.
      const double TopNMispredictFrequency =
          (100.0 * TotalMispredictsTopN) / NumCalls;

      if (TopNMispredictFrequency < opts::ICPMispredictThreshold) {
        if (opts::Verbosity >= 1) {
          const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
          BC.outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
                    << " in " << BB.getName() << ", calls = " << NumCalls
                    << ", top N mispredict frequency "
                    << format(Fmt: "%.1f", Vals: TopNMispredictFrequency) << "% < "
                    << opts::ICPMispredictThreshold << "%\n";
        }
        return 0;
      }
    }
  }

  // Filter by inline-ability of target functions, stop at first target that
  // can't be inlined.
  if (!IsJumpTable && opts::ICPPeelForInline) {
    for (size_t I = 0; I < N; ++I) {
      const MCSymbol *TargetSym = Targets[I].To.Sym;
      const BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol: TargetSym);
      if (!TargetBF || !BinaryFunctionPass::shouldOptimize(BF: *TargetBF) ||
          getInliningInfo(BF: *TargetBF).Type == InliningType::INL_NONE) {
        N = I;
        break;
      }
    }
  }

  // Filter functions that can have ICP applied (for debugging)
  if (!opts::ICPFuncsList.empty()) {
    for (std::string &Name : opts::ICPFuncsList)
      if (BF->hasName(FunctionName: Name))
        return N;
    return 0;
  }

  return N;
}
1102
1103void IndirectCallPromotion::printCallsiteInfo(
1104 const BinaryBasicBlock &BB, const MCInst &Inst,
1105 const std::vector<Callsite> &Targets, const size_t N,
1106 uint64_t NumCalls) const {
1107 BinaryContext &BC = BB.getFunction()->getBinaryContext();
1108 const bool IsTailCall = BC.MIB->isTailCall(Inst);
1109 const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst);
1110 const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
1111
1112 BC.outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction()
1113 << " @ " << InstIdx << " in " << BB.getName()
1114 << " -> calls = " << NumCalls
1115 << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : ""))
1116 << "\n";
1117 for (size_t I = 0; I < N; I++) {
1118 const double Frequency = 100.0 * Targets[I].Branches / NumCalls;
1119 const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls;
1120 BC.outs() << "BOLT-INFO: ";
1121 if (Targets[I].To.Sym)
1122 BC.outs() << Targets[I].To.Sym->getName();
1123 else
1124 BC.outs() << Targets[I].To.Addr;
1125 BC.outs() << ", calls = " << Targets[I].Branches
1126 << ", mispreds = " << Targets[I].Mispreds
1127 << ", taken freq = " << format(Fmt: "%.1f", Vals: Frequency) << "%"
1128 << ", mis. freq = " << format(Fmt: "%.1f", Vals: MisFrequency) << "%";
1129 bool First = true;
1130 for (uint64_t JTIndex : Targets[I].JTIndices) {
1131 BC.outs() << (First ? ", indices = " : ", ") << JTIndex;
1132 First = false;
1133 }
1134 BC.outs() << "\n";
1135 }
1136
1137 LLVM_DEBUG({
1138 dbgs() << "BOLT-INFO: ICP original call instruction:";
1139 BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true);
1140 });
1141}
1142
// Pass entry point. Selects the set of functions to process (optionally
// restricted to the hottest callsites when -icp-top-callsites is set),
// promotes each profitable indirect call / jump-table site via
// canPromoteCallsite -> findCallTargetSymbols -> rewriteCall -> fixCFG,
// and finally prints aggregate pass statistics.
Error IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
  if (opts::ICP == ICP_NONE)
    return Error::success();

  auto &BFs = BC.getBinaryFunctions();

  const bool OptimizeCalls = (opts::ICP == ICP_CALLS || opts::ICP == ICP_ALL);
  const bool OptimizeJumpTables =
      (opts::ICP == ICP_JUMP_TABLES || opts::ICP == ICP_ALL);

  // Register liveness (needed for the FLAGS check on jump tables) requires
  // a call graph; build both only when jump tables are being optimized.
  std::unique_ptr<RegAnalysis> RA;
  std::unique_ptr<BinaryFunctionCallGraph> CG;
  if (OptimizeJumpTables) {
    CG.reset(p: new BinaryFunctionCallGraph(buildCallGraph(BC)));
    RA.reset(p: new RegAnalysis(BC, &BFs, &*CG));
  }

  // If icp-top-callsites is enabled, compute the total number of indirect
  // calls and then optimize the hottest callsites that contribute to that
  // total.
  SetVector<BinaryFunction *> Functions;
  if (opts::ICPTopCallsites == 0) {
    for (auto &KV : BFs)
      Functions.insert(X: &KV.second);
  } else {
    // (count, instruction, owning function) per candidate callsite.
    using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>;
    std::vector<IndirectCallsite> IndirectCalls;
    size_t TotalIndirectCalls = 0;

    // Find all the indirect callsites.
    for (auto &BFIt : BFs) {
      BinaryFunction &Function = BFIt.second;

      if (!shouldOptimize(BF: Function))
        continue;

      const bool HasLayout = !Function.getLayout().block_empty();

      for (BinaryBasicBlock &BB : Function) {
        // Skip cold blocks of split functions.
        if (HasLayout && Function.isSplit() && BB.isCold())
          continue;

        for (MCInst &Inst : BB) {
          const bool IsJumpTable = Function.getJumpTable(Inst);
          const bool HasIndirectCallProfile =
              BC.MIB->hasAnnotation(Inst, Name: "CallProfile");
          const bool IsDirectCall =
              (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, OpNum: 0));

          if (!IsDirectCall &&
              ((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) ||
               (IsJumpTable && OptimizeJumpTables))) {
            uint64_t NumCalls = 0;
            for (const Callsite &BInfo : getCallTargets(BB, Inst))
              NumCalls += BInfo.Branches;
            IndirectCalls.push_back(
                x: std::make_tuple(args&: NumCalls, args: &Inst, args: &Function));
            TotalIndirectCalls += NumCalls;
          }
        }
      }
    }

    // Sort callsites by execution count.
    llvm::sort(C: reverse(C&: IndirectCalls));

    // Find callsites that contribute to the top "opts::ICPTopCallsites"%
    // number of calls.
    const float TopPerc = opts::ICPTopCallsites / 100.0f;
    int64_t MaxCalls = TotalIndirectCalls * TopPerc;
    uint64_t LastFreq = std::numeric_limits<uint64_t>::max();
    size_t Num = 0;
    for (const IndirectCallsite &IC : IndirectCalls) {
      const uint64_t CurFreq = std::get<0>(t: IC);
      // Once we decide to stop, include at least all branches that share the
      // same frequency of the last one to avoid non-deterministic behavior
      // (e.g. turning on/off ICP depending on the order of functions)
      if (MaxCalls <= 0 && CurFreq != LastFreq)
        break;
      MaxCalls -= CurFreq;
      LastFreq = CurFreq;
      // Mark the callsite for promotion; canPromoteCallsite checks "DoICP".
      BC.MIB->addAnnotation(Inst&: *std::get<1>(t: IC), Name: "DoICP", Val: true);
      Functions.insert(X: std::get<2>(t: IC));
      ++Num;
    }
    BC.outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
              << ", " << Num << " callsites cover " << opts::ICPTopCallsites
              << "% of all indirect calls\n";
  }

  // Main promotion loop over the selected functions.
  for (BinaryFunction *FuncPtr : Functions) {
    BinaryFunction &Function = *FuncPtr;

    if (!shouldOptimize(BF: Function))
      continue;

    const bool HasLayout = !Function.getLayout().block_empty();

    // Total number of indirect calls issued from the current Function.
    // (a fraction of TotalIndirectCalls)
    uint64_t FuncTotalIndirectCalls = 0;
    uint64_t FuncTotalIndirectJmps = 0;

    std::vector<BinaryBasicBlock *> BBs;
    for (BinaryBasicBlock &BB : Function) {
      // Skip indirect calls in cold blocks.
      if (!HasLayout || !Function.isSplit() || !BB.isCold())
        BBs.push_back(x: &BB);
    }
    if (BBs.empty())
      continue;

    DataflowInfoManager Info(Function, RA.get(), nullptr);
    // Process blocks as a worklist: promotion may produce a merge block
    // that itself contains further indirect calls (pushed below).
    while (!BBs.empty()) {
      BinaryBasicBlock *BB = BBs.back();
      BBs.pop_back();

      for (unsigned Idx = 0; Idx < BB->size(); ++Idx) {
        MCInst &Inst = BB->getInstructionAtIndex(Index: Idx);
        const ptrdiff_t InstIdx = &Inst - &(*BB->begin());
        const bool IsTailCall = BC.MIB->isTailCall(Inst);
        const bool HasIndirectCallProfile =
            BC.MIB->hasAnnotation(Inst, Name: "CallProfile");
        const bool IsJumpTable = Function.getJumpTable(Inst);

        if (BC.MIB->isCall(Inst))
          TotalCalls += BB->getKnownExecutionCount();

        // Filter out callsites the current mode does not handle.
        if (IsJumpTable && !OptimizeJumpTables)
          continue;

        if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls))
          continue;

        // Ignore direct calls.
        if (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, OpNum: 0))
          continue;

        assert((BC.MIB->isCall(Inst) || BC.MIB->isIndirectBranch(Inst)) &&
               "expected a call or an indirect jump instruction");

        if (IsJumpTable)
          ++TotalJumpTableCallsites;
        else
          ++TotalIndirectCallsites;

        std::vector<Callsite> Targets = getCallTargets(BB&: *BB, Inst);

        // Compute the total number of calls from this particular callsite.
        uint64_t NumCalls = 0;
        for (const Callsite &BInfo : Targets)
          NumCalls += BInfo.Branches;
        if (!IsJumpTable)
          FuncTotalIndirectCalls += NumCalls;
        else
          FuncTotalIndirectJmps += NumCalls;

        // If FLAGS regs is alive after this jmp site, do not try
        // promoting because we will clobber FLAGS.
        if (IsJumpTable) {
          ErrorOr<const BitVector &> State =
              Info.getLivenessAnalysis().getStateBefore(Point: Inst);
          if (!State || (State && (*State)[BC.MIB->getFlagsReg()])) {
            if (opts::Verbosity >= 1)
              BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                        << InstIdx << " in " << BB->getName()
                        << ", calls = " << NumCalls
                        << (State ? ", cannot clobber flags reg.\n"
                                  : ", no liveness data available.\n");
            continue;
          }
        }

        // Should this callsite be optimized? Return the number of targets
        // to use when promoting this call. A value of zero means to skip
        // this callsite.
        size_t N = canPromoteCallsite(BB: *BB, Inst, Targets, NumCalls);

        // If it is a jump table and it failed to meet our initial threshold,
        // proceed to findCallTargetSymbols -- it may reevaluate N if
        // memory profile is present
        if (!N && !IsJumpTable)
          continue;

        if (opts::Verbosity >= 1)
          printCallsiteInfo(BB: *BB, Inst, Targets, N, NumCalls);

        // Find MCSymbols or absolute addresses for each call target.
        MCInst *TargetFetchInst = nullptr;
        const SymTargetsType SymTargets =
            findCallTargetSymbols(Targets, N, BB&: *BB, CallInst&: Inst, TargetFetchInst);

        // findCallTargetSymbols may have changed N if mem profile is available
        // for jump tables
        if (!N)
          continue;

        LLVM_DEBUG(printDecision(dbgs(), Targets, N));

        // If we can't resolve any of the target symbols, punt on this callsite.
        // TODO: can this ever happen?
        if (SymTargets.size() < N) {
          const size_t LastTarget = SymTargets.size();
          if (opts::Verbosity >= 1)
            BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                      << InstIdx << " in " << BB->getName()
                      << ", calls = " << NumCalls
                      << ", ICP failed to find target symbol for "
                      << Targets[LastTarget].To.Sym->getName() << "\n";
          continue;
        }

        MethodInfoType MethodInfo;

        if (!IsJumpTable) {
          // For virtual-method calls, try to eliminate the method load by
          // comparing vtable addresses instead of call targets.
          MethodInfo = maybeGetVtableSyms(BB&: *BB, Inst, SymTargets);
          TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1;
          LLVM_DEBUG(dbgs()
                     << "BOLT-INFO: ICP "
                     << (!MethodInfo.first.empty() ? "found" : "did not find")
                     << " vtables for all methods.\n");
        } else if (TargetFetchInst) {
          // Index-based jump-table promotion: compare the hot index load.
          ++TotalIndexBasedJumps;
          MethodInfo.second.push_back(x: TargetFetchInst);
        }

        // Generate new promoted call code for this callsite.
        MCPlusBuilder::BlocksVectorTy ICPcode =
            (IsJumpTable && !opts::ICPJumpTablesByTarget)
                ? BC.MIB->jumpTablePromotion(IJmpInst: Inst, Targets: SymTargets,
                                             TargetFetchInsns: MethodInfo.second, Ctx: BC.Ctx.get())
                : BC.MIB->indirectCallPromotion(
                      CallInst: Inst, Targets: SymTargets, VtableSyms: MethodInfo.first, MethodFetchInsns: MethodInfo.second,
                      MinimizeCodeSize: opts::ICPOldCodeSequence, Ctx: BC.Ctx.get());

        if (ICPcode.empty()) {
          if (opts::Verbosity >= 1)
            BC.outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                      << InstIdx << " in " << BB->getName()
                      << ", calls = " << NumCalls
                      << ", unable to generate promoted call code.\n";
          continue;
        }

        LLVM_DEBUG({
          uint64_t Offset = Targets[0].From.Addr;
          dbgs() << "BOLT-INFO: ICP indirect call code:\n";
          for (const auto &entry : ICPcode) {
            const MCSymbol *const &Sym = entry.first;
            const InstructionListType &Insts = entry.second;
            if (Sym)
              dbgs() << Sym->getName() << ":\n";
            Offset = BC.printInstructions(dbgs(), Insts.begin(), Insts.end(),
                                          Offset);
          }
          dbgs() << "---------------------------------------------------\n";
        });

        // Rewrite the CFG with the newly generated ICP code.
        std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs =
            rewriteCall(IndCallBlock&: *BB, CallInst: Inst, ICPcode: std::move(ICPcode), MethodFetchInsns: MethodInfo.second);

        // Fix the CFG after inserting the new basic blocks.
        BinaryBasicBlock *MergeBlock =
            fixCFG(IndCallBlock&: *BB, IsTailCall, IsJumpTable, NewBBs: std::move(NewBBs), Targets);

        // Since the tail of the original block was split off and it may contain
        // additional indirect calls, we must add the merge block to the set of
        // blocks to process.
        if (MergeBlock)
          BBs.push_back(x: MergeBlock);

        if (opts::Verbosity >= 1)
          BC.outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ "
                    << InstIdx << " in " << BB->getName()
                    << " -> calls = " << NumCalls << "\n";

        if (IsJumpTable)
          ++TotalOptimizedJumpTableCallsites;
        else
          ++TotalOptimizedIndirectCallsites;

        Modified.insert(x: &Function);
      }
    }
    TotalIndirectCalls += FuncTotalIndirectCalls;
    TotalIndirectJmps += FuncTotalIndirectJmps;
  }

  // Emit aggregate statistics for the whole pass.
  BC.outs()
      << "BOLT-INFO: ICP total indirect callsites with profile = "
      << TotalIndirectCallsites << "\n"
      << "BOLT-INFO: ICP total jump table callsites = "
      << TotalJumpTableCallsites << "\n"
      << "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n"
      << "BOLT-INFO: ICP percentage of calls that are indirect = "
      << format(Fmt: "%.1f", Vals: (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n"
      << "BOLT-INFO: ICP percentage of indirect calls that can be "
         "optimized = "
      << format(Fmt: "%.1f", Vals: (100.0 * TotalNumFrequentCalls) /
                       std::max<size_t>(a: TotalIndirectCalls, b: 1))
      << "%\n"
      << "BOLT-INFO: ICP percentage of indirect callsites that are "
         "optimized = "
      << format(Fmt: "%.1f", Vals: (100.0 * TotalOptimizedIndirectCallsites) /
                       std::max<uint64_t>(a: TotalIndirectCallsites, b: 1))
      << "%\n"
      << "BOLT-INFO: ICP number of method load elimination candidates = "
      << TotalMethodLoadEliminationCandidates << "\n"
      << "BOLT-INFO: ICP percentage of method calls candidates that have "
         "loads eliminated = "
      << format(Fmt: "%.1f",
                Vals: (100.0 * TotalMethodLoadsEliminated) /
                    std::max<uint64_t>(a: TotalMethodLoadEliminationCandidates, b: 1))
      << "%\n"
      << "BOLT-INFO: ICP percentage of indirect branches that are "
         "optimized = "
      << format(Fmt: "%.1f", Vals: (100.0 * TotalNumFrequentJmps) /
                       std::max<uint64_t>(a: TotalIndirectJmps, b: 1))
      << "%\n"
      << "BOLT-INFO: ICP percentage of jump table callsites that are "
      << "optimized = "
      << format(Fmt: "%.1f", Vals: (100.0 * TotalOptimizedJumpTableCallsites) /
                       std::max<uint64_t>(a: TotalJumpTableCallsites, b: 1))
      << "%\n"
      << "BOLT-INFO: ICP number of jump table callsites that can use hot "
      << "indices = " << TotalIndexBasedCandidates << "\n"
      << "BOLT-INFO: ICP percentage of jump table callsites that use hot "
         "indices = "
      << format(Fmt: "%.1f", Vals: (100.0 * TotalIndexBasedJumps) /
                       std::max<uint64_t>(a: TotalIndexBasedCandidates, b: 1))
      << "%\n";

#ifndef NDEBUG
  verifyProfile(BFs);
#endif
  return Error::success();
}
1481
1482} // namespace bolt
1483} // namespace llvm
1484

// source code of bolt/lib/Passes/IndirectCallPromotion.cpp