//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top level methods in
// some use cases. If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instruction based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

// InlineSavingsMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites "));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites "));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));

static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));

namespace llvm {
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
  if (Attr.isValid()) {
    int AttrValue = 0;
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
      return AttrValue;
  }
  return std::nullopt;
}

std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
}

std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
}
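
// Illustrative usage (not part of the analysis itself): for a call site CB
// carrying the string attribute "function-inline-cost"="42",
//   getStringFnAttrAsInt(CB, "function-inline-cost") == 42,
// while a missing attribute, or a value that does not parse as a base-10
// integer, yields std::nullopt.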

namespace InlineConstants {
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants

} // namespace llvm

namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about inline cost of a
// particular instruction
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};

/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as implementation detail of functions such
/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
/// expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.
  // Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've determined already the call site is
  /// too expensive to inline)
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing the switch instruction
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                                bool DefaultDestUndefined) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive call.
  bool AllowRecursiveCall = false;

  SmallPtrSet<Value *, 16> LoadAddrSet;

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
               ProfileSummaryInfo *PSI = nullptr,
               OptimizationRemarkEmitter *ORE = nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call) {}

  InlineResult analyze();

  std::optional<Constant *> getSimplifiedValue(Instruction *I) {
    if (SimplifiedValues.contains(I))
      return SimplifiedValues[I];
    return std::nullopt;
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

// Considering forming a binary search, we should find the number of nodes
// which is the same as the number of comparisons when lowered. For a given
// number of clusters, n, we can define a recursive function, f(n), to find
// the number of nodes in the tree. The recursion is:
// f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
// and f(n) = n, when n <= 3.
// This will lead to a binary tree where each leaf is either f(2) or f(3)
// when n > 3. So, the number of comparisons from leaves should be n, while
// the number from non-leaf nodes should be:
// 2^(log2(n) - 1) - 1
// = 2^log2(n) * 2^-1 - 1
// = n / 2 - 1.
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
// number of comparisons in a simple closed form:
// n + n / 2 - 1 = n * 3 / 2 - 1
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
}
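
// Worked example for the formula above: for NumCaseCluster = 8, the recursion
// gives f(8) = 1 + f(4) + f(4) = 1 + 2 * (1 + f(2) + f(2)) = 1 + 2 * 5 = 11,
// matching the closed form 8 * 3 / 2 - 1 = 11 expected comparisons.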

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only with the
  // flag PrintInstructionComments on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// The amount of StaticBonus applied.
  int StaticBonusApplied = 0;
  /// Attempt to evaluate indirect calls to boost their inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  std::optional<CostBenefitPair> CostBenefit;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
                                             BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc) {
    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
  }
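
  // Illustrative behavior of addCost: with Cost already at INT_MAX,
  // addCost(InstrCost) leaves Cost saturated at INT_MAX instead of wrapping,
  // because both the increment and the running sum are clamped to the 'int'
  // range while the arithmetic is carried out in 64 bits.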

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  bool onCallBaseVisitStart(CallBase &Call) override {
    if (std::optional<int> AttrCallThresholdBonus =
            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
      Threshold += *AttrCallThresholdBonus;

    if (std::optional<int> AttrCallCost =
            getStringFnAttrAsInt(Call, "call-inline-cost")) {
      addCost(*AttrCallCost);
      // Prevent further processing of the call since we want to override its
      // inline cost, not just add to it.
      return false;
    }
    return true;
  }

  void onCallPenalty() override { addCost(CallPenalty); }

  void onMemAccess() override { addCost(MemAccessCost); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);
  }
  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InstrCost);
  }
  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
                                       CallPenalty));
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUndefined) override {
    if (!DefaultDestUndefined)
      addCost(2 * InstrCost);
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // Maximum valid cost increased in this function.
    if (JumpTableSize) {
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost;
      addCost(JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      addCost(NumCaseCluster * 2 * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);
    int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;

    addCost(SwitchCost);
  }
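
  // Worked example (assuming the default -inline-instr-cost=5): a switch with
  // 16 case clusters and no jump table is charged
  // getExpectedNumberOfCompare(16) = 23 compare/branch pairs, i.e.
  // 23 * 2 * 5 = 230 units, plus 2 * 5 when the default destination is not
  // undefined.
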
  void onMissedSimplification() override { addCost(InstrCost); }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACostSavings += SROAArgCost;
    SROAArgCosts[Arg] = SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InstrCost;
    SROACostSavings += InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      if (*ProfileCount == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction was assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostBefore = Cost;
    InstructionCostDetailMap[I].ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostAfter = Cost;
    InstructionCostDetailMap[I].ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require instrumentation profile.
      if (!(PSI->hasInstrumentationProfile() || PSI->hasSampleProfile()))
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call site.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
    if (InlineSavingsMultiplier.getNumOccurrences())
      return InlineSavingsMultiplier;
    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
      return InlineSavingsProfitableMultiplier;
    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
  }

  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
            CandidateCall, "inline-cycle-savings-for-test")) {
      CycleSavings = *AttrCycleSavings;
    }

    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
            CandidateCall, "inline-runtime-cost-for-test")) {
      Size = *AttrRuntimeCost;
    }
  }

  // Determine whether we should inline the given call site, taking into
  // account both the size cost and the cycle savings. Return std::nullopt if
  // we don't have sufficient profiling information to determine.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return std::nullopt;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              isa_and_nonnull<ConstantInt>(
                  SimplifiedValues.lookup(BI->getCondition()))) {
            CurrentSavings += InstrCost;
          }
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
          if (isa_and_present<ConstantInt>(
                  SimplifiedValues.lookup(SI->getCondition())))
            CurrentSavings += InstrCost;
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);
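
    // E.g. (illustrative): a total CycleSavings of 10 with EntryCount = 4
    // yields (10 + 4 / 2) / 4 = 3, i.e. the per-call savings rounded to the
    // nearest integer rather than truncated.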

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);

    // Remove the cost of the cold basic blocks to model the runtime cost more
    // accurately. Both machine block placement and function splitting could
    // place cold blocks further from hot blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
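
    // E.g. (illustrative): with the default -inline-size-allowance=100, a
    // callee of Size 130 is treated as Size 30, and any callee at or below
    // the allowance collapses to the minimum Size of 1.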

    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
    CostBenefit.emplace(APInt(128, Size), CycleSavings);

    // Let R be the ratio of CycleSavings to Size. We accept the inlining
    // opportunity if R is really high and reject if R is really low. If R is
    // somewhere in the middle, we fall back to the cost-based analysis.
    //
    // Specifically, let R = CycleSavings / Size, we accept the inlining
    // opportunity if:
    //
    //             PSI->getOrCompHotCountThreshold()
    // R > -------------------------------------------------
    //     getInliningCostBenefitAnalysisSavingsMultiplier()
    //
    // and reject the inlining opportunity if:
    //
    //                PSI->getOrCompHotCountThreshold()
    // R <= ----------------------------------------------------
    //      getInliningCostBenefitAnalysisProfitableMultiplier()
    //
    // Otherwise, we fall back to the cost-based analysis.
    //
    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
    // HotCountThreshold * Size) rather than division to avoid precision loss.
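    //
    // E.g. (illustrative numbers): with HotCountThreshold = 16000 and
    // Size = 40, Threshold becomes 640000; under the default multipliers
    // (8 and 4), CycleSavings = 100000 is accepted since 100000 * 8 >= 640000,
    // while CycleSavings = 50000 is rejected since 50000 * 4 < 640000.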
    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
    Threshold *= Size;

    APInt UpperBoundCycleSavings = CycleSavings;
    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
    if (UpperBoundCycleSavings.uge(Threshold))
      return true;

    APInt LowerBoundCycleSavings = CycleSavings;
    LowerBoundCycleSavings *=
        getInliningCostBenefitAnalysisProfitableMultiplier();
    if (LowerBoundCycleSavings.ult(Threshold))
      return false;

    // Otherwise, fall back to the cost-based analysis.
    return std::nullopt;
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to inline
    // it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = *Params.AllowRecursiveCall;
  }

  /// Annotation Writer for instruction details
  InlineCostAnnotationWriter Writer;

  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);

  std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    if (InstructionCostDetailMap.contains(I))
      return InstructionCostDetailMap[I];
    return std::nullopt;
  }

  virtual ~InlineCostCallAnalyzer() = default;
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  int getStaticBonusApplied() const { return StaticBonusApplied; }
  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};

// Return true if CB is the sole call to local function Callee.
static bool isSoleCallToLocalFunction(const CallBase &CB,
                                      const Function &Callee) {
  return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
         &Callee == CB.getCalledFunction();
}

class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
private:
  InlineCostFeatures Cost = {};

  // FIXME: These constants are taken from the heuristic-based cost visitor.
  // These should be removed entirely in a later revision to avoid reliance on
  // heuristics in the ML inliner.
  static constexpr int JTCostMultiplier = 4;
  static constexpr int CaseClusterCostMultiplier = 2;
  static constexpr int SwitchDefaultDestCostMultiplier = 2;
  static constexpr int SwitchCostMultiplier = 2;

  // FIXME: These are taken from the heuristic-based cost visitor: we should
  // eventually abstract these to the CallAnalyzer to avoid duplication.
  unsigned SROACostSavingOpportunities = 0;
  int VectorBonus = 0;
  int SingleBBBonus = 0;
  int Threshold = 5;

  DenseMap<AllocaInst *, unsigned> SROACosts;

  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
    Cost[static_cast<size_t>(Feature)] += Delta;
  }

  void set(InlineCostFeatureIndex Feature, int64_t Value) {
    Cost[static_cast<size_t>(Feature)] = Value;
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROACosts.find(Arg);
    if (CostIt == SROACosts.end())
      return;

    increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
    SROACostSavingOpportunities -= CostIt->second;
    SROACosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    set(InlineCostFeatureIndex::load_elimination, 1);
  }

  void onCallPenalty() override {
    increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
  }

  void onCallArgumentSetup(const CallBase &Call) override {
    increment(InlineCostFeatureIndex::call_argument_setup,
              Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    increment(InlineCostFeatureIndex::lowered_call_arg_setup,
              Call.arg_size() * InstrCost);

    if (IsIndirectCall) {
      InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
                                         /*HintThreshold*/ {},
                                         /*ColdThreshold*/ {},
                                         /*OptSizeThreshold*/ {},
                                         /*OptMinSizeThreshold*/ {},
                                         /*HotCallSiteThreshold*/ {},
                                         /*LocallyHotCallSiteThreshold*/ {},
                                         /*ColdCallSiteThreshold*/ {},
                                         /*ComputeFullInlineCost*/ true,
                                         /*EnableDeferral*/ true};
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;

      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false,
                                true);
      if (CA.analyze().isSuccess()) {
        increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
                  CA.getCost());
        increment(InlineCostFeatureIndex::nested_inlines, 1);
      }
    } else {
      onCallPenalty();
    }
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUndefined) override {
    if (!DefaultDestUndefined)
      increment(InlineCostFeatureIndex::switch_default_dest_penalty,
                SwitchDefaultDestCostMultiplier * InstrCost);

    if (JumpTableSize) {
      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                       JTCostMultiplier * InstrCost;
      increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      increment(InlineCostFeatureIndex::case_cluster_penalty,
                NumCaseCluster * CaseClusterCostMultiplier * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);

    int64_t SwitchCost =
        ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
    increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
  }

  void onMissedSimplification() override {
    increment(InlineCostFeatureIndex::unsimplified_common_instructions,
              InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACosts[Arg] = SROAArgCost;
    SROACostSavingOpportunities += SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *Arg) override {
    SROACosts.find(Arg)->second += InstrCost;
    SROACostSavingOpportunities += InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (BB->getTerminator()->getNumSuccessors() > 1)
      set(InlineCostFeatureIndex::is_multiple_blocks, 1);
    Threshold -= SingleBBBonus;
  }

  InlineResult finalizeAnalysis() override {
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        increment(InlineCostFeatureIndex::num_loops,
                  InlineConstants::LoopPenalty);
      }
    }
    set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
    set(InlineCostFeatureIndex::simplified_instructions,
        NumInstructionsSimplified);
    set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
    set(InlineCostFeatureIndex::constant_offset_ptr_args,
        NumConstantOffsetPtrArgs);
    set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);

    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    set(InlineCostFeatureIndex::threshold, Threshold);

    return InlineResult::success();
  }

  bool shouldStop() override { return false; }

  void onLoadEliminationOpportunity() override {
    increment(InlineCostFeatureIndex::load_elimination, 1);
  }

  InlineResult onAnalysisStart() override {
    increment(InlineCostFeatureIndex::callsite_cost,
              -1 * getCallsiteCost(TTI, this->CandidateCall, DL));

    set(InlineCostFeatureIndex::cold_cc_penalty,
        (F.getCallingConv() == CallingConv::Cold));

    set(InlineCostFeatureIndex::last_call_to_static_bonus,
        isSoleCallToLocalFunction(CandidateCall, F));

    // FIXME: we shouldn't repeat this logic in both the Features and Cost
    // analyzer - instead, we should abstract it to a common method in the
    // CallAnalyzer
    int SingleBBBonusPercent = 50;
    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
    Threshold *= TTI.getInliningThresholdMultiplier();
    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
    VectorBonus = Threshold * VectorBonusPercent / 100;
    Threshold += (SingleBBBonus + VectorBonus);

    return InlineResult::success();
  }

public:
  InlineCostFeaturesAnalyzer(
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}

  const InlineCostFeatures &features() const { return Cost; }
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}

void InlineCostAnnotationWriter::emitInstructionAnnot(
    const Instruction *I, formatted_raw_ostream &OS) {
  // The cost of inlining the given instruction is always printed.
  // The threshold delta is printed only when it is non-zero, which happens
  // when we decided to give a bonus at a particular instruction.
  std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
  if (!Record)
    OS << "; No analysis for the instruction";
  else {
    OS << "; cost before = " << Record->CostBefore
       << ", cost after = " << Record->CostAfter
       << ", threshold before = " << Record->ThresholdBefore
       << ", threshold after = " << Record->ThresholdAfter << ", ";
    OS << "cost delta = " << Record->getCostDelta();
    if (Record->hasThresholdChanged())
      OS << ", threshold delta = " << Record->getThresholdDelta();
  }
  auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
  if (C) {
    OS << ", simplified to ";
    (*C)->print(OS, true);
  }
  OS << "\n";
}
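
// Example of an emitted annotation (illustrative values), printed when
// -print-instruction-comments is enabled:
//   ; cost before = 20, cost after = 25, threshold before = 487,
//   threshold after = 487, cost delta = 5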

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    disableSROAForArg(SROAArg);
  }
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    onDisableLoadElimination();
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0));
  for (const Use &Op : GEP.indices())
    if (Constant *SimpleOp = SimplifiedValues.lookup(Val: Op))
      Operands.push_back(Elt: SimpleOp);
    else
      Operands.push_back(Elt: Op);
  return TTI.getInstructionCost(U: &GEP, Operands,
                                CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  disableSROA(V: I.getOperand(i_nocapture: 0));

  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
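  // For illustration (hypothetical IR): given '%buf = alloca i8, i32 %n', if
  // this call site passes a constant that maps %n to, say, 128, the alloca
  // becomes static after inlining and 128 bytes are added to AllocatedSize.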
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(Val: I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) {
      // Sometimes a dynamic alloca could be converted into a static alloca
      // after this constant prop, and become a huge static alloca on an
      // unconditional CFG path. Avoid inlining if this is going to happen above
      // a threshold.
      // FIXME: If the threshold is removed or lowered too much, we could end up
      // being too pessimistic and prevent inlining non-problematic code. This
      // could result in unintended perf regressions. A better overall strategy
      // is needed to track stack usage during inlining.
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          X: AllocSize->getLimitedValue(),
          Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize);
      if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
        HasDynamicAlloca = true;
      return false;
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(X: DL.getTypeAllocSize(Ty).getKnownMinValue(),
                                  Y: AllocatedSize);
  }

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  if (!I.isStaticAlloca())
    HasDynamicAlloca = true;

  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use the correct address space in the call to getPointerSizeInBits
  // here? Or could we skip the getPointerSizeInBits call completely? As far as
  // I can see, ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for it.
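  //
  // As a hypothetical example of the simplification below: a phi such as
  //   %x = phi i32 [ 7, %then ], [ 7, %else ]
  // maps to the constant 7 once every live incoming value is known to be 7.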
  APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Ptr: Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(Val: V);
    if (!C)
      C = SimplifiedValues.lookup(Val: V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise exit early because we either see a different constant this
      // time, or we saw a constant before but have a pointer with a constant
      // offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the first time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the first time we've seen a pointer
    // with a constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// Check whether we can fold GEPs of constant-offset call site argument
/// pointers. This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
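///
/// For illustration (hypothetical IR): if the pointer operand %p is known to
/// be '%base + 8' and we see 'getelementptr inbounds i32, ptr %p, i64 2', the
/// result is recorded as '%base + 16'.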
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(Val: I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand());

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (const Use &Op : GEP.indices())
      if (!isa<Constant>(Val: Op) && !SimplifiedValues.lookup(Val: Op))
        return false;
    return true;
  };

  if (!DisableGEPConstOperand)
    if (simplifyInstruction(I))
      return true;

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROAArg)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROAArg)
    disableSROAForArg(SROAArg);
  return isGEPFree(GEP&: I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
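///
/// For example (hypothetical IR): if %a maps to 'i32 3' in SimplifiedValues,
/// then 'add i32 %a, 4' constant-folds to 7, is recorded, and becomes free.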
bool CallAnalyzer::simplifyInstruction(Instruction &I) {
  SmallVector<Constant *> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Val: Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Val: Op);
    if (!COp)
      return false;
    COps.push_back(Elt: COp);
  }
  auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

/// Try to simplify a call to llvm.is.constant.
///
/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
/// we expect calls of this specific intrinsic to be infrequent.
///
/// FIXME: Given that we know CB's parent (F) caller
/// (CandidateCall->getParent()->getParent()), we might be able to determine
/// whether inlining F into F's caller would change how the call to
/// llvm.is.constant would evaluate.
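///
/// For illustration (hypothetical IR): 'call i1 @llvm.is.constant.i32(i32 %v)'
/// folds to true when %v maps to a constant at this call site, and to false
/// otherwise (folding to false is always a legal lowering of the intrinsic).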
bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
  Value *Arg = CB.getArgOperand(i: 0);
  auto *C = dyn_cast<Constant>(Val: Arg);

  if (!C)
    C = dyn_cast_or_null<Constant>(Val: SimplifiedValues.lookup(Val: Arg));

  Type *RT = CB.getFunctionType()->getReturnType();
  SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0);
  return true;
}

bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
  // As per the langref, "The fourth argument to llvm.objectsize determines if
  // the value should be evaluated at runtime."
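  //
  // For illustration (hypothetical IR): in
  //   call i64 @llvm.objectsize.i64.p0(ptr %p, i1 false, i1 true, i1 false)
  // the fourth (dynamic) argument is false, so the call may be folded here;
  // with MustSucceed set, an unknown size folds to -1 (or 0 when the min flag
  // is true) rather than failing.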
  if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne())
    return false;

  Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr,
                                 /*MustSucceed=*/true);
  Constant *C = dyn_cast_or_null<Constant>(Val: V);
  if (C)
    SimplifiedValues[&CB] = C;
  return C;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
  if (IntegerSize == DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
    SROAArgValues[&I] = SROAArg;

  return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(i_nocapture: 0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  if (auto *SROAArg = getSROAArgForValueOrNull(V: Op))
    SROAArgValues[&I] = SROAArg;

  return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I))
    return true;

  // Disable SROA in the face of arbitrary casts we don't explicitly list
  // elsewhere.
  disableSROA(V: I.getOperand(i_nocapture: 0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive)
      onCallPenalty();
    break;
  default:
    break;
  }

  return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(Val: V))
    if (paramHasAttr(A, Attribute::Attr: NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may
  // be beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In the future, we should elaborate on this based on BPI and BFI in
  // more general threshold adjusting heuristics in updateThreshold().
  if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) {
    if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator()))
    return false;

  return true;
}

bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
                                            BlockFrequencyInfo *CallerBFI) {
  // If a global profile summary is available, then the callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CB: Call, BFI: CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
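  //
  // For instance, with ColdCallSiteRelFreq = 2, a callsite whose block
  // frequency is below 2% of the caller's entry frequency is treated as cold.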
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

std::optional<int>
InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                                BlockFrequencyInfo *CallerBFI) {

  // If a global profile summary is available, then the callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return std::nullopt;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
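  //
  // For instance, with HotCallSiteRelFreq = 60, a callsite executed at least
  // 60 times as often as the caller's entry gets the locally hot threshold.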
  const BasicBlock *CallSiteBB = Call.getParent();
  BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
  BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
  std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq);
  if (Limit && CallSiteFreq >= *Limit)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return std::nullopt;
}

void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, std::optional<int> B) {
    return B ? std::min(a: A, b: *B) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, std::optional<int> B) {
    return B ? std::max(a: A, b: *B) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function, as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
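  //
  // As a worked example of the arithmetic below: with a threshold of 225, a
  // 50% single-BB bonus comes to 112 and a 150% vector bonus (a plausible TTI
  // value) comes to 337, using integer division.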
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = *HotCallSiteThreshold;
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(F: &Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness cannot be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(F: &Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  Threshold += TTI.adjustInliningThreshold(CB: &Call);

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (isSoleCallToLocalFunction(CB: Call, Callee: F)) {
    Cost -= LastCallToStaticBonus;
    StaticBonusApplied = LastCallToStaticBonus;
  }
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
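  //
  // For illustration (hypothetical IR): if %g1 is known to be '%base + 4' and
  // %g2 is '%base + 8', then 'icmp ult ptr %g1, %g2' folds to true by
  // comparing the offsets 4 and 8.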
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
  if (LHSBase) {
    std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset);
      Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(pred: I.getPredicate(), LHS: CLHS, RHS: CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  auto isImplicitNullCheckCmp = [](const CmpInst &I) {
    for (auto *User : I.users())
      if (auto *Instr = dyn_cast<Instruction>(Val: User))
        if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit))
          return false;
    return true;
  };

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) {
    if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) {
      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType())
                                        : ConstantInt::getFalse(Ty: I.getType());
      return true;
    }
    // Implicit null checks act as unconditional branches and their comparisons
    // should be treated as simplified and free of cost.
    if (isImplicitNullCheckCmp(I))
      return true;
  }
  return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1)));
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
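  //
  // For illustration (hypothetical IR): if %i1 and %i2 are ptrtoint results
  // known to be '%base + 12' and '%base + 4', then 'sub i64 %i1, %i2' folds
  // to the constant 8.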
  Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
  if (LHSBase) {
    std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset);
      Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset);
      if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
  Constant *CLHS = dyn_cast<Constant>(Val: LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(Val: LHS);
  Constant *CRHS = dyn_cast<Constant>(Val: RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(Val: RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(Val: &I))
    SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS,
                            FMF: FI->getFastMathFlags(), Q: DL);
  else
    SimpleV =
        simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL);

  if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(V: LHS);
  disableSROA(V: RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such, unless it's an fneg, which can be implemented with an xor.
  using namespace llvm::PatternMatch;
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive &&
      !match(V: &I, P: m_FNeg(X: m_Value())))
    onCallPenalty();

  return false;
}

bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
  Value *Op = I.getOperand(i_nocapture: 0);
  Constant *COp = dyn_cast<Constant>(Val: Op);
  if (!COp)
    COp = SimplifiedValues.lookup(Val: Op);

  Value *SimpleV = simplifyFNegInst(
      Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL);

  if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified fneg.
  disableSROA(V: Op);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
    return true;

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) {
    onLoadEliminationOpportunity();
    return true;
  }

  onMemAccess();
  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
    return true;

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();

  onMemAccess();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I))
    return true;

  // SROA can't look through these, but they may be free.
  return Base::visitExtractValue(I);
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I))
    return true;

  // SROA can't look through these, but they may be free.
  return Base::visitInsertValue(I);
}

/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
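///
/// For illustration (hypothetical IR): a call such as
/// 'call double @llvm.fabs.f64(double %x)' folds to 3.0 when %x maps to -3.0
/// at this call site.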
bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(Call: &Call, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(N: Call.arg_size());
  for (Value *I : Call.args()) {
    Constant *C = dyn_cast<Constant>(Val: I);
    if (!C)
      C = dyn_cast_or_null<Constant>(Val: SimplifiedValues.lookup(Val: I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(Elt: C);
  }
  if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) {
    SimplifiedValues[&Call] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallBase(CallBase &Call) {
  if (!onCallBaseVisitStart(Call))
    return true;

  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  Function *F = Call.getCalledFunction();
  bool IsIndirectCall = !F;
  if (IsIndirectCall) {
    // Check if this happens to be an indirect function call to a known function
    // in this inline context. If not, we've done all we can.
    Value *Callee = Call.getCalledOperand();
    F = dyn_cast_or_null<Function>(Val: SimplifiedValues.lookup(Val: Callee));
    if (!F || F->getFunctionType() != Call.getFunctionType()) {
      onCallArgumentSetup(Call);

      if (!Call.onlyReadsMemory())
        disableLoadElimination();
      return Base::visitCallBase(I&: Call);
    }
  }

  assert(F && "Expected a call to a known function");

  // When we have a concrete function, first try to simplify it directly.
  if (simplifyCallSite(F, Call))
    return true;

  // Next check if it is an intrinsic we know about.
  // FIXME: Lift this into part of the InstVisitor.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) {
    switch (II->getIntrinsicID()) {
    default:
      if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II))
        disableLoadElimination();
      return Base::visitCallBase(I&: Call);

    case Intrinsic::load_relative:
      onLoadRelativeIntrinsic();
      return false;

    case Intrinsic::memset:
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      disableLoadElimination();
      // SROA can usually chew through these intrinsics, but they aren't free.
      return false;
    case Intrinsic::icall_branch_funnel:
    case Intrinsic::localescape:
      HasUninlineableIntrinsic = true;
      return false;
    case Intrinsic::vastart:
      InitsVargArgs = true;
      return false;
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0)))
        SROAArgValues[II] = SROAArg;
      return true;
    case Intrinsic::is_constant:
      return simplifyIntrinsicCallIsConstant(CB&: Call);
    case Intrinsic::objectsize:
      return simplifyIntrinsicCallObjectSize(CB&: Call);
    }
  }

  if (F == Call.getFunction()) {
    // This flag will fully abort the analysis, so don't bother with anything
    // else.
    IsRecursiveCall = true;
    if (!AllowRecursiveCall)
      return false;
  }

  if (TTI.isLoweredToCall(F)) {
    onLoweredCall(F, Call, IsIndirectCall);
  }

  if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
    disableLoadElimination();
  return Base::visitCallBase(I&: Call);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(Val: BI.getCondition()) ||
         BI.getMetadata(KindID: LLVMContext::MD_make_implicit) ||
         isa_and_nonnull<ConstantInt>(
             Val: SimplifiedValues.lookup(Val: BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(Val: TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(Val: TrueVal);
  Constant *FalseC = dyn_cast<Constant>(Val: FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(Val: FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(Val: SimplifiedValues.lookup(Val: SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(I&: SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(Val: TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(Val: FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(I&: SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()  ? TrueVal
                     : (CondC->isNullValue()) ? FalseVal
                                              : nullptr;
  if (!SelectedV) {
    // The condition is a vector constant that is neither all 1s nor all 0s. If
    // all operands are constants, ConstantFoldSelectInstruction() can handle
    // cases such as vector selects.
    if (TrueC && FalseC) {
      if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(I&: SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(Val: SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(Val: SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(Val: SI.getCondition()))
    if (isa<ConstantInt>(Val: V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider switches that are lowered with a mix of
  // jump tables, bit tests, and binary search trees. The cost of the switch is
  // proportional to the size of the tree or the size of the jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
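  //
  // For instance, a dense switch over the cases 0..15 would typically be
  // lowered as a single jump-table cluster with a 16-entry range, while four
  // widely scattered cases would more likely form a small binary search tree.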

  unsigned JumpTableSize = 0;
  BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI);

  onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUndefined: SI.defaultDestUndefined());
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
      TargetTransformInfo::TCC_Free)
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (const Use &Op : I.operands())
    disableSROA(V: Op);

  return false;
}

/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
InlineResult
CallAnalyzer::analyzeBlock(BasicBlock *BB,
                           SmallPtrSetImpl<const Value *> &EphValues) {
  for (Instruction &I : *BB) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them to constants or dead code during
    // inlining, is actually used by the vector bonus heuristic. As long as
    // that's true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    // Similarly, skip pseudo-probes.
    if (I.isDebugOrPseudoInst())
      continue;

    // Skip ephemeral values.
    if (EphValues.count(Ptr: &I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    onInstructionAnalysisStart(I: &I);

    if (Base::visit(I: &I))
      ++NumInstructionsSimplified;
    else
      onMissedSimplification();

    onInstructionAnalysisFinish(I: &I);
    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    InlineResult IR = InlineResult::success();
    if (IsRecursiveCall && !AllowRecursiveCall)
      IR = InlineResult::failure(Reason: "recursive");
    else if (ExposesReturnsTwice)
      IR = InlineResult::failure(Reason: "exposes returns twice");
    else if (HasDynamicAlloca)
      IR = InlineResult::failure(Reason: "dynamic alloca");
    else if (HasIndirectBr)
      IR = InlineResult::failure(Reason: "indirect branch");
    else if (HasUninlineableIntrinsic)
      IR = InlineResult::failure(Reason: "uninlinable intrinsic");
    else if (InitsVargArgs)
      IR = InlineResult::failure(Reason: "varargs");
    if (!IR.isSuccess()) {
      if (ORE)
        ORE->emit(RemarkBuilder: [&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " has uninlinable pattern ("
                 << NV("InlineResult", IR.getFailureReason())
                 << ") and cost is not fully computed";
        });
      return IR;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
      auto IR =
          InlineResult::failure(Reason: "recursive and allocates too much stack space");
      if (ORE)
        ORE->emit(RemarkBuilder: [&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " is "
                 << NV("InlineResult", IR.getFailureReason())
                 << ". Cost is not fully computed";
        });
      return IR;
    }

    if (shouldStop())
      return InlineResult::failure(
          Reason: "Call site analysis is not favorable to inlining.");
  }

  return InlineResult::success();
}

/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns nullptr if V is not a pointer, and returns the constant '0' if
/// there are no constant offsets applied.
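///
/// For illustration (hypothetical IR, with i32 occupying 4 bytes): for
///   %q = getelementptr inbounds [10 x i32], ptr %a, i64 0, i64 3
/// V is rewritten to %a and the constant 12 is returned.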
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getZero(numBits: IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(Ptr: V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(Val: V)->getOperand(i: 0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(Ptr: V).second);

  Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType());
  return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset));
}

/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all of their incoming edges are dead.
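///
/// For illustration: given edges entry->{bb1, bb2} and {bb1, bb2}->merge, if
/// entry's branch is known to take the edge to bb1, then bb2 is dead, while
/// merge stays live because it retains the live edge from bb1.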
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under exam.
    return (DeadBlocks.count(Ptr: Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(Ptr: BB) &&
            llvm::all_of(Range: predecessors(BB),
                         P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(BB: CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Elt: Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Ptr: Dead).second)
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(BB: Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(Elt: S);
    }
  }
}

/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
InlineResult CallAnalyzer::analyze() {
  ++NumCallsAnalyzed;

  auto Result = onAnalysisStart();
  if (!Result.isSuccess())
    return Result;

  if (F.empty())
    return InlineResult::success();

  Function *Caller = CandidateCall.getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallBase *Call = dyn_cast<CallBase>(Val: U);
    if (Call && Call->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  auto CAI = CandidateCall.arg_begin();
  for (Argument &FAI : F.args()) {
    assert(CAI != CandidateCall.arg_end());
    if (Constant *C = dyn_cast<Constant>(Val: CAI))
      SimplifiedValues[&FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) {
      ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) {
        SROAArgValues[&FAI] = SROAArg;
        onInitializeSROAArg(Arg: SROAArg);
        EnabledSROAAllocas.insert(V: SROAArg);
      }
    }
    ++CAI;
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, we use a SetVector optimized for small sizes, since we
  // expect to exit early after crossing our threshold.
  typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(X: &F.getEntryBlock());

  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    if (shouldStop())
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    onBlockStart(BB);

    // Disallow inlining a blockaddress with uses other than strictly callbr.
    // A blockaddress only has defined behavior for an indirect branch in the
    // same function, and we do not currently support inlining indirect
    // branches. But, the inliner may not see an indirect branch that ends up
    // being dead code at a particular call site. If the blockaddress escapes
    // the function, e.g., via a global variable, inlining may lead to an
    // invalid cross-function reference.
    // FIXME: pr/39560: continue relaxing this overt restriction.
    if (BB->hasAddressTaken())
      for (User *U : BlockAddress::get(BB: &*BB)->users())
        if (!isa<CallBrInst>(Val: *U))
          return InlineResult::failure(Reason: "blockaddress used outside of callbr");

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    InlineResult IR = analyzeBlock(BB, EphValues);
    if (!IR.isSuccess())
      return IR;

    Instruction *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
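    // For example, if %flag maps to 'true' from the call's arguments, a
    // 'br i1 %flag, label %a, label %b' terminator enqueues only %a and lets
    // findDeadBlocks treat the edge to %b as dead.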
2801 if (BranchInst *BI = dyn_cast<BranchInst>(Val: TI)) {
2802 if (BI->isConditional()) {
2803 Value *Cond = BI->getCondition();
2804 if (ConstantInt *SimpleCond =
2805 dyn_cast_or_null<ConstantInt>(Val: SimplifiedValues.lookup(Val: Cond))) {
2806 BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0);
2807 BBWorklist.insert(X: NextBB);
2808 KnownSuccessors[BB] = NextBB;
2809 findDeadBlocks(CurrBB: BB, NextBB);
2810 continue;
2811 }
2812 }
2813 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) {
2814 Value *Cond = SI->getCondition();
2815 if (ConstantInt *SimpleCond =
2816 dyn_cast_or_null<ConstantInt>(Val: SimplifiedValues.lookup(Val: Cond))) {
2817 BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor();
2818 BBWorklist.insert(X: NextBB);
2819 KnownSuccessors[BB] = NextBB;
2820 findDeadBlocks(CurrBB: BB, NextBB);
2821 continue;
2822 }
2823 }
2824
2825 // If we're unable to select a particular successor, just count all of
2826 // them.
2827 for (BasicBlock *Succ : successors(BB))
2828 BBWorklist.insert(X: Succ);
2829
2830 onBlockAnalyzed(BB);
2831 }
2832
2833 // If this is a noduplicate call, we can still inline as long as
2834 // inlining this would cause the removal of the caller (so the instruction
2835 // is not actually duplicated, just moved).
2836 if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall)
2837 return InlineResult::failure(Reason: "noduplicate");
2838
2839 // If the callee's stack size exceeds the user-specified threshold,
2840 // do not let it be inlined.
2841 // The command line option overrides a limit set in the function attributes.
  size_t FinalStackSizeThreshold = StackSizeThreshold;
  if (!StackSizeThreshold.getNumOccurrences())
    if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
            Caller, InlineConstants::MaxInlineStackSizeAttributeName))
      FinalStackSizeThreshold = *AttrMaxStackSize;
  if (AllocatedSize > FinalStackSizeThreshold)
    return InlineResult::failure("stacksize");

  return finalizeAnalysis();
}

void InlineCostCallAnalyzer::print(raw_ostream &OS) {
#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
  if (PrintInstructionComments)
    F.print(OS, &Writer);
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
#endif

/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(
    Function *Caller, Function *Callee, TargetTransformInfo &TTI,
    function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
  // Note that CalleeTLI must be a copy, not a reference. The legacy pass
  // manager caches the most recently created TLI in the
  // TargetLibraryInfoWrapperPass object, and always returns the same object
  // (which is overwritten on each GetTLI call). Therefore we copy the first
  // result.
  auto CalleeTLI = GetTLI(*Callee);
  return (IgnoreTTIInlineCompatible ||
          TTI.areInlineCompatible(Caller, Callee)) &&
         GetTLI(*Caller).areInlineCompatible(CalleeTLI,
                                             InlineCallerSupersetNoBuiltin) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
                          const DataLayout &DL) {
  int64_t Cost = 0;
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
    if (Call.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InstrCost;
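      // Worked example: a 256-bit byval struct with 64-bit pointers copies
      // ceil(256 / 64) = 4 words, for an estimated cost of 2 * 4 * InstrCost
      // (one load and one store per word).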
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InstrCost;
  Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);

  return std::min<int64_t>(Cost, INT_MAX);
}

InlineCost llvm::getInlineCost(
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
}

std::optional<int> llvm::getInliningCostEstimate(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
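  // The thresholds below are deliberately zeroed or left empty: the analyzer
  // is constructed with ComputeFullInlineCost and IgnoreThreshold, so the
  // cost is accumulated in full rather than being cut off by any bound.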
  const InlineParams Params = {/* DefaultThreshold*/ 0,
                               /*HintThreshold*/ {},
                               /*ColdThreshold*/ {},
                               /*OptSizeThreshold*/ {},
                               /*OptMinSizeThreshold*/ {},
                               /*HotCallSiteThreshold*/ {},
                               /*LocallyHotCallSiteThreshold*/ {},
                               /*ColdCallSiteThreshold*/ {},
                               /*ComputeFullInlineCost*/ true,
                               /*EnableDeferral*/ true};

  InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE, true,
                            /*IgnoreThreshold*/ true);
  auto R = CA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CA.getCost();
}

std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
                                 ORE, *Call.getCalledFunction(), Call);
  auto R = CFA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CFA.features();
}

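// Returns std::nullopt when attributes alone cannot decide the call site, in
// which case the caller is expected to fall back to the full cost analysis; a
// populated InlineResult is a final always/never answer.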
std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
    CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {

  // Cannot inline indirect calls.
  if (!Callee)
    return InlineResult::failure("indirect call");

  // If a callee coroutine is inlined into a caller coroutine before the
  // coro-split pass runs, the coro-early pass cannot handle the result well.
  // So we do not inline a coroutine that has not yet been split.
  if (Callee->isPresplitCoroutine())
    return InlineResult::failure("unsplit coroutine call");

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle the argument
  // being in the alloca address space (so it is a little bit complicated to
  // solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return InlineResult::failure("byval arguments without alloca"
                                     " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
      return InlineResult::failure("noinline call site attribute");

    auto IsViable = isInlineViable(*Callee);
    if (IsViable.isSuccess())
      return InlineResult::success();
    return InlineResult::failure(IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless the callee has
  // the always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
    return InlineResult::failure("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return InlineResult::failure("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return InlineResult::failure("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return InlineResult::failure("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return InlineResult::failure("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return InlineResult::failure("noinline call site attribute");

  return std::nullopt;
}

InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  auto UserDecision =
      llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);

  if (UserDecision) {
    if (UserDecision->isSuccess())
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(UserDecision->getFailureReason());
  }

  LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                          << "... (caller:" << Call.getCaller()->getName()
                          << ")\n");

  InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Always make the cost-benefit-based decision explicit: we use always/never
  // here because the threshold is not meaningful when cost-benefit analysis,
  // not the threshold, drives the decision.
  if (CA.wasDecidedByCostBenefit()) {
    if (ShouldInline.isSuccess())
      return InlineCost::getAlways("benefit over cost",
                                   CA.getCostBenefitPair());
    else
      return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
  }

  if (CA.wasDecidedByCostThreshold())
    return InlineCost::get(CA.getCost(), CA.getThreshold(),
                           CA.getStaticBonusApplied());

  // No details on how the decision was made; simply return always or never.
  return ShouldInline.isSuccess()
             ? InlineCost::getAlways("empty function")
             : InlineCost::getNever(ShouldInline.getFailureReason());
}

InlineResult llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (BasicBlock &BB : F) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BB.getTerminator()))
      return InlineResult::failure("contains indirect branches");

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BB.hasAddressTaken())
      for (User *U : BlockAddress::get(&BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    for (auto &II : BB) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      Function *Callee = Call->getCalledFunction();
      if (&F == Callee)
        return InlineResult::failure("recursive call");

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return InlineResult::failure("exposes returns-twice attribute");

      if (Callee)
        switch (Callee->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because the
          // current backend can't separate call targets from call arguments.
          return InlineResult::failure(
              "disallowed inlining of @llvm.icall.branch.funnel");
        case llvm::Intrinsic::localescape:
          // Disallow inlining functions that call @llvm.localescape. Doing
          // this correctly would require major changes to the inliner.
          return InlineResult::failure(
              "disallowed inlining of @llvm.localescape");
        case llvm::Intrinsic::vastart:
          // Disallow inlining of functions that initialize VarArgs with
          // va_start.
          return InlineResult::failure(
              "contains VarArgs initialized with va_start");
        }
    }
  }

  return InlineResult::success();
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
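  // For example, running opt with -inline-threshold=500 overrides whatever
  // value was passed in through the Threshold parameter above.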
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(DefaultThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
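// For example, (OptLevel=3, SizeOptLevel=0) selects OptAggressiveThreshold,
// while (OptLevel=2, SizeOptLevel=2), i.e. -Oz, selects OptMinSizeThreshold.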
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return DefaultThreshold;
}

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of -locally-hot-callsite-threshold option to populate
  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
  // when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}

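// A usage sketch for the printer pass below (the pass name here is an
// assumption about how it is registered with the new pass manager):
//   opt -passes='print<inline-cost>' -disable-output input.ll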
PreservedAnalyses
InlineCostAnnotationPrinterPass::run(Function &F,
                                     FunctionAnalysisManager &FAM) {
  PrintInstructionComments = true;
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };
  Module *M = F.getParent();
  ProfileSummaryInfo PSI(*M);
  DataLayout DL(M);
  TargetTransformInfo TTI(DL);
  // FIXME: Redesign the usage of InlineParams to expand the scope of this
  // pass. In the current implementation, the exact InlineParams values don't
  // matter, as the pass serves only to verify the inliner's decisions.
  // We could add a flag that selects the InlineParams for this run; for now,
  // the default InlineParams are used.
  const InlineParams Params = llvm::getInlineParams();
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
        Function *CalledFunction = CI->getCalledFunction();
        if (!CalledFunction || CalledFunction->isDeclaration())
          continue;
        OptimizationRemarkEmitter ORE(CalledFunction);
        InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
                                    GetAssumptionCache, nullptr, &PSI, &ORE);
        ICCA.analyze();
        OS << " Analyzing call of " << CalledFunction->getName()
           << "... (caller:" << CI->getCaller()->getName() << ")\n";
        ICCA.print(OS);
        OS << "\n";
      }
    }
  }
  return PreservedAnalyses::all();
}
