1 | //===- InlineCost.cpp - Cost analysis for inliner -------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements inline cost analysis. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/Analysis/InlineCost.h" |
14 | #include "llvm/ADT/STLExtras.h" |
15 | #include "llvm/ADT/SetVector.h" |
16 | #include "llvm/ADT/SmallPtrSet.h" |
17 | #include "llvm/ADT/SmallVector.h" |
18 | #include "llvm/ADT/Statistic.h" |
19 | #include "llvm/Analysis/AssumptionCache.h" |
20 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
21 | #include "llvm/Analysis/CodeMetrics.h" |
22 | #include "llvm/Analysis/ConstantFolding.h" |
23 | #include "llvm/Analysis/InstructionSimplify.h" |
24 | #include "llvm/Analysis/LoopInfo.h" |
25 | #include "llvm/Analysis/MemoryBuiltins.h" |
26 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
27 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
28 | #include "llvm/Analysis/TargetLibraryInfo.h" |
29 | #include "llvm/Analysis/TargetTransformInfo.h" |
30 | #include "llvm/Analysis/ValueTracking.h" |
31 | #include "llvm/Config/llvm-config.h" |
32 | #include "llvm/IR/AssemblyAnnotationWriter.h" |
33 | #include "llvm/IR/CallingConv.h" |
34 | #include "llvm/IR/DataLayout.h" |
35 | #include "llvm/IR/Dominators.h" |
36 | #include "llvm/IR/GetElementPtrTypeIterator.h" |
37 | #include "llvm/IR/GlobalAlias.h" |
38 | #include "llvm/IR/InstVisitor.h" |
39 | #include "llvm/IR/IntrinsicInst.h" |
40 | #include "llvm/IR/Operator.h" |
41 | #include "llvm/IR/PatternMatch.h" |
42 | #include "llvm/Support/CommandLine.h" |
43 | #include "llvm/Support/Debug.h" |
44 | #include "llvm/Support/FormattedStream.h" |
45 | #include "llvm/Support/raw_ostream.h" |
46 | #include <climits> |
47 | #include <limits> |
48 | #include <optional> |
49 | |
50 | using namespace llvm; |
51 | |
52 | #define DEBUG_TYPE "inline-cost" |
53 | |
// Statistic (-stats) counting how many call sites this analysis examined.
STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

// Fallback inlining threshold used when no more specific threshold
// (hint/cold/hot callsite, etc.) applies.
static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(Val: 225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top level methods in
// some use cases. If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(Val: false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));
69 | |
70 | static cl::opt<bool> ( |
71 | "print-instruction-comments" , cl::Hidden, cl::init(Val: false), |
72 | cl::desc("Prints comments for instruction based on inline cost analysis" )); |
73 | |
// Primary knob: the baseline cost budget for an inlining decision.
static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(Val: 225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

// Raised budget for callees carrying the inlinehint attribute.
static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(Val: 325),
    cl::desc("Threshold for inlining functions with inline hint"));

// Lowered budget for call sites determined to be cold.
static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(Val: 45),
                          cl::desc("Threshold for inlining cold callsites"));

// Master switch for the cost-benefit (cycle-savings based) analysis path.
static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(Val: false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

// InlineSavingsMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(Val: 8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(Val: 4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(Val: 100),
                        cl::desc("The maximum size of a callee that get's "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(Val: 45),
    cl::desc("Threshold for inlining functions with cold attribute"));

// Raised budget for call sites that are hot according to the profile summary.
static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden,
                         cl::init(Val: 3000),
                         cl::desc("Threshold for hot callsites "));

// Raised budget for call sites hot relative to their own caller.
static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(Val: 525),
    cl::desc("Threshold for locally hot callsites "));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(Val: 2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(Val: 60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

// Abstract cost charged per IR instruction; the unit all thresholds above
// are measured in.
static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(Val: 5),
              cl::desc("Cost of a single instruction when inlining"));

// Additional cost charged for each load/store, on top of InstrCost.
static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(Val: 0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(Val: 25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

// Cap on the callee's static stack usage; effectively unlimited by default.
static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(Val: std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

// Tighter stack cap used when the caller is recursive.
static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(Val: InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(Val: true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(Val: false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));
175 | |
176 | namespace llvm { |
177 | std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) { |
178 | if (Attr.isValid()) { |
179 | int AttrValue = 0; |
180 | if (!Attr.getValueAsString().getAsInteger(Radix: 10, Result&: AttrValue)) |
181 | return AttrValue; |
182 | } |
183 | return std::nullopt; |
184 | } |
185 | |
186 | std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) { |
187 | return getStringFnAttrAsInt(Attr: CB.getFnAttr(Kind: AttrKind)); |
188 | } |
189 | |
190 | std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) { |
191 | return getStringFnAttrAsInt(Attr: F->getFnAttribute(Kind: AttrKind)); |
192 | } |
193 | |
194 | namespace InlineConstants { |
195 | int getInstrCost() { return InstrCost; } |
196 | |
197 | } // namespace InlineConstants |
198 | |
199 | } // namespace llvm |
200 | |
201 | namespace { |
202 | class InlineCostCallAnalyzer; |
203 | |
// Records, for a single instruction, the analyzer's running cost and
// threshold immediately before and after that instruction was processed,
// so the per-instruction deltas can be reported.
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  /// Net change the instruction caused to the threshold.
  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  /// Net change the instruction caused to the cost.
  int getCostDelta() const { return CostAfter - CostBefore; }

  /// Whether processing the instruction moved the threshold at all.
  bool hasThresholdChanged() const { return getThresholdDelta() != 0; }
};
218 | |
/// AssemblyAnnotationWriter that renders per-instruction inline-cost data
/// (gathered by an InlineCostCallAnalyzer) as comments when the IR is
/// printed.
class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  /// The analyzer whose recorded cost details are printed. Non-owning;
  /// must outlive this writer.
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  /// Emit the annotation for \p I to \p OS. Defined out of line, once
  /// InlineCostCallAnalyzer is a complete type.
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};
228 | |
/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as implementation detail of functions such
/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
/// expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features. Subclasses override
  /// these hooks to attach their own accounting to the shared walk.
  // Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've determined already the call site is
  /// too expensive to inline)
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess(){};

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                                bool DefaultDestUndefined) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  /// If \p V maps to a still-enabled SROA-candidate alloca, either credit one
  /// more SROA use (when \p DoNotDisable is set) or give up on SROA for that
  /// alloca. Returns true only when a use was credited.
  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(V: SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  // Flags recording properties of the callee / candidate call observed while
  // walking the callee body; all start out pessimistically false.
  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  // Counters over the instructions visited during analysis.
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive call.
  bool AllowRecursiveCall = false;

  // NOTE(review): appears to record load addresses already seen, in support
  // of the repeated-load model above -- confirm against visitLoad, which is
  // defined elsewhere in this file.
  SmallPtrSet<Value *, 16> LoadAddrSet;

  /// Map \p V to its SROA-candidate alloca, or return null when there is no
  /// mapping or SROA has since been disabled for that alloca.
  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(Val: V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(V: It->second) == 0)
      return nullptr;
    return It->second;
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
               ProfileSummaryInfo *PSI = nullptr,
               OptimizationRemarkEmitter *ORE = nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call) {}

  /// Main entry point: walk the callee and produce the inlinability verdict.
  InlineResult analyze();

  /// Return the constant \p I was simplified to during analysis, if any.
  std::optional<Constant *> getSimplifiedValue(Instruction *I) {
    if (SimplifiedValues.contains(Val: I))
      return SimplifiedValues[I];
    return std::nullopt;
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};
523 | |
// When a switch's case clusters are lowered as a balanced binary search
// tree, the number of compares can be estimated in closed form. Let f(n)
// be the node count for n clusters:
//   f(n) = n                        for n <= 3, and
//   f(n) = 1 + f(n/2) + f(n - n/2)  for n > 3.
// The resulting tree has leaves of f(2)/f(3) groups which together
// contribute n compares, plus n/2 - 1 interior nodes
// (2^(log2(n) - 1) - 1 = n/2 - 1) contributing one compare each, so the
// total is:
//   n + n/2 - 1  ==  3n/2 - 1
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  // Widen first so large cluster counts cannot overflow the arithmetic.
  const int64_t N = NumCaseCluster;
  return N + N / 2 - 1;
}
542 | |
543 | /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note |
544 | /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer |
545 | class InlineCostCallAnalyzer final : public CallAnalyzer { |
546 | const bool ComputeFullInlineCost; |
547 | int LoadEliminationCost = 0; |
548 | /// Bonus to be applied when percentage of vector instructions in callee is |
549 | /// high (see more details in updateThreshold). |
550 | int VectorBonus = 0; |
551 | /// Bonus to be applied when the callee has only one reachable basic block. |
552 | int SingleBBBonus = 0; |
553 | |
554 | /// Tunable parameters that control the analysis. |
555 | const InlineParams &Params; |
556 | |
557 | // This DenseMap stores the delta change in cost and threshold after |
558 | // accounting for the given instruction. The map is filled only with the |
559 | // flag PrintInstructionComments on. |
560 | DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap; |
561 | |
562 | /// Upper bound for the inlining cost. Bonuses are being applied to account |
563 | /// for speculative "expected profit" of the inlining decision. |
564 | int Threshold = 0; |
565 | |
566 | /// The amount of StaticBonus applied. |
567 | int StaticBonusApplied = 0; |
568 | |
569 | /// Attempt to evaluate indirect calls to boost its inline cost. |
570 | const bool BoostIndirectCalls; |
571 | |
572 | /// Ignore the threshold when finalizing analysis. |
573 | const bool IgnoreThreshold; |
574 | |
575 | // True if the cost-benefit-analysis-based inliner is enabled. |
576 | const bool CostBenefitAnalysisEnabled; |
577 | |
578 | /// Inlining cost measured in abstract units, accounts for all the |
579 | /// instructions expected to be executed for a given function invocation. |
580 | /// Instructions that are statically proven to be dead based on call-site |
581 | /// arguments are not counted here. |
582 | int Cost = 0; |
583 | |
584 | // The cumulative cost at the beginning of the basic block being analyzed. At |
585 | // the end of analyzing each basic block, "Cost - CostAtBBStart" represents |
586 | // the size of that basic block. |
587 | int CostAtBBStart = 0; |
588 | |
589 | // The static size of live but cold basic blocks. This is "static" in the |
590 | // sense that it's not weighted by profile counts at all. |
591 | int ColdSize = 0; |
592 | |
593 | // Whether inlining is decided by cost-threshold analysis. |
594 | bool DecidedByCostThreshold = false; |
595 | |
596 | // Whether inlining is decided by cost-benefit analysis. |
597 | bool DecidedByCostBenefit = false; |
598 | |
599 | // The cost-benefit pair computed by cost-benefit analysis. |
600 | std::optional<CostBenefitPair> CostBenefit; |
601 | |
602 | bool SingleBB = true; |
603 | |
604 | unsigned SROACostSavings = 0; |
605 | unsigned SROACostSavingsLost = 0; |
606 | |
607 | /// The mapping of caller Alloca values to their accumulated cost savings. If |
608 | /// we have to disable SROA for one of the allocas, this tells us how much |
609 | /// cost must be added. |
610 | DenseMap<AllocaInst *, int> SROAArgCosts; |
611 | |
612 | /// Return true if \p Call is a cold callsite. |
613 | bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI); |
614 | |
615 | /// Update Threshold based on callsite properties such as callee |
616 | /// attributes and callee hotness for PGO builds. The Callee is explicitly |
617 | /// passed to support analyzing indirect calls whose target is inferred by |
618 | /// analysis. |
619 | void updateThreshold(CallBase &Call, Function &Callee); |
620 | /// Return a higher threshold if \p Call is a hot callsite. |
621 | std::optional<int> getHotCallSiteThreshold(CallBase &Call, |
622 | BlockFrequencyInfo *CallerBFI); |
623 | |
624 | /// Handle a capped 'int' increment for Cost. |
625 | void addCost(int64_t Inc) { |
626 | Inc = std::clamp<int64_t>(val: Inc, INT_MIN, INT_MAX); |
627 | Cost = std::clamp<int64_t>(val: Inc + Cost, INT_MIN, INT_MAX); |
628 | } |
629 | |
630 | void onDisableSROA(AllocaInst *Arg) override { |
631 | auto CostIt = SROAArgCosts.find(Val: Arg); |
632 | if (CostIt == SROAArgCosts.end()) |
633 | return; |
634 | addCost(Inc: CostIt->second); |
635 | SROACostSavings -= CostIt->second; |
636 | SROACostSavingsLost += CostIt->second; |
637 | SROAArgCosts.erase(I: CostIt); |
638 | } |
639 | |
640 | void onDisableLoadElimination() override { |
641 | addCost(Inc: LoadEliminationCost); |
642 | LoadEliminationCost = 0; |
643 | } |
644 | |
645 | bool onCallBaseVisitStart(CallBase &Call) override { |
646 | if (std::optional<int> AttrCallThresholdBonus = |
647 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-threshold-bonus" )) |
648 | Threshold += *AttrCallThresholdBonus; |
649 | |
650 | if (std::optional<int> AttrCallCost = |
651 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-inline-cost" )) { |
652 | addCost(Inc: *AttrCallCost); |
653 | // Prevent further processing of the call since we want to override its |
654 | // inline cost, not just add to it. |
655 | return false; |
656 | } |
657 | return true; |
658 | } |
659 | |
660 | void onCallPenalty() override { addCost(Inc: CallPenalty); } |
661 | |
662 | void onMemAccess() override { addCost(Inc: MemAccessCost); } |
663 | |
664 | void onCallArgumentSetup(const CallBase &Call) override { |
665 | // Pay the price of the argument setup. We account for the average 1 |
666 | // instruction per call argument setup here. |
667 | addCost(Inc: Call.arg_size() * InstrCost); |
668 | } |
669 | void onLoadRelativeIntrinsic() override { |
670 | // This is normally lowered to 4 LLVM instructions. |
671 | addCost(Inc: 3 * InstrCost); |
672 | } |
  /// Account for a call that will remain a real call after inlining.
  /// \p F is the known callee (for an indirect call, the function the pointer
  /// was resolved to); \p IsIndirectCall distinguishes the two cases.
  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Inc: Call.arg_size() * InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(a: 0, b: CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(Inc: TTI.getInlineCallPenalty(F: CandidateCall.getCaller(), Call,
                                         DefaultCallPenalty: CallPenalty));
  }
701 | |
702 | void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
703 | bool DefaultDestUndefined) override { |
704 | if (!DefaultDestUndefined) |
705 | addCost(Inc: 2 * InstrCost); |
706 | // If suitable for a jump table, consider the cost for the table size and |
707 | // branch to destination. |
708 | // Maximum valid cost increased in this function. |
709 | if (JumpTableSize) { |
710 | int64_t JTCost = |
711 | static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost; |
712 | addCost(Inc: JTCost); |
713 | return; |
714 | } |
715 | |
716 | if (NumCaseCluster <= 3) { |
717 | // Suppose a comparison includes one compare and one conditional branch. |
718 | addCost(Inc: NumCaseCluster * 2 * InstrCost); |
719 | return; |
720 | } |
721 | |
722 | int64_t ExpectedNumberOfCompare = |
723 | getExpectedNumberOfCompare(NumCaseCluster); |
724 | int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost; |
725 | |
726 | addCost(Inc: SwitchCost); |
727 | } |
728 | void onMissedSimplification() override { addCost(Inc: InstrCost); } |
729 | |
730 | void onInitializeSROAArg(AllocaInst *Arg) override { |
731 | assert(Arg != nullptr && |
732 | "Should not initialize SROA costs for null value." ); |
733 | auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg); |
734 | SROACostSavings += SROAArgCost; |
735 | SROAArgCosts[Arg] = SROAArgCost; |
736 | } |
737 | |
738 | void onAggregateSROAUse(AllocaInst *SROAArg) override { |
739 | auto CostIt = SROAArgCosts.find(Val: SROAArg); |
740 | assert(CostIt != SROAArgCosts.end() && |
741 | "expected this argument to have a cost" ); |
742 | CostIt->second += InstrCost; |
743 | SROACostSavings += InstrCost; |
744 | } |
745 | |
746 | void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; } |
747 | |
748 | void onBlockAnalyzed(const BasicBlock *BB) override { |
749 | if (CostBenefitAnalysisEnabled) { |
750 | // Keep track of the static size of live but cold basic blocks. For now, |
751 | // we define a cold basic block to be one that's never executed. |
752 | assert(GetBFI && "GetBFI must be available" ); |
753 | BlockFrequencyInfo *BFI = &(GetBFI(F)); |
754 | assert(BFI && "BFI must be available" ); |
755 | auto ProfileCount = BFI->getBlockProfileCount(BB); |
756 | if (*ProfileCount == 0) |
757 | ColdSize += Cost - CostAtBBStart; |
758 | } |
759 | |
760 | auto *TI = BB->getTerminator(); |
761 | // If we had any successors at this point, than post-inlining is likely to |
762 | // have them as well. Note that we assume any basic blocks which existed |
763 | // due to branches or switches which folded above will also fold after |
764 | // inlining. |
765 | if (SingleBB && TI->getNumSuccessors() > 1) { |
766 | // Take off the bonus we applied to the threshold. |
767 | Threshold -= SingleBBBonus; |
768 | SingleBB = false; |
769 | } |
770 | } |
771 | |
772 | void onInstructionAnalysisStart(const Instruction *I) override { |
773 | // This function is called to store the initial cost of inlining before |
774 | // the given instruction was assessed. |
775 | if (!PrintInstructionComments) |
776 | return; |
777 | InstructionCostDetailMap[I].CostBefore = Cost; |
778 | InstructionCostDetailMap[I].ThresholdBefore = Threshold; |
779 | } |
780 | |
781 | void onInstructionAnalysisFinish(const Instruction *I) override { |
782 | // This function is called to find new values of cost and threshold after |
783 | // the instruction has been assessed. |
784 | if (!PrintInstructionComments) |
785 | return; |
786 | InstructionCostDetailMap[I].CostAfter = Cost; |
787 | InstructionCostDetailMap[I].ThresholdAfter = Threshold; |
788 | } |
789 | |
790 | bool isCostBenefitAnalysisEnabled() { |
791 | if (!PSI || !PSI->hasProfileSummary()) |
792 | return false; |
793 | |
794 | if (!GetBFI) |
795 | return false; |
796 | |
797 | if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) { |
798 | // Honor the explicit request from the user. |
799 | if (!InlineEnableCostBenefitAnalysis) |
800 | return false; |
801 | } else { |
802 | // Otherwise, require instrumentation profile. |
803 | if (!(PSI->hasInstrumentationProfile() || PSI->hasSampleProfile())) |
804 | return false; |
805 | } |
806 | |
807 | auto *Caller = CandidateCall.getParent()->getParent(); |
808 | if (!Caller->getEntryCount()) |
809 | return false; |
810 | |
811 | BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller)); |
812 | if (!CallerBFI) |
813 | return false; |
814 | |
815 | // For now, limit to hot call site. |
816 | if (!PSI->isHotCallSite(CB: CandidateCall, BFI: CallerBFI)) |
817 | return false; |
818 | |
819 | // Make sure we have a nonzero entry count. |
820 | auto EntryCount = F.getEntryCount(); |
821 | if (!EntryCount || !EntryCount->getCount()) |
822 | return false; |
823 | |
824 | BlockFrequencyInfo *CalleeBFI = &(GetBFI(F)); |
825 | if (!CalleeBFI) |
826 | return false; |
827 | |
828 | return true; |
829 | } |
830 | |
831 | // A helper function to choose between command line override and default. |
832 | unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const { |
833 | if (InlineSavingsMultiplier.getNumOccurrences()) |
834 | return InlineSavingsMultiplier; |
835 | return TTI.getInliningCostBenefitAnalysisSavingsMultiplier(); |
836 | } |
837 | |
838 | // A helper function to choose between command line override and default. |
839 | unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const { |
840 | if (InlineSavingsProfitableMultiplier.getNumOccurrences()) |
841 | return InlineSavingsProfitableMultiplier; |
842 | return TTI.getInliningCostBenefitAnalysisProfitableMultiplier(); |
843 | } |
844 | |
845 | void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) { |
846 | if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt( |
847 | CB&: CandidateCall, AttrKind: "inline-cycle-savings-for-test" )) { |
848 | CycleSavings = *AttrCycleSavings; |
849 | } |
850 | |
851 | if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt( |
852 | CB&: CandidateCall, AttrKind: "inline-runtime-cost-for-test" )) { |
853 | Size = *AttrRuntimeCost; |
854 | } |
855 | } |
856 | |
  // Determine whether we should inline the given call site, taking into account
  // both the size cost and the cycle savings. Return std::nullopt if we don't
  // have sufficient profiling information to determine.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return std::nullopt;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(Val: &I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              isa_and_nonnull<ConstantInt>(
                  Val: SimplifiedValues.lookup(Val: BI->getCondition()))) {
            CurrentSavings += InstrCost;
          }
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: &I)) {
          // Likewise, a switch whose condition folds to a constant is savings.
          if (isa_and_present<ConstantInt>(Val: SimplifiedValues.lookup(Val: SI->getCondition())))
            CurrentSavings += InstrCost;
        } else if (Value *V = dyn_cast<Value>(Val: &I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(Val: V)) {
            CurrentSavings += InstrCost;
          }
        }
      }

      // Scale the per-block savings by the block's dynamic profile count.
      // NOTE(review): getBlockProfileCount returns std::optional and is
      // dereferenced here unchecked — presumably counts are always available
      // when CostBenefitAnalysisEnabled is set; confirm.
      auto ProfileCount = CalleeBFI->getBlockProfileCount(BB: &BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call, rounding to nearest when dividing
    // by the callee's entry count.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(RHS: EntryCount);

    // Compute the total savings for the call site.
    // NOTE(review): another unchecked std::optional dereference below —
    // confirm the caller block always has a profile count here.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(TTI, Call: this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(BB: CallerBB);

    // Remove the cost of the cold basic blocks to model the runtime cost more
    // accurately. Both machine block placement and function splitting could
    // place cold blocks further from hot blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
    CostBenefit.emplace(args: APInt(128, Size), args&: CycleSavings);

    // Let R be the ratio of CycleSavings to Size. We accept the inlining
    // opportunity if R is really high and reject if R is really low. If R is
    // somewhere in the middle, we fall back to the cost-based analysis.
    //
    // Specifically, let R = CycleSavings / Size, we accept the inlining
    // opportunity if:
    //
    //             PSI->getOrCompHotCountThreshold()
    // R > -------------------------------------------------
    //     getInliningCostBenefitAnalysisSavingsMultiplier()
    //
    // and reject the inlining opportunity if:
    //
    //                PSI->getOrCompHotCountThreshold()
    // R <= ----------------------------------------------------
    //      getInliningCostBenefitAnalysisProfitableMultiplier()
    //
    // Otherwise, we fall back to the cost-based analysis.
    //
    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
    // HotCountThreshold * Size) rather than division to avoid precision loss.
    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
    Threshold *= Size;

    APInt UpperBoundCycleSavings = CycleSavings;
    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
    if (UpperBoundCycleSavings.uge(RHS: Threshold))
      return true;

    APInt LowerBoundCycleSavings = CycleSavings;
    LowerBoundCycleSavings *=
        getInliningCostBenefitAnalysisProfitableMultiplier();
    if (LowerBoundCycleSavings.ult(RHS: Threshold))
      return false;

    // Otherwise, fall back to the cost-based analysis.
    return std::nullopt;
  }
976 | |
  /// Finish the analysis: apply the minsize loop penalty, settle the vector
  /// bonus, honor test-only attribute overrides, and decide between the
  /// cost-benefit verdict and the plain cost-vs-threshold comparison.
  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(Ptr: L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(Inc: NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    // Testing hooks: call-site string attributes may force the cost, scale
    // it, or force the threshold.
    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-cost" ))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CB&: CandidateCall,
            AttrKind: InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-threshold" ))
      Threshold = *AttrThreshold;

    // Prefer the profile-guided cost-benefit verdict when one is available.
    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure(Reason: "Cost over threshold." );
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(a: 1, b: Threshold)
               ? InlineResult::success()
               : InlineResult::failure(Reason: "Cost over threshold." );
  }
1034 | |
1035 | bool shouldStop() override { |
1036 | if (IgnoreThreshold || ComputeFullInlineCost) |
1037 | return false; |
1038 | // Bail out the moment we cross the threshold. This means we'll under-count |
1039 | // the cost, but only when undercounting doesn't matter. |
1040 | if (Cost < Threshold) |
1041 | return false; |
1042 | DecidedByCostThreshold = true; |
1043 | return true; |
1044 | } |
1045 | |
1046 | void onLoadEliminationOpportunity() override { |
1047 | LoadEliminationCost += InstrCost; |
1048 | } |
1049 | |
  /// Seed the cost/threshold state from the call-site properties before the
  /// callee body is walked.
  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are some what arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties
    updateThreshold(Call&: CandidateCall, Callee&: F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(Inc: -getCallsiteCost(TTI, Call: this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to inline
    // it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n" );

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure(Reason: "high cost" );

    return InlineResult::success();
  }
1096 | |
public:
  /// Construct an analyzer for inlining \p Callee at the site \p Call.
  /// \p BoostIndirect enables the devirtualization bonus for indirect calls;
  /// \p IgnoreThreshold makes the final decision ignore the cost threshold.
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
        // NOTE: isCostBenefitAnalysisEnabled() is evaluated twice in this
        // init list (here and for CostBenefitAnalysisEnabled below); it only
        // reads state set up by the CallAnalyzer base initialized above.
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = *Params.AllowRecursiveCall;
  }
1116 | |
  /// Annotation Writer for instruction details
  InlineCostAnnotationWriter Writer;

  // Debug-build dump of the analysis state; see print() for the stable form.
  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);
1125 | |
1126 | std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) { |
1127 | if (InstructionCostDetailMap.contains(Val: I)) |
1128 | return InstructionCostDetailMap[I]; |
1129 | return std::nullopt; |
1130 | } |
1131 | |
  virtual ~InlineCostCallAnalyzer() = default;
  // Accessors for the final analysis results.
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  int getStaticBonusApplied() const { return StaticBonusApplied; }
  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1139 | }; |
1140 | |
1141 | // Return true if CB is the sole call to local function Callee. |
1142 | static bool isSoleCallToLocalFunction(const CallBase &CB, |
1143 | const Function &Callee) { |
1144 | return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() && |
1145 | &Callee == CB.getCalledFunction(); |
1146 | } |
1147 | |
/// Cost-model visitor that populates the feature vector consumed by the
/// ML-based inline advisor, rather than a single scalar cost.
class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
private:
  // The feature vector being populated; one slot per InlineCostFeatureIndex.
  InlineCostFeatures Cost = {};

  // FIXME: These constants are taken from the heuristic-based cost visitor.
  // These should be removed entirely in a later revision to avoid reliance on
  // heuristics in the ML inliner.
  static constexpr int JTCostMultiplier = 4;
  static constexpr int CaseClusterCostMultiplier = 2;
  static constexpr int SwitchDefaultDestCostMultiplier = 2;
  static constexpr int SwitchCostMultiplier = 2;

  // FIXME: These are taken from the heuristic-based cost visitor: we should
  // eventually abstract these to the CallAnalyzer to avoid duplication.
  unsigned SROACostSavingOpportunities = 0;
  int VectorBonus = 0;
  int SingleBBBonus = 0;
  int Threshold = 5;

  // Per-alloca accumulated SROA savings, mirroring the heuristic analyzer.
  DenseMap<AllocaInst *, unsigned> SROACosts;

  // Add Delta to the given feature slot.
  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
    Cost[static_cast<size_t>(Feature)] += Delta;
  }

  // Overwrite the given feature slot with Value.
  void set(InlineCostFeatureIndex Feature, int64_t Value) {
    Cost[static_cast<size_t>(Feature)] = Value;
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROACosts.find(Val: Arg);
    if (CostIt == SROACosts.end())
      return;

    // Move the accumulated savings into the losses feature and forget the
    // alloca.
    increment(Feature: InlineCostFeatureIndex::sroa_losses, Delta: CostIt->second);
    SROACostSavingOpportunities -= CostIt->second;
    SROACosts.erase(I: CostIt);
  }

  void onDisableLoadElimination() override {
    set(Feature: InlineCostFeatureIndex::load_elimination, Value: 1);
  }

  void onCallPenalty() override {
    increment(Feature: InlineCostFeatureIndex::call_penalty, Delta: CallPenalty);
  }

  void onCallArgumentSetup(const CallBase &Call) override {
    increment(Feature: InlineCostFeatureIndex::call_argument_setup,
              Delta: Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    increment(Feature: InlineCostFeatureIndex::load_relative_intrinsic, Delta: 3 * InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    increment(Feature: InlineCostFeatureIndex::lowered_call_arg_setup,
              Delta: Call.arg_size() * InstrCost);

    if (IsIndirectCall) {
      // Analyze the resolved target with a nested cost analyzer and record
      // the nested estimate as its own feature.
      InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
                                         /*HintThreshold*/ {},
                                         /*ColdThreshold*/ {},
                                         /*OptSizeThreshold*/ {},
                                         /*OptMinSizeThreshold*/ {},
                                         /*HotCallSiteThreshold*/ {},
                                         /*LocallyHotCallSiteThreshold*/ {},
                                         /*ColdCallSiteThreshold*/ {},
                                         /*ComputeFullInlineCost*/ true,
                                         /*EnableDeferral*/ true};
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;

      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false,
                                true);
      if (CA.analyze().isSuccess()) {
        increment(Feature: InlineCostFeatureIndex::nested_inline_cost_estimate,
                  Delta: CA.getCost());
        increment(Feature: InlineCostFeatureIndex::nested_inlines, Delta: 1);
      }
    } else {
      onCallPenalty();
    }
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUndefined) override {
    if (!DefaultDestUndefined)
      increment(Feature: InlineCostFeatureIndex::switch_default_dest_penalty,
                Delta: SwitchDefaultDestCostMultiplier * InstrCost);

    if (JumpTableSize) {
      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                       JTCostMultiplier * InstrCost;
      increment(Feature: InlineCostFeatureIndex::jump_table_penalty, Delta: JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      increment(Feature: InlineCostFeatureIndex::case_cluster_penalty,
                Delta: NumCaseCluster * CaseClusterCostMultiplier * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);

    int64_t SwitchCost =
        ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
    increment(Feature: InlineCostFeatureIndex::switch_penalty, Delta: SwitchCost);
  }

  void onMissedSimplification() override {
    increment(Feature: InlineCostFeatureIndex::unsimplified_common_instructions,
              Delta: InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg);
    SROACosts[Arg] = SROAArgCost;
    SROACostSavingOpportunities += SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *Arg) override {
    // NOTE(review): unlike InlineCostCallAnalyzer::onAggregateSROAUse, this
    // dereferences find() without asserting the entry exists — presumably the
    // caller guarantees Arg was registered via onInitializeSROAArg; confirm.
    SROACosts.find(Val: Arg)->second += InstrCost;
    SROACostSavingOpportunities += InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (BB->getTerminator()->getNumSuccessors() > 1)
      set(Feature: InlineCostFeatureIndex::is_multiple_blocks, Value: 1);
    Threshold -= SingleBBBonus;
  }

  InlineResult finalizeAnalysis() override {
    // Penalize loops in minsize callers, mirroring the heuristic analyzer.
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(Ptr: L->getHeader()))
          continue;
        increment(Feature: InlineCostFeatureIndex::num_loops,
                  Delta: InlineConstants::LoopPenalty);
      }
    }
    // Publish the counters accumulated by the base CallAnalyzer as features.
    set(Feature: InlineCostFeatureIndex::dead_blocks, Value: DeadBlocks.size());
    set(Feature: InlineCostFeatureIndex::simplified_instructions,
        Value: NumInstructionsSimplified);
    set(Feature: InlineCostFeatureIndex::constant_args, Value: NumConstantArgs);
    set(Feature: InlineCostFeatureIndex::constant_offset_ptr_args,
        Value: NumConstantOffsetPtrArgs);
    set(Feature: InlineCostFeatureIndex::sroa_savings, Value: SROACostSavingOpportunities);

    // Take back the excess vector bonus, as in the heuristic analyzer.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    set(Feature: InlineCostFeatureIndex::threshold, Value: Threshold);

    return InlineResult::success();
  }

  // Feature collection never bails out early.
  bool shouldStop() override { return false; }

  void onLoadEliminationOpportunity() override {
    increment(Feature: InlineCostFeatureIndex::load_elimination, Delta: 1);
  }

  InlineResult onAnalysisStart() override {
    increment(Feature: InlineCostFeatureIndex::callsite_cost,
              Delta: -1 * getCallsiteCost(TTI, Call: this->CandidateCall, DL));

    set(Feature: InlineCostFeatureIndex::cold_cc_penalty,
        Value: (F.getCallingConv() == CallingConv::Cold));

    set(Feature: InlineCostFeatureIndex::last_call_to_static_bonus,
        Value: isSoleCallToLocalFunction(CB: CandidateCall, Callee: F));

    // FIXME: we shouldn't repeat this logic in both the Features and Cost
    // analyzer - instead, we should abstract it to a common method in the
    // CallAnalyzer
    int SingleBBBonusPercent = 50;
    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    Threshold += TTI.adjustInliningThreshold(CB: &CandidateCall);
    Threshold *= TTI.getInliningThresholdMultiplier();
    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
    VectorBonus = Threshold * VectorBonusPercent / 100;
    Threshold += (SingleBBBonus + VectorBonus);

    return InlineResult::success();
  }

public:
  InlineCostFeaturesAnalyzer(
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}

  // Return the populated feature vector.
  const InlineCostFeatures &features() const { return Cost; }
};
1357 | |
1358 | } // namespace |
1359 | |
1360 | /// Test whether the given value is an Alloca-derived function argument. |
1361 | bool CallAnalyzer::isAllocaDerivedArg(Value *V) { |
1362 | return SROAArgValues.count(Val: V); |
1363 | } |
1364 | |
1365 | void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) { |
1366 | onDisableSROA(Arg: SROAArg); |
1367 | EnabledSROAAllocas.erase(V: SROAArg); |
1368 | disableLoadElimination(); |
1369 | } |
1370 | |
1371 | void InlineCostAnnotationWriter::emitInstructionAnnot( |
1372 | const Instruction *I, formatted_raw_ostream &OS) { |
1373 | // The cost of inlining of the given instruction is printed always. |
1374 | // The threshold delta is printed only when it is non-zero. It happens |
1375 | // when we decided to give a bonus at a particular instruction. |
1376 | std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I); |
1377 | if (!Record) |
1378 | OS << "; No analysis for the instruction" ; |
1379 | else { |
1380 | OS << "; cost before = " << Record->CostBefore |
1381 | << ", cost after = " << Record->CostAfter |
1382 | << ", threshold before = " << Record->ThresholdBefore |
1383 | << ", threshold after = " << Record->ThresholdAfter << ", " ; |
1384 | OS << "cost delta = " << Record->getCostDelta(); |
1385 | if (Record->hasThresholdChanged()) |
1386 | OS << ", threshold delta = " << Record->getThresholdDelta(); |
1387 | } |
1388 | auto C = ICCA->getSimplifiedValue(I: const_cast<Instruction *>(I)); |
1389 | if (C) { |
1390 | OS << ", simplified to " ; |
1391 | (*C)->print(O&: OS, IsForDebug: true); |
1392 | } |
1393 | OS << "\n" ; |
1394 | } |
1395 | |
1396 | /// If 'V' maps to a SROA candidate, disable SROA for it. |
1397 | void CallAnalyzer::disableSROA(Value *V) { |
1398 | if (auto *SROAArg = getSROAArgForValueOrNull(V)) { |
1399 | disableSROAForArg(SROAArg); |
1400 | } |
1401 | } |
1402 | |
1403 | void CallAnalyzer::disableLoadElimination() { |
1404 | if (EnableLoadElimination) { |
1405 | onDisableLoadElimination(); |
1406 | EnableLoadElimination = false; |
1407 | } |
1408 | } |
1409 | |
/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(Ty: GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    // Every index must be (or simplify to) a constant integer, otherwise the
    // overall offset is not a compile-time constant.
    ConstantInt *OpC = dyn_cast<ConstantInt>(Val: GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(Val: GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(Val: SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(Ty: STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(Idx: ElementIdx));
      continue;
    }

    // Otherwise scale the (sign-extended) index by the element stride.
    APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
    Offset += OpC->getValue().sextOrTrunc(width: IntPtrWidth) * TypeSize;
  }
  return true;
}
1442 | |
1443 | /// Use TTI to check whether a GEP is free. |
1444 | /// |
1445 | /// Respects any simplified values known during the analysis of this callsite. |
1446 | bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) { |
1447 | SmallVector<Value *, 4> Operands; |
1448 | Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0)); |
1449 | for (const Use &Op : GEP.indices()) |
1450 | if (Constant *SimpleOp = SimplifiedValues.lookup(Val: Op)) |
1451 | Operands.push_back(Elt: SimpleOp); |
1452 | else |
1453 | Operands.push_back(Elt: Op); |
1454 | return TTI.getInstructionCost(U: &GEP, Operands, |
1455 | CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1456 | TargetTransformInfo::TCC_Free; |
1457 | } |
1458 | |
1459 | bool CallAnalyzer::visitAlloca(AllocaInst &I) { |
1460 | disableSROA(V: I.getOperand(i_nocapture: 0)); |
1461 | |
1462 | // Check whether inlining will turn a dynamic alloca into a static |
1463 | // alloca and handle that case. |
1464 | if (I.isArrayAllocation()) { |
1465 | Constant *Size = SimplifiedValues.lookup(Val: I.getArraySize()); |
1466 | if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) { |
1467 | // Sometimes a dynamic alloca could be converted into a static alloca |
1468 | // after this constant prop, and become a huge static alloca on an |
1469 | // unconditional CFG path. Avoid inlining if this is going to happen above |
1470 | // a threshold. |
1471 | // FIXME: If the threshold is removed or lowered too much, we could end up |
1472 | // being too pessimistic and prevent inlining non-problematic code. This |
1473 | // could result in unintended perf regressions. A better overall strategy |
1474 | // is needed to track stack usage during inlining. |
1475 | Type *Ty = I.getAllocatedType(); |
1476 | AllocatedSize = SaturatingMultiplyAdd( |
1477 | X: AllocSize->getLimitedValue(), |
1478 | Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize); |
1479 | if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline) |
1480 | HasDynamicAlloca = true; |
1481 | return false; |
1482 | } |
1483 | } |
1484 | |
1485 | // Accumulate the allocated size. |
1486 | if (I.isStaticAlloca()) { |
1487 | Type *Ty = I.getAllocatedType(); |
1488 | AllocatedSize = SaturatingAdd(X: DL.getTypeAllocSize(Ty).getKnownMinValue(), |
1489 | Y: AllocatedSize); |
1490 | } |
1491 | |
1492 | // FIXME: This is overly conservative. Dynamic allocas are inefficient for |
1493 | // a variety of reasons, and so we would like to not inline them into |
1494 | // functions which don't currently have a dynamic alloca. This simply |
1495 | // disables inlining altogether in the presence of a dynamic alloca. |
1496 | if (!I.isStaticAlloca()) |
1497 | HasDynamicAlloca = true; |
1498 | |
1499 | return false; |
1500 | } |
1501 | |
/// Model a PHI node: fold it to a single constant, or to a single base
/// pointer + constant offset, when every live incoming value agrees.
///
/// PHIs themselves are treated as free (always returns true); the purpose of
/// this visitor is purely to extend constant propagation, constant-offset
/// pointer tracking, and SROA viability across the PHI.
bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate it's bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use correct address space in the call to getPointerSizeInBits here?
  // Or could we skip the getPointerSizeInBits call completely? As far as I can
  // see the ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  // At most one of FirstC / FirstV is ever set: the first live incoming value
  // decides which kind of fold we are attempting.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Ptr: Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    // Resolve the incoming value to a constant if possible, consulting
    // simplifications recorded earlier in this callsite's analysis.
    Constant *C = dyn_cast<Constant>(Val: V);
    if (!C)
      C = SimplifiedValues.lookup(Val: V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer with
    // constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    // Keep SROA viable across the phi when the incoming pointer traces back
    // to a known SROA candidate argument.
    if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}
1595 | |
1596 | /// Check we can fold GEPs of constant-offset call site argument pointers. |
1597 | /// This requires target data and inbounds GEPs. |
1598 | /// |
1599 | /// \return true if the specified GEP can be folded. |
1600 | bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { |
1601 | // Check if we have a base + offset for the pointer. |
1602 | std::pair<Value *, APInt> BaseAndOffset = |
1603 | ConstantOffsetPtrs.lookup(Val: I.getPointerOperand()); |
1604 | if (!BaseAndOffset.first) |
1605 | return false; |
1606 | |
1607 | // Check if the offset of this GEP is constant, and if so accumulate it |
1608 | // into Offset. |
1609 | if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second)) |
1610 | return false; |
1611 | |
1612 | // Add the result as a new mapping to Base + Offset. |
1613 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1614 | |
1615 | return true; |
1616 | } |
1617 | |
1618 | bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { |
1619 | auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand()); |
1620 | |
1621 | // Lambda to check whether a GEP's indices are all constant. |
1622 | auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { |
1623 | for (const Use &Op : GEP.indices()) |
1624 | if (!isa<Constant>(Val: Op) && !SimplifiedValues.lookup(Val: Op)) |
1625 | return false; |
1626 | return true; |
1627 | }; |
1628 | |
1629 | if (!DisableGEPConstOperand) |
1630 | if (simplifyInstruction(I)) |
1631 | return true; |
1632 | |
1633 | if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { |
1634 | if (SROAArg) |
1635 | SROAArgValues[&I] = SROAArg; |
1636 | |
1637 | // Constant GEPs are modeled as free. |
1638 | return true; |
1639 | } |
1640 | |
1641 | // Variable GEPs will require math and will disable SROA. |
1642 | if (SROAArg) |
1643 | disableSROAForArg(SROAArg); |
1644 | return isGEPFree(GEP&: I); |
1645 | } |
1646 | |
1647 | /// Simplify \p I if its operands are constants and update SimplifiedValues. |
1648 | bool CallAnalyzer::simplifyInstruction(Instruction &I) { |
1649 | SmallVector<Constant *> COps; |
1650 | for (Value *Op : I.operands()) { |
1651 | Constant *COp = dyn_cast<Constant>(Val: Op); |
1652 | if (!COp) |
1653 | COp = SimplifiedValues.lookup(Val: Op); |
1654 | if (!COp) |
1655 | return false; |
1656 | COps.push_back(Elt: COp); |
1657 | } |
1658 | auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL); |
1659 | if (!C) |
1660 | return false; |
1661 | SimplifiedValues[&I] = C; |
1662 | return true; |
1663 | } |
1664 | |
1665 | /// Try to simplify a call to llvm.is.constant. |
1666 | /// |
1667 | /// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since |
1668 | /// we expect calls of this specific intrinsic to be infrequent. |
1669 | /// |
1670 | /// FIXME: Given that we know CB's parent (F) caller |
1671 | /// (CandidateCall->getParent()->getParent()), we might be able to determine |
1672 | /// whether inlining F into F's caller would change how the call to |
1673 | /// llvm.is.constant would evaluate. |
1674 | bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) { |
1675 | Value *Arg = CB.getArgOperand(i: 0); |
1676 | auto *C = dyn_cast<Constant>(Val: Arg); |
1677 | |
1678 | if (!C) |
1679 | C = dyn_cast_or_null<Constant>(Val: SimplifiedValues.lookup(Val: Arg)); |
1680 | |
1681 | Type *RT = CB.getFunctionType()->getReturnType(); |
1682 | SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0); |
1683 | return true; |
1684 | } |
1685 | |
1686 | bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) { |
1687 | // As per the langref, "The fourth argument to llvm.objectsize determines if |
1688 | // the value should be evaluated at runtime." |
1689 | if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne()) |
1690 | return false; |
1691 | |
1692 | Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr, |
1693 | /*MustSucceed=*/true); |
1694 | Constant *C = dyn_cast_or_null<Constant>(Val: V); |
1695 | if (C) |
1696 | SimplifiedValues[&CB] = C; |
1697 | return C; |
1698 | } |
1699 | |
1700 | bool CallAnalyzer::visitBitCast(BitCastInst &I) { |
1701 | // Propagate constants through bitcasts. |
1702 | if (simplifyInstruction(I)) |
1703 | return true; |
1704 | |
1705 | // Track base/offsets through casts |
1706 | std::pair<Value *, APInt> BaseAndOffset = |
1707 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); |
1708 | // Casts don't change the offset, just wrap it up. |
1709 | if (BaseAndOffset.first) |
1710 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1711 | |
1712 | // Also look for SROA candidates here. |
1713 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) |
1714 | SROAArgValues[&I] = SROAArg; |
1715 | |
1716 | // Bitcasts are always zero cost. |
1717 | return true; |
1718 | } |
1719 | |
1720 | bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { |
1721 | // Propagate constants through ptrtoint. |
1722 | if (simplifyInstruction(I)) |
1723 | return true; |
1724 | |
1725 | // Track base/offset pairs when converted to a plain integer provided the |
1726 | // integer is large enough to represent the pointer. |
1727 | unsigned IntegerSize = I.getType()->getScalarSizeInBits(); |
1728 | unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace(); |
1729 | if (IntegerSize == DL.getPointerSizeInBits(AS)) { |
1730 | std::pair<Value *, APInt> BaseAndOffset = |
1731 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); |
1732 | if (BaseAndOffset.first) |
1733 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1734 | } |
1735 | |
1736 | // This is really weird. Technically, ptrtoint will disable SROA. However, |
1737 | // unless that ptrtoint is *used* somewhere in the live basic blocks after |
1738 | // inlining, it will be nuked, and SROA should proceed. All of the uses which |
1739 | // would block SROA would also block SROA if applied directly to a pointer, |
1740 | // and so we can just add the integer in here. The only places where SROA is |
1741 | // preserved either cannot fire on an integer, or won't in-and-of themselves |
1742 | // disable SROA (ext) w/o some later use that we would see and disable. |
1743 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) |
1744 | SROAArgValues[&I] = SROAArg; |
1745 | |
1746 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1747 | TargetTransformInfo::TCC_Free; |
1748 | } |
1749 | |
1750 | bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { |
1751 | // Propagate constants through ptrtoint. |
1752 | if (simplifyInstruction(I)) |
1753 | return true; |
1754 | |
1755 | // Track base/offset pairs when round-tripped through a pointer without |
1756 | // modifications provided the integer is not too large. |
1757 | Value *Op = I.getOperand(i_nocapture: 0); |
1758 | unsigned IntegerSize = Op->getType()->getScalarSizeInBits(); |
1759 | if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) { |
1760 | std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op); |
1761 | if (BaseAndOffset.first) |
1762 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1763 | } |
1764 | |
1765 | // "Propagate" SROA here in the same manner as we do for ptrtoint above. |
1766 | if (auto *SROAArg = getSROAArgForValueOrNull(V: Op)) |
1767 | SROAArgValues[&I] = SROAArg; |
1768 | |
1769 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1770 | TargetTransformInfo::TCC_Free; |
1771 | } |
1772 | |
1773 | bool CallAnalyzer::visitCastInst(CastInst &I) { |
1774 | // Propagate constants through casts. |
1775 | if (simplifyInstruction(I)) |
1776 | return true; |
1777 | |
1778 | // Disable SROA in the face of arbitrary casts we don't explicitly list |
1779 | // elsewhere. |
1780 | disableSROA(V: I.getOperand(i_nocapture: 0)); |
1781 | |
1782 | // If this is a floating-point cast, and the target says this operation |
1783 | // is expensive, this may eventually become a library call. Treat the cost |
1784 | // as such. |
1785 | switch (I.getOpcode()) { |
1786 | case Instruction::FPTrunc: |
1787 | case Instruction::FPExt: |
1788 | case Instruction::UIToFP: |
1789 | case Instruction::SIToFP: |
1790 | case Instruction::FPToUI: |
1791 | case Instruction::FPToSI: |
1792 | if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive) |
1793 | onCallPenalty(); |
1794 | break; |
1795 | default: |
1796 | break; |
1797 | } |
1798 | |
1799 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1800 | TargetTransformInfo::TCC_Free; |
1801 | } |
1802 | |
1803 | bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) { |
1804 | return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr); |
1805 | } |
1806 | |
1807 | bool CallAnalyzer::isKnownNonNullInCallee(Value *V) { |
1808 | // Does the *call site* have the NonNull attribute set on an argument? We |
1809 | // use the attribute on the call site to memoize any analysis done in the |
1810 | // caller. This will also trip if the callee function has a non-null |
1811 | // parameter attribute, but that's a less interesting case because hopefully |
1812 | // the callee would already have been simplified based on that. |
1813 | if (Argument *A = dyn_cast<Argument>(Val: V)) |
1814 | if (paramHasAttr(A, Attribute::Attr: NonNull)) |
1815 | return true; |
1816 | |
1817 | // Is this an alloca in the caller? This is distinct from the attribute case |
1818 | // above because attributes aren't updated within the inliner itself and we |
1819 | // always want to catch the alloca derived case. |
1820 | if (isAllocaDerivedArg(V)) |
1821 | // We can actually predict the result of comparisons between an |
1822 | // alloca-derived value and null. Note that this fires regardless of |
1823 | // SROA firing. |
1824 | return true; |
1825 | |
1826 | return false; |
1827 | } |
1828 | |
1829 | bool CallAnalyzer::allowSizeGrowth(CallBase &Call) { |
1830 | // If the normal destination of the invoke or the parent block of the call |
1831 | // site is unreachable-terminated, there is little point in inlining this |
1832 | // unless there is literally zero cost. |
1833 | // FIXME: Note that it is possible that an unreachable-terminated block has a |
1834 | // hot entry. For example, in below scenario inlining hot_call_X() may be |
1835 | // beneficial : |
1836 | // main() { |
1837 | // hot_call_1(); |
1838 | // ... |
1839 | // hot_call_N() |
1840 | // exit(0); |
1841 | // } |
1842 | // For now, we are not handling this corner case here as it is rare in real |
1843 | // code. In future, we should elaborate this based on BPI and BFI in more |
1844 | // general threshold adjusting heuristics in updateThreshold(). |
1845 | if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) { |
1846 | if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator())) |
1847 | return false; |
1848 | } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator())) |
1849 | return false; |
1850 | |
1851 | return true; |
1852 | } |
1853 | |
1854 | bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call, |
1855 | BlockFrequencyInfo *CallerBFI) { |
1856 | // If global profile summary is available, then callsite's coldness is |
1857 | // determined based on that. |
1858 | if (PSI && PSI->hasProfileSummary()) |
1859 | return PSI->isColdCallSite(CB: Call, BFI: CallerBFI); |
1860 | |
1861 | // Otherwise we need BFI to be available. |
1862 | if (!CallerBFI) |
1863 | return false; |
1864 | |
1865 | // Determine if the callsite is cold relative to caller's entry. We could |
1866 | // potentially cache the computation of scaled entry frequency, but the added |
1867 | // complexity is not worth it unless this scaling shows up high in the |
1868 | // profiles. |
1869 | const BranchProbability ColdProb(ColdCallSiteRelFreq, 100); |
1870 | auto CallSiteBB = Call.getParent(); |
1871 | auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); |
1872 | auto CallerEntryFreq = |
1873 | CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock())); |
1874 | return CallSiteFreq < CallerEntryFreq * ColdProb; |
1875 | } |
1876 | |
1877 | std::optional<int> |
1878 | InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call, |
1879 | BlockFrequencyInfo *CallerBFI) { |
1880 | |
1881 | // If global profile summary is available, then callsite's hotness is |
1882 | // determined based on that. |
1883 | if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI)) |
1884 | return Params.HotCallSiteThreshold; |
1885 | |
1886 | // Otherwise we need BFI to be available and to have a locally hot callsite |
1887 | // threshold. |
1888 | if (!CallerBFI || !Params.LocallyHotCallSiteThreshold) |
1889 | return std::nullopt; |
1890 | |
1891 | // Determine if the callsite is hot relative to caller's entry. We could |
1892 | // potentially cache the computation of scaled entry frequency, but the added |
1893 | // complexity is not worth it unless this scaling shows up high in the |
1894 | // profiles. |
1895 | const BasicBlock *CallSiteBB = Call.getParent(); |
1896 | BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); |
1897 | BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq(); |
1898 | std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq); |
1899 | if (Limit && CallSiteFreq >= *Limit) |
1900 | return Params.LocallyHotCallSiteThreshold; |
1901 | |
1902 | // Otherwise treat it normally. |
1903 | return std::nullopt; |
1904 | } |
1905 | |
/// Compute the inlining threshold for this call site.
///
/// Starts from the current Threshold and adjusts it for caller size
/// attributes (minsize/optsize), the callee's inlinehint, and callsite/callee
/// hotness or coldness derived from PSI and/or BFI. Also computes the
/// speculative SingleBBBonus and VectorBonus, and applies the
/// last-call-to-static bonus directly to Cost.
void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, std::optional<int> B) {
    return B ? std::min(a: A, b: *B) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, std::optional<int> B) {
    return B ? std::max(a: A, b: *B) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n" );
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = *HotCallSiteThreshold;
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n" );
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(F: &Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n" );
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(F: &Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n" );
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Let the target make its own arbitrary per-callsite adjustment.
  Threshold += TTI.adjustInliningThreshold(CB: &Call);

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  // Bonuses are expressed as percentages of the (now final) threshold.
  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (isSoleCallToLocalFunction(CB: Call, Callee: F)) {
    Cost -= LastCallToStaticBonus;
    StaticBonusApplied = LastCallToStaticBonus;
  }
}
2029 | |
/// Model a comparison: try several constant-folding strategies in order, and
/// fall back to SROA bookkeeping when none applies. Returns true when the
/// compare is considered simplified/free.
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I))
    return true;

  // The pointer-offset folding below only applies to integer compares.
  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
  if (LHSBase) {
    std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset);
      Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(pred: I.getPredicate(), LHS: CLHS, RHS: CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // True when every user of the compare carries !make.implicit metadata,
  // i.e. the compare only feeds implicit null checks.
  auto isImplicitNullCheckCmp = [](const CmpInst &I) {
    for (auto *User : I.users())
      if (auto *Instr = dyn_cast<Instruction>(Val: User))
        if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit))
          return false;
    return true;
  };

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null
  if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) {
    if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) {
      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType())
                                        : ConstantInt::getFalse(Ty: I.getType());
      return true;
    }
    // Implicit null checks act as unconditional branches and their comparisons
    // should be treated as simplified and free of cost.
    if (isImplicitNullCheckCmp(I))
      return true;
  }
  // Null compares keep SROA viable; anything else disables it for the operand.
  return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1)));
}
2083 | |
2084 | bool CallAnalyzer::visitSub(BinaryOperator &I) { |
2085 | // Try to handle a special case: we can fold computing the difference of two |
2086 | // constant-related pointers. |
2087 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
2088 | Value *LHSBase, *RHSBase; |
2089 | APInt LHSOffset, RHSOffset; |
2090 | std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS); |
2091 | if (LHSBase) { |
2092 | std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS); |
2093 | if (RHSBase && LHSBase == RHSBase) { |
2094 | // We have common bases, fold the subtract to a constant based on the |
2095 | // offsets. |
2096 | Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset); |
2097 | Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset); |
2098 | if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) { |
2099 | SimplifiedValues[&I] = C; |
2100 | ++NumConstantPtrDiffs; |
2101 | return true; |
2102 | } |
2103 | } |
2104 | } |
2105 | |
2106 | // Otherwise, fall back to the generic logic for simplifying and handling |
2107 | // instructions. |
2108 | return Base::visitSub(I); |
2109 | } |
2110 | |
2111 | bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { |
2112 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
2113 | Constant *CLHS = dyn_cast<Constant>(Val: LHS); |
2114 | if (!CLHS) |
2115 | CLHS = SimplifiedValues.lookup(Val: LHS); |
2116 | Constant *CRHS = dyn_cast<Constant>(Val: RHS); |
2117 | if (!CRHS) |
2118 | CRHS = SimplifiedValues.lookup(Val: RHS); |
2119 | |
2120 | Value *SimpleV = nullptr; |
2121 | if (auto FI = dyn_cast<FPMathOperator>(Val: &I)) |
2122 | SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, |
2123 | FMF: FI->getFastMathFlags(), Q: DL); |
2124 | else |
2125 | SimpleV = |
2126 | simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL); |
2127 | |
2128 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) |
2129 | SimplifiedValues[&I] = C; |
2130 | |
2131 | if (SimpleV) |
2132 | return true; |
2133 | |
2134 | // Disable any SROA on arguments to arbitrary, unsimplified binary operators. |
2135 | disableSROA(V: LHS); |
2136 | disableSROA(V: RHS); |
2137 | |
2138 | // If the instruction is floating point, and the target says this operation |
2139 | // is expensive, this may eventually become a library call. Treat the cost |
2140 | // as such. Unless it's fneg which can be implemented with an xor. |
2141 | using namespace llvm::PatternMatch; |
2142 | if (I.getType()->isFloatingPointTy() && |
2143 | TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive && |
2144 | !match(V: &I, P: m_FNeg(X: m_Value()))) |
2145 | onCallPenalty(); |
2146 | |
2147 | return false; |
2148 | } |
2149 | |
2150 | bool CallAnalyzer::visitFNeg(UnaryOperator &I) { |
2151 | Value *Op = I.getOperand(i_nocapture: 0); |
2152 | Constant *COp = dyn_cast<Constant>(Val: Op); |
2153 | if (!COp) |
2154 | COp = SimplifiedValues.lookup(Val: Op); |
2155 | |
2156 | Value *SimpleV = simplifyFNegInst( |
2157 | Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL); |
2158 | |
2159 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) |
2160 | SimplifiedValues[&I] = C; |
2161 | |
2162 | if (SimpleV) |
2163 | return true; |
2164 | |
2165 | // Disable any SROA on arguments to arbitrary, unsimplified fneg. |
2166 | disableSROA(V: Op); |
2167 | |
2168 | return false; |
2169 | } |
2170 | |
2171 | bool CallAnalyzer::visitLoad(LoadInst &I) { |
2172 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) |
2173 | return true; |
2174 | |
2175 | // If the data is already loaded from this address and hasn't been clobbered |
2176 | // by any stores or calls, this load is likely to be redundant and can be |
2177 | // eliminated. |
2178 | if (EnableLoadElimination && |
2179 | !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) { |
2180 | onLoadEliminationOpportunity(); |
2181 | return true; |
2182 | } |
2183 | |
2184 | onMemAccess(); |
2185 | return false; |
2186 | } |
2187 | |
2188 | bool CallAnalyzer::visitStore(StoreInst &I) { |
2189 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) |
2190 | return true; |
2191 | |
2192 | // The store can potentially clobber loads and prevent repeated loads from |
2193 | // being eliminated. |
2194 | // FIXME: |
2195 | // 1. We can probably keep an initial set of eliminatable loads substracted |
2196 | // from the cost even when we finally see a store. We just need to disable |
2197 | // *further* accumulation of elimination savings. |
2198 | // 2. We should probably at some point thread MemorySSA for the callee into |
2199 | // this and then use that to actually compute *really* precise savings. |
2200 | disableLoadElimination(); |
2201 | |
2202 | onMemAccess(); |
2203 | return false; |
2204 | } |
2205 | |
2206 | bool CallAnalyzer::(ExtractValueInst &I) { |
2207 | // Constant folding for extract value is trivial. |
2208 | if (simplifyInstruction(I)) |
2209 | return true; |
2210 | |
2211 | // SROA can't look through these, but they may be free. |
2212 | return Base::visitExtractValue(I); |
2213 | } |
2214 | |
2215 | bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { |
2216 | // Constant folding for insert value is trivial. |
2217 | if (simplifyInstruction(I)) |
2218 | return true; |
2219 | |
2220 | // SROA can't look through these, but they may be free. |
2221 | return Base::visitInsertValue(I); |
2222 | } |
2223 | |
2224 | /// Try to simplify a call site. |
2225 | /// |
2226 | /// Takes a concrete function and callsite and tries to actually simplify it by |
2227 | /// analyzing the arguments and call itself with instsimplify. Returns true if |
2228 | /// it has simplified the callsite to some other entity (a constant), making it |
2229 | /// free. |
2230 | bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) { |
2231 | // FIXME: Using the instsimplify logic directly for this is inefficient |
2232 | // because we have to continually rebuild the argument list even when no |
2233 | // simplifications can be performed. Until that is fixed with remapping |
2234 | // inside of instsimplify, directly constant fold calls here. |
2235 | if (!canConstantFoldCallTo(Call: &Call, F)) |
2236 | return false; |
2237 | |
2238 | // Try to re-map the arguments to constants. |
2239 | SmallVector<Constant *, 4> ConstantArgs; |
2240 | ConstantArgs.reserve(N: Call.arg_size()); |
2241 | for (Value *I : Call.args()) { |
2242 | Constant *C = dyn_cast<Constant>(Val: I); |
2243 | if (!C) |
2244 | C = dyn_cast_or_null<Constant>(Val: SimplifiedValues.lookup(Val: I)); |
2245 | if (!C) |
2246 | return false; // This argument doesn't map to a constant. |
2247 | |
2248 | ConstantArgs.push_back(Elt: C); |
2249 | } |
2250 | if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) { |
2251 | SimplifiedValues[&Call] = C; |
2252 | return true; |
2253 | } |
2254 | |
2255 | return false; |
2256 | } |
2257 | |
// Account for a call instruction inside the callee being analyzed. Handles:
// indirect calls that simplify to known callees, direct constant folding of
// the call, special-cased intrinsics, recursion detection, and calls that the
// target lowers to real machine calls. Returns true if the call is free
// (simplified away or otherwise costless), false if its base cost applies.
bool CallAnalyzer::visitCallBase(CallBase &Call) {
  if (!onCallBaseVisitStart(Call))
    return true;

  // Calling a returns_twice callee from a non-returns_twice caller is not
  // inlinable; flag it and abort the analysis.
  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  // NOTE: this local `F` (the called function) shadows the member `F` (the
  // callee currently under analysis) for the remainder of this method.
  Function *F = Call.getCalledFunction();
  bool IsIndirectCall = !F;
  if (IsIndirectCall) {
    // Check if this happens to be an indirect function call to a known function
    // in this inline context. If not, we've done all we can.
    Value *Callee = Call.getCalledOperand();
    F = dyn_cast_or_null<Function>(Val: SimplifiedValues.lookup(Val: Callee));
    if (!F || F->getFunctionType() != Call.getFunctionType()) {
      onCallArgumentSetup(Call);

      if (!Call.onlyReadsMemory())
        disableLoadElimination();
      return Base::visitCallBase(I&: Call);
    }
  }

  assert(F && "Expected a call to a known function" );

  // When we have a concrete function, first try to simplify it directly.
  if (simplifyCallSite(F, Call))
    return true;

  // Next check if it is an intrinsic we know about.
  // FIXME: Lift this into part of the InstVisitor.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) {
    switch (II->getIntrinsicID()) {
    default:
      if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II))
        disableLoadElimination();
      return Base::visitCallBase(I&: Call);

    case Intrinsic::load_relative:
      onLoadRelativeIntrinsic();
      return false;

    case Intrinsic::memset:
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      disableLoadElimination();
      // SROA can usually chew through these intrinsics, but they aren't free.
      return false;
    case Intrinsic::icall_branch_funnel:
    case Intrinsic::localescape:
      HasUninlineableIntrinsic = true;
      return false;
    case Intrinsic::vastart:
      InitsVargArgs = true;
      return false;
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      // These are transparent for SROA purposes: propagate the SROA argument
      // through them and treat them as free.
      if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0)))
        SROAArgValues[II] = SROAArg;
      return true;
    case Intrinsic::is_constant:
      return simplifyIntrinsicCallIsConstant(CB&: Call);
    case Intrinsic::objectsize:
      return simplifyIntrinsicCallObjectSize(CB&: Call);
    }
  }

  // A call back into the function being analyzed is recursion after inlining.
  if (F == Call.getFunction()) {
    // This flag will fully abort the analysis, so don't bother with anything
    // else.
    IsRecursiveCall = true;
    if (!AllowRecursiveCall)
      return false;
  }

  if (TTI.isLoweredToCall(F)) {
    onLoweredCall(F, Call, IsIndirectCall);
  }

  if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
    disableLoadElimination();
  return Base::visitCallBase(I&: Call);
}
2347 | |
2348 | bool CallAnalyzer::visitReturnInst(ReturnInst &RI) { |
2349 | // At least one return instruction will be free after inlining. |
2350 | bool Free = !HasReturn; |
2351 | HasReturn = true; |
2352 | return Free; |
2353 | } |
2354 | |
2355 | bool CallAnalyzer::visitBranchInst(BranchInst &BI) { |
2356 | // We model unconditional branches as essentially free -- they really |
2357 | // shouldn't exist at all, but handling them makes the behavior of the |
2358 | // inliner more regular and predictable. Interestingly, conditional branches |
2359 | // which will fold away are also free. |
2360 | return BI.isUnconditional() || isa<ConstantInt>(Val: BI.getCondition()) || |
2361 | BI.getMetadata(KindID: LLVMContext::MD_make_implicit) || |
2362 | isa_and_nonnull<ConstantInt>( |
2363 | Val: SimplifiedValues.lookup(Val: BI.getCondition())); |
2364 | } |
2365 | |
// Model a select. Tries, in order: folding when both arms agree, propagating
// constant-offset pointer info for pointer-typed selects, and full constant
// folding once the condition is known constant. Returns true when the select
// is free.
bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  // Only pointer-typed selects can carry SROA / constant-offset information.
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(Val: TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(Val: TrueVal);
  Constant *FalseC = dyn_cast<Constant>(Val: FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(Val: FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(Val: SimplifiedValues.lookup(Val: SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(I&: SI);

    // Even with an unknown condition, if both arms resolve to the same base
    // pointer and offset, the select's pointer info is known.
    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(Val: TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(Val: FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(I&: SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
                     : (CondC->isNullValue()) ? FalseVal
                                              : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantFoldSelectInstruction() can handle the
    // cases such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(I&: SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  // Propagate the chosen arm's pointer base/offset and SROA candidacy.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(Val: SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}
2442 | |
2443 | bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { |
2444 | // We model unconditional switches as free, see the comments on handling |
2445 | // branches. |
2446 | if (isa<ConstantInt>(Val: SI.getCondition())) |
2447 | return true; |
2448 | if (Value *V = SimplifiedValues.lookup(Val: SI.getCondition())) |
2449 | if (isa<ConstantInt>(Val: V)) |
2450 | return true; |
2451 | |
2452 | // Assume the most general case where the switch is lowered into |
2453 | // either a jump table, bit test, or a balanced binary tree consisting of |
2454 | // case clusters without merging adjacent clusters with the same |
2455 | // destination. We do not consider the switches that are lowered with a mix |
2456 | // of jump table/bit test/binary search tree. The cost of the switch is |
2457 | // proportional to the size of the tree or the size of jump table range. |
2458 | // |
2459 | // NB: We convert large switches which are just used to initialize large phi |
2460 | // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent |
2461 | // inlining those. It will prevent inlining in cases where the optimization |
2462 | // does not (yet) fire. |
2463 | |
2464 | unsigned JumpTableSize = 0; |
2465 | BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr; |
2466 | unsigned NumCaseCluster = |
2467 | TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI); |
2468 | |
2469 | onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUndefined: SI.defaultDestUndefined()); |
2470 | return false; |
2471 | } |
2472 | |
2473 | bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { |
2474 | // We never want to inline functions that contain an indirectbr. This is |
2475 | // incorrect because all the blockaddress's (in static global initializers |
2476 | // for example) would be referring to the original function, and this |
2477 | // indirect jump would jump from the inlined copy of the function into the |
2478 | // original function which is extremely undefined behavior. |
2479 | // FIXME: This logic isn't really right; we can safely inline functions with |
2480 | // indirectbr's as long as no other function or global references the |
2481 | // blockaddress of a block within the current function. |
2482 | HasIndirectBr = true; |
2483 | return false; |
2484 | } |
2485 | |
2486 | bool CallAnalyzer::visitResumeInst(ResumeInst &RI) { |
2487 | // FIXME: It's not clear that a single instruction is an accurate model for |
2488 | // the inline cost of a resume instruction. |
2489 | return false; |
2490 | } |
2491 | |
2492 | bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) { |
2493 | // FIXME: It's not clear that a single instruction is an accurate model for |
2494 | // the inline cost of a cleanupret instruction. |
2495 | return false; |
2496 | } |
2497 | |
2498 | bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) { |
2499 | // FIXME: It's not clear that a single instruction is an accurate model for |
2500 | // the inline cost of a catchret instruction. |
2501 | return false; |
2502 | } |
2503 | |
2504 | bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { |
2505 | // FIXME: It might be reasonably to discount the cost of instructions leading |
2506 | // to unreachable as they have the lowest possible impact on both runtime and |
2507 | // code size. |
2508 | return true; // No actual code is needed for unreachable. |
2509 | } |
2510 | |
2511 | bool CallAnalyzer::visitInstruction(Instruction &I) { |
2512 | // Some instructions are free. All of the free intrinsics can also be |
2513 | // handled by SROA, etc. |
2514 | if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
2515 | TargetTransformInfo::TCC_Free) |
2516 | return true; |
2517 | |
2518 | // We found something we don't understand or can't handle. Mark any SROA-able |
2519 | // values in the operand list as no longer viable. |
2520 | for (const Use &Op : I.operands()) |
2521 | disableSROA(V: Op); |
2522 | |
2523 | return false; |
2524 | } |
2525 | |
/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
InlineResult
CallAnalyzer::analyzeBlock(BasicBlock *BB,
                           SmallPtrSetImpl<const Value *> &EphValues) {
  for (Instruction &I : *BB) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    // Similarly, skip pseudo-probes.
    if (I.isDebugOrPseudoInst())
      continue;

    // Skip ephemeral values.
    if (EphValues.count(Ptr: &I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    onInstructionAnalysisStart(I: &I);

    if (Base::visit(I: &I))
      ++NumInstructionsSimplified;
    else
      onMissedSimplification();

    onInstructionAnalysisFinish(I: &I);
    using namespace ore;
    // If the visit this instruction detected an uninlinable pattern, abort.
    // The flags below are set by the per-instruction visitors above; the
    // first matching one determines the reported failure reason.
    InlineResult IR = InlineResult::success();
    if (IsRecursiveCall && !AllowRecursiveCall)
      IR = InlineResult::failure(Reason: "recursive" );
    else if (ExposesReturnsTwice)
      IR = InlineResult::failure(Reason: "exposes returns twice" );
    else if (HasDynamicAlloca)
      IR = InlineResult::failure(Reason: "dynamic alloca" );
    else if (HasIndirectBr)
      IR = InlineResult::failure(Reason: "indirect branch" );
    else if (HasUninlineableIntrinsic)
      IR = InlineResult::failure(Reason: "uninlinable intrinsic" );
    else if (InitsVargArgs)
      IR = InlineResult::failure(Reason: "varargs" );
    if (!IR.isSuccess()) {
      if (ORE)
        ORE->emit(RemarkBuilder: [&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline" ,
                                          &CandidateCall)
                 << NV("Callee" , &F) << " has uninlinable pattern ("
                 << NV("InlineResult" , IR.getFailureReason())
                 << ") and cost is not fully computed" ;
        });
      return IR;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
      auto IR =
          InlineResult::failure(Reason: "recursive and allocates too much stack space" );
      if (ORE)
        ORE->emit(RemarkBuilder: [&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline" ,
                                          &CandidateCall)
                 << NV("Callee" , &F) << " is "
                 << NV("InlineResult" , IR.getFailureReason())
                 << ". Cost is not fully computed" ;
        });
      return IR;
    }

    // Bail out of the whole analysis once the cost threshold is blown.
    if (shouldStop())
      return InlineResult::failure(
          Reason: "Call site analysis is not favorable to inlining." );
  }

  return InlineResult::success();
}
2620 | |
/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
///
/// Note that \p V is updated in place to the stripped base pointer.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getZero(numBits: IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(Ptr: V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) {
      // Only inbounds GEPs with fully-constant indices are stripped; anything
      // else defeats the whole computation.
      if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(Val: V)->getOperand(i: 0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) {
      // Interposable aliases may resolve elsewhere at link time; stop here.
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!" );
  } while (Visited.insert(Ptr: V).second);

  Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType());
  return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset));
}
2659 | |
2660 | /// Find dead blocks due to deleted CFG edges during inlining. |
2661 | /// |
2662 | /// If we know the successor of the current block, \p CurrBB, has to be \p |
2663 | /// NextBB, the other successors of \p CurrBB are dead if these successors have |
2664 | /// no live incoming CFG edges. If one block is found to be dead, we can |
2665 | /// continue growing the dead block list by checking the successors of the dead |
2666 | /// blocks to see if all their incoming edges are dead or not. |
2667 | void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { |
2668 | auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) { |
2669 | // A CFG edge is dead if the predecessor is dead or the predecessor has a |
2670 | // known successor which is not the one under exam. |
2671 | return (DeadBlocks.count(Ptr: Pred) || |
2672 | (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ)); |
2673 | }; |
2674 | |
2675 | auto IsNewlyDead = [&](BasicBlock *BB) { |
2676 | // If all the edges to a block are dead, the block is also dead. |
2677 | return (!DeadBlocks.count(Ptr: BB) && |
2678 | llvm::all_of(Range: predecessors(BB), |
2679 | P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); })); |
2680 | }; |
2681 | |
2682 | for (BasicBlock *Succ : successors(BB: CurrBB)) { |
2683 | if (Succ == NextBB || !IsNewlyDead(Succ)) |
2684 | continue; |
2685 | SmallVector<BasicBlock *, 4> NewDead; |
2686 | NewDead.push_back(Elt: Succ); |
2687 | while (!NewDead.empty()) { |
2688 | BasicBlock *Dead = NewDead.pop_back_val(); |
2689 | if (DeadBlocks.insert(Ptr: Dead).second) |
2690 | // Continue growing the dead block lists. |
2691 | for (BasicBlock *S : successors(BB: Dead)) |
2692 | if (IsNewlyDead(S)) |
2693 | NewDead.push_back(Elt: S); |
2694 | } |
2695 | } |
2696 | } |
2697 | |
/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
InlineResult CallAnalyzer::analyze() {
  ++NumCallsAnalyzed;

  auto Result = onAnalysisStart();
  if (!Result.isSuccess())
    return Result;

  // A declaration-only callee has no body to cost; trivially viable.
  if (F.empty())
    return InlineResult::success();

  Function *Caller = CandidateCall.getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallBase *Call = dyn_cast<CallBase>(Val: U);
    if (Call && Call->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  auto CAI = CandidateCall.arg_begin();
  for (Argument &FAI : F.args()) {
    assert(CAI != CandidateCall.arg_end());
    if (Constant *C = dyn_cast<Constant>(Val: CAI))
      SimplifiedValues[&FAI] = C;

    // Seed pointer arguments with their base + constant offset so GEP/load
    // simplification can see through them.
    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) {
      ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) {
        SROAArgValues[&FAI] = SROAArg;
        onInitializeSROAArg(Arg: SROAArg);
        EnabledSROAAllocas.insert(V: SROAArg);
      }
    }
    ++CAI;
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(X: &F.getEntryBlock());

  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    if (shouldStop())
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    onBlockStart(BB);

    // Disallow inlining a blockaddress with uses other than strictly callbr.
    // A blockaddress only has defined behavior for an indirect branch in the
    // same function, and we do not currently support inlining indirect
    // branches. But, the inliner may not see an indirect branch that ends up
    // being dead code at a particular call site. If the blockaddress escapes
    // the function, e.g., via a global variable, inlining may lead to an
    // invalid cross-function reference.
    // FIXME: pr/39560: continue relaxing this overt restriction.
    if (BB->hasAddressTaken())
      for (User *U : BlockAddress::get(BB: &*BB)->users())
        if (!isa<CallBrInst>(Val: *U))
          return InlineResult::failure(Reason: "blockaddress used outside of callbr" );

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    InlineResult IR = analyzeBlock(BB, EphValues);
    if (!IR.isSuccess())
      return IR;

    Instruction *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have terminator
    // that may be simplified based on the values simplified by this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(Val: TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(Val: SimplifiedValues.lookup(Val: Cond))) {
          // Only the taken successor stays live; the rest may become dead.
          BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(X: NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(CurrBB: BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(Val: SimplifiedValues.lookup(Val: Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(X: NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(CurrBB: BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (BasicBlock *Succ : successors(BB))
      BBWorklist.insert(X: Succ);

    onBlockAnalyzed(BB);
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall)
    return InlineResult::failure(Reason: "noduplicate" );

  // If the callee's stack size exceeds the user-specified threshold,
  // do not let it be inlined.
  // The command line option overrides a limit set in the function attributes.
  size_t FinalStackSizeThreshold = StackSizeThreshold;
  if (!StackSizeThreshold.getNumOccurrences())
    if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
            F: Caller, AttrKind: InlineConstants::MaxInlineStackSizeAttributeName))
      FinalStackSizeThreshold = *AttrMaxStackSize;
  if (AllocatedSize > FinalStackSizeThreshold)
    return InlineResult::failure(Reason: "stacksize" );

  return finalizeAnalysis();
}
2852 | |
// Print the per-call analysis statistics (and, optionally, the annotated
// function body) to \p OS for debugging.
void InlineCostCallAnalyzer::print(raw_ostream &OS) {
// Emits "  <member>: <value>" for each statistic below.
#define DEBUG_PRINT_STAT(x) OS << "      " #x ": " << x << "\n"
  if (PrintInstructionComments)
    F.print(OS, AAW: &Writer);
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
2872 | |
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis to dbgs(). Only available in
/// assertion-enabled or LLVM_ENABLE_DUMP builds.
LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(OS&: dbgs()); }
#endif
2877 | |
/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
///
/// Checks three layers of compatibility: target-specific (TTI, unless
/// disabled via IgnoreTTIInlineCompatible), target-library-info, and generic
/// IR function attributes.
static bool functionsHaveCompatibleAttributes(
    Function *Caller, Function *Callee, TargetTransformInfo &TTI,
    function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
  // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
  // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
  // object, and always returns the same object (which is overwritten on each
  // GetTLI call). Therefore we copy the first result.
  auto CalleeTLI = GetTLI(*Callee);
  return (IgnoreTTIInlineCompatible ||
          TTI.areInlineCompatible(Caller, Callee)) &&
         GetTLI(*Caller).areInlineCompatible(CalleeTLI,
                                             AllowCallerSuperset: InlineCallerSupersetNoBuiltin) &&
         AttributeFuncs::areInlineCompatible(Caller: *Caller, Callee: *Callee);
}
2894 | |
2895 | int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call, |
2896 | const DataLayout &DL) { |
2897 | int64_t Cost = 0; |
2898 | for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) { |
2899 | if (Call.isByValArgument(ArgNo: I)) { |
2900 | // We approximate the number of loads and stores needed by dividing the |
2901 | // size of the byval type by the target's pointer size. |
2902 | PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType()); |
2903 | unsigned TypeSize = DL.getTypeSizeInBits(Ty: Call.getParamByValType(ArgNo: I)); |
2904 | unsigned AS = PTy->getAddressSpace(); |
2905 | unsigned PointerSize = DL.getPointerSizeInBits(AS); |
2906 | // Ceiling division. |
2907 | unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; |
2908 | |
2909 | // If it generates more than 8 stores it is likely to be expanded as an |
2910 | // inline memcpy so we take that as an upper bound. Otherwise we assume |
2911 | // one load and one store per word copied. |
2912 | // FIXME: The maxStoresPerMemcpy setting from the target should be used |
2913 | // here instead of a magic number of 8, but it's not available via |
2914 | // DataLayout. |
2915 | NumStores = std::min(a: NumStores, b: 8U); |
2916 | |
2917 | Cost += 2 * NumStores * InstrCost; |
2918 | } else { |
2919 | // For non-byval arguments subtract off one instruction per call |
2920 | // argument. |
2921 | Cost += InstrCost; |
2922 | } |
2923 | } |
2924 | // The call instruction also disappears after inlining. |
2925 | Cost += InstrCost; |
2926 | Cost += TTI.getInlineCallPenalty(F: Call.getCaller(), Call, DefaultCallPenalty: CallPenalty); |
2927 | |
2928 | return std::min<int64_t>(a: Cost, INT_MAX); |
2929 | } |
2930 | |
2931 | InlineCost llvm::getInlineCost( |
2932 | CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, |
2933 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
2934 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
2935 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
2936 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
2937 | return getInlineCost(Call, Callee: Call.getCalledFunction(), Params, CalleeTTI, |
2938 | GetAssumptionCache, GetTLI, GetBFI, PSI, ORE); |
2939 | } |
2940 | |
2941 | std::optional<int> llvm::getInliningCostEstimate( |
2942 | CallBase &Call, TargetTransformInfo &CalleeTTI, |
2943 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
2944 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
2945 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
2946 | const InlineParams Params = {/* DefaultThreshold*/ 0, |
2947 | /*HintThreshold*/ {}, |
2948 | /*ColdThreshold*/ {}, |
2949 | /*OptSizeThreshold*/ {}, |
2950 | /*OptMinSizeThreshold*/ {}, |
2951 | /*HotCallSiteThreshold*/ {}, |
2952 | /*LocallyHotCallSiteThreshold*/ {}, |
2953 | /*ColdCallSiteThreshold*/ {}, |
2954 | /*ComputeFullInlineCost*/ true, |
2955 | /*EnableDeferral*/ true}; |
2956 | |
2957 | InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI, |
2958 | GetAssumptionCache, GetBFI, PSI, ORE, true, |
2959 | /*IgnoreThreshold*/ true); |
2960 | auto R = CA.analyze(); |
2961 | if (!R.isSuccess()) |
2962 | return std::nullopt; |
2963 | return CA.getCost(); |
2964 | } |
2965 | |
2966 | std::optional<InlineCostFeatures> llvm::getInliningCostFeatures( |
2967 | CallBase &Call, TargetTransformInfo &CalleeTTI, |
2968 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
2969 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
2970 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
2971 | InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, |
2972 | ORE, *Call.getCalledFunction(), Call); |
2973 | auto R = CFA.analyze(); |
2974 | if (!R.isSuccess()) |
2975 | return std::nullopt; |
2976 | return CFA.features(); |
2977 | } |
2978 | |
/// Determine an inlining decision purely from call-site and function
/// attributes, without running cost analysis.
///
/// Returns success when inlining is mandatory (always-inline and viable), a
/// failure with the blocking reason when inlining is impossible or forbidden,
/// and std::nullopt when the decision must fall through to the cost-based
/// analysis in getInlineCost.
std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
    CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {

  // Cannot inline indirect calls.
  if (!Callee)
    return InlineResult::failure(Reason: "indirect call" );

  // Inlining a callee coroutine into a caller coroutine before the coro-split
  // pass is not handled well by coro-early, so refuse to inline any coroutine
  // that has not been split yet.
  if (Callee->isPresplitCoroutine())
    return InlineResult::failure(Reason: "unsplited coroutine call" );

  // Never inline calls with byval arguments that does not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(ArgNo: I)) {
      PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return InlineResult::failure(Reason: "byval arguments without alloca"
                                     " address space" );
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    // A noinline call-site attribute overrides always-inline on the callee.
    if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
      return InlineResult::failure(Reason: "noinline call site attribute" );

    // Even a mandatory inline must be structurally viable (no indirectbr,
    // va_start, etc. -- see isInlineViable).
    auto IsViable = isInlineViable(Callee&: *Callee);
    if (IsViable.isSuccess())
      return InlineResult::success();
    return InlineResult::failure(Reason: IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, TTI&: CalleeTTI, GetTLI))
    return InlineResult::failure(Reason: "conflicting attributes" );

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return InlineResult::failure(Reason: "optnone attribute" );

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return InlineResult::failure(Reason: "nullptr definitions incompatible" );

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return InlineResult::failure(Reason: "interposable" );

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return InlineResult::failure(Reason: "noinline function attribute" );

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return InlineResult::failure(Reason: "noinline call site attribute" );

  // No attribute forces a decision either way; defer to cost analysis.
  return std::nullopt;
}
3049 | |
/// Compute the full inline cost for inlining \p Callee at \p Call.
///
/// First consults the attribute-based decision (which can force always/never
/// without analysis); otherwise runs InlineCostCallAnalyzer and translates its
/// result into an InlineCost carrying either a cost/threshold pair, a
/// cost-benefit pair, or an always/never verdict.
InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  // Attributes may force the decision before any cost analysis runs.
  auto UserDecision =
      llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);

  if (UserDecision) {
    if (UserDecision->isSuccess())
      return llvm::InlineCost::getAlways(Reason: "always inline attribute" );
    return llvm::InlineCost::getNever(Reason: UserDecision->getFailureReason());
  }

  LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                          << "... (caller:" << Call.getCaller()->getName()
                          << ")\n" );

  InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Always make cost benefit based decision explicit.
  // We use always/never here since threshold is not meaningful,
  // as it's not what drives cost-benefit analysis.
  if (CA.wasDecidedByCostBenefit()) {
    if (ShouldInline.isSuccess())
      return InlineCost::getAlways(Reason: "benefit over cost" ,
                                   CostBenefit: CA.getCostBenefitPair());
    else
      return InlineCost::getNever(Reason: "cost over benefit" , CostBenefit: CA.getCostBenefitPair());
  }

  // The normal path: report the cost against the threshold the analyzer used.
  if (CA.wasDecidedByCostThreshold())
    return InlineCost::get(Cost: CA.getCost(), Threshold: CA.getThreshold(),
                           StaticBonus: CA.getStaticBonusApplied());

  // No details on how the decision was made, simply return always or never.
  return ShouldInline.isSuccess()
             ? InlineCost::getAlways(Reason: "empty function" )
             : InlineCost::getNever(Reason: ShouldInline.getFailureReason());
}
3097 | |
3098 | InlineResult llvm::isInlineViable(Function &F) { |
3099 | bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice); |
3100 | for (BasicBlock &BB : F) { |
3101 | // Disallow inlining of functions which contain indirect branches. |
3102 | if (isa<IndirectBrInst>(Val: BB.getTerminator())) |
3103 | return InlineResult::failure(Reason: "contains indirect branches" ); |
3104 | |
3105 | // Disallow inlining of blockaddresses which are used by non-callbr |
3106 | // instructions. |
3107 | if (BB.hasAddressTaken()) |
3108 | for (User *U : BlockAddress::get(BB: &BB)->users()) |
3109 | if (!isa<CallBrInst>(Val: *U)) |
3110 | return InlineResult::failure(Reason: "blockaddress used outside of callbr" ); |
3111 | |
3112 | for (auto &II : BB) { |
3113 | CallBase *Call = dyn_cast<CallBase>(Val: &II); |
3114 | if (!Call) |
3115 | continue; |
3116 | |
3117 | // Disallow recursive calls. |
3118 | Function *Callee = Call->getCalledFunction(); |
3119 | if (&F == Callee) |
3120 | return InlineResult::failure(Reason: "recursive call" ); |
3121 | |
3122 | // Disallow calls which expose returns-twice to a function not previously |
3123 | // attributed as such. |
3124 | if (!ReturnsTwice && isa<CallInst>(Val: Call) && |
3125 | cast<CallInst>(Val: Call)->canReturnTwice()) |
3126 | return InlineResult::failure(Reason: "exposes returns-twice attribute" ); |
3127 | |
3128 | if (Callee) |
3129 | switch (Callee->getIntrinsicID()) { |
3130 | default: |
3131 | break; |
3132 | case llvm::Intrinsic::icall_branch_funnel: |
3133 | // Disallow inlining of @llvm.icall.branch.funnel because current |
3134 | // backend can't separate call targets from call arguments. |
3135 | return InlineResult::failure( |
3136 | Reason: "disallowed inlining of @llvm.icall.branch.funnel" ); |
3137 | case llvm::Intrinsic::localescape: |
3138 | // Disallow inlining functions that call @llvm.localescape. Doing this |
3139 | // correctly would require major changes to the inliner. |
3140 | return InlineResult::failure( |
3141 | Reason: "disallowed inlining of @llvm.localescape" ); |
3142 | case llvm::Intrinsic::vastart: |
3143 | // Disallow inlining of functions that initialize VarArgs with |
3144 | // va_start. |
3145 | return InlineResult::failure( |
3146 | Reason: "contains VarArgs initialized with va_start" ); |
3147 | } |
3148 | } |
3149 | } |
3150 | |
3151 | return InlineResult::success(); |
3152 | } |
3153 | |
3154 | // APIs to create InlineParams based on command line flags and/or other |
3155 | // parameters. |
3156 | |
3157 | InlineParams llvm::getInlineParams(int Threshold) { |
3158 | InlineParams Params; |
3159 | |
3160 | // This field is the threshold to use for a callee by default. This is |
3161 | // derived from one or more of: |
3162 | // * optimization or size-optimization levels, |
3163 | // * a value passed to createFunctionInliningPass function, or |
3164 | // * the -inline-threshold flag. |
3165 | // If the -inline-threshold flag is explicitly specified, that is used |
3166 | // irrespective of anything else. |
3167 | if (InlineThreshold.getNumOccurrences() > 0) |
3168 | Params.DefaultThreshold = InlineThreshold; |
3169 | else |
3170 | Params.DefaultThreshold = Threshold; |
3171 | |
3172 | // Set the HintThreshold knob from the -inlinehint-threshold. |
3173 | Params.HintThreshold = HintThreshold; |
3174 | |
3175 | // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. |
3176 | Params.HotCallSiteThreshold = HotCallSiteThreshold; |
3177 | |
3178 | // If the -locally-hot-callsite-threshold is explicitly specified, use it to |
3179 | // populate LocallyHotCallSiteThreshold. Later, we populate |
3180 | // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if |
3181 | // we know that optimization level is O3 (in the getInlineParams variant that |
3182 | // takes the opt and size levels). |
3183 | // FIXME: Remove this check (and make the assignment unconditional) after |
3184 | // addressing size regression issues at O2. |
3185 | if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0) |
3186 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; |
3187 | |
3188 | // Set the ColdCallSiteThreshold knob from the |
3189 | // -inline-cold-callsite-threshold. |
3190 | Params.ColdCallSiteThreshold = ColdCallSiteThreshold; |
3191 | |
3192 | // Set the OptMinSizeThreshold and OptSizeThreshold params only if the |
3193 | // -inlinehint-threshold commandline option is not explicitly given. If that |
3194 | // option is present, then its value applies even for callees with size and |
3195 | // minsize attributes. |
3196 | // If the -inline-threshold is not specified, set the ColdThreshold from the |
3197 | // -inlinecold-threshold even if it is not explicitly passed. If |
3198 | // -inline-threshold is specified, then -inlinecold-threshold needs to be |
3199 | // explicitly specified to set the ColdThreshold knob |
3200 | if (InlineThreshold.getNumOccurrences() == 0) { |
3201 | Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold; |
3202 | Params.OptSizeThreshold = InlineConstants::OptSizeThreshold; |
3203 | Params.ColdThreshold = ColdThreshold; |
3204 | } else if (ColdThreshold.getNumOccurrences() > 0) { |
3205 | Params.ColdThreshold = ColdThreshold; |
3206 | } |
3207 | return Params; |
3208 | } |
3209 | |
3210 | InlineParams llvm::getInlineParams() { |
3211 | return getInlineParams(Threshold: DefaultThreshold); |
3212 | } |
3213 | |
3214 | // Compute the default threshold for inlining based on the opt level and the |
3215 | // size opt level. |
3216 | static int computeThresholdFromOptLevels(unsigned OptLevel, |
3217 | unsigned SizeOptLevel) { |
3218 | if (OptLevel > 2) |
3219 | return InlineConstants::OptAggressiveThreshold; |
3220 | if (SizeOptLevel == 1) // -Os |
3221 | return InlineConstants::OptSizeThreshold; |
3222 | if (SizeOptLevel == 2) // -Oz |
3223 | return InlineConstants::OptMinSizeThreshold; |
3224 | return DefaultThreshold; |
3225 | } |
3226 | |
3227 | InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { |
3228 | auto Params = |
3229 | getInlineParams(Threshold: computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); |
3230 | // At O3, use the value of -locally-hot-callsite-threshold option to populate |
3231 | // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only |
3232 | // when it is specified explicitly. |
3233 | if (OptLevel > 2) |
3234 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; |
3235 | return Params; |
3236 | } |
3237 | |
3238 | PreservedAnalyses |
3239 | InlineCostAnnotationPrinterPass::run(Function &F, |
3240 | FunctionAnalysisManager &FAM) { |
3241 | PrintInstructionComments = true; |
3242 | std::function<AssumptionCache &(Function &)> GetAssumptionCache = |
3243 | [&](Function &F) -> AssumptionCache & { |
3244 | return FAM.getResult<AssumptionAnalysis>(IR&: F); |
3245 | }; |
3246 | Module *M = F.getParent(); |
3247 | ProfileSummaryInfo PSI(*M); |
3248 | DataLayout DL(M); |
3249 | TargetTransformInfo TTI(DL); |
3250 | // FIXME: Redesign the usage of InlineParams to expand the scope of this pass. |
3251 | // In the current implementation, the type of InlineParams doesn't matter as |
3252 | // the pass serves only for verification of inliner's decisions. |
3253 | // We can add a flag which determines InlineParams for this run. Right now, |
3254 | // the default InlineParams are used. |
3255 | const InlineParams Params = llvm::getInlineParams(); |
3256 | for (BasicBlock &BB : F) { |
3257 | for (Instruction &I : BB) { |
3258 | if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) { |
3259 | Function *CalledFunction = CI->getCalledFunction(); |
3260 | if (!CalledFunction || CalledFunction->isDeclaration()) |
3261 | continue; |
3262 | OptimizationRemarkEmitter ORE(CalledFunction); |
3263 | InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI, |
3264 | GetAssumptionCache, nullptr, &PSI, &ORE); |
3265 | ICCA.analyze(); |
3266 | OS << " Analyzing call of " << CalledFunction->getName() |
3267 | << "... (caller:" << CI->getCaller()->getName() << ")\n" ; |
3268 | ICCA.print(OS); |
3269 | OS << "\n" ; |
3270 | } |
3271 | } |
3272 | } |
3273 | return PreservedAnalyses::all(); |
3274 | } |
3275 | |